data-hydrator 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_hydrator/__init__.py +4 -0
- data_hydrator/collection/__init__.py +0 -0
- data_hydrator/collection/entry_builder.py +98 -0
- data_hydrator/collection/serializer.py +42 -0
- data_hydrator/collection/slug_generator.py +33 -0
- data_hydrator/config.py +121 -0
- data_hydrator/extraction/__init__.py +0 -0
- data_hydrator/extraction/content_extractor.py +95 -0
- data_hydrator/extraction/field_extractor.py +168 -0
- data_hydrator/extraction/unknown_extractor.py +213 -0
- data_hydrator/io/__init__.py +0 -0
- data_hydrator/io/page_loader.py +74 -0
- data_hydrator/io/phase_cache.py +100 -0
- data_hydrator/io/staging_writer.py +98 -0
- data_hydrator/manifest/__init__.py +0 -0
- data_hydrator/manifest/manifest_builder.py +67 -0
- data_hydrator/media/__init__.py +0 -0
- data_hydrator/media/image_processor.py +102 -0
- data_hydrator/media/r2_client.py +70 -0
- data_hydrator/media/url_cache.py +57 -0
- data_hydrator/models.py +261 -0
- data_hydrator/phases/__init__.py +0 -0
- data_hydrator/phases/phase2_segmentation.py +103 -0
- data_hydrator/phases/phase3_classification.py +170 -0
- data_hydrator/phases/phase4_mapping.py +164 -0
- data_hydrator/phases/phase5_hydration.py +348 -0
- data_hydrator/pipeline.py +138 -0
- data_hydrator/schema/__init__.py +0 -0
- data_hydrator/schema/representative_picker.py +67 -0
- data_hydrator/schema/schema_cache.py +72 -0
- data_hydrator/schema/schema_generator.py +159 -0
- data_hydrator/urls/__init__.py +0 -0
- data_hydrator/urls/category_inferrer.py +72 -0
- data_hydrator/urls/redirect_builder.py +65 -0
- data_hydrator/urls/url_generator.py +83 -0
- data_hydrator-0.2.0.dist-info/METADATA +421 -0
- data_hydrator-0.2.0.dist-info/RECORD +39 -0
- data_hydrator-0.2.0.dist-info/WHEEL +4 -0
- data_hydrator-0.2.0.dist-info/licenses/LICENSE +661 -0
|
File without changes
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
from data_hydrator.models import (
|
|
2
|
+
CollectionEntry, CollectionFormat, EntryStatus,
|
|
3
|
+
SegmentExtractionResult, ImageUploadResult, ExtractionTrack,
|
|
4
|
+
ImageUploadStatus,
|
|
5
|
+
)
|
|
6
|
+
from data_hydrator.config import HydrationSettings
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def build_entry(
    extraction_result: SegmentExtractionResult,
    image_results: list[ImageUploadResult],
    slug: str,
    collection_name: str,
    generated_url: str,
    original_url: str,
    page_title: str,
    settings: HydrationSettings,
) -> CollectionEntry:
    """Assemble a ``CollectionEntry`` from one segment's extraction output.

    Args:
        extraction_result: Extracted fields plus segment metadata
            (``component_type``, ``segment_id``, ``page_url``,
            ``missing_required_fields``).
        image_results: Upload outcomes; only UPLOADED/CACHED results that
            carry a ``cdn_url`` are used to rewrite image URLs.
        slug: Entry slug; also the last-resort source for the title.
        collection_name: Name of the target collection.
        generated_url: New URL of the entry.
        original_url: URL the content was taken from.
        page_title: Fallback title when no title-like field was extracted.
        settings: Passed through to ``_resolve_format``.

    Returns:
        The entry. It is flagged ``draft`` when any required field is missing.
    """
    # Function-scope import: this module has no top-level logging import.
    import logging

    log = logging.getLogger(__name__)

    # original URL -> CDN URL, limited to uploads that produced a CDN URL.
    cdn_map: dict[str, str] = {
        result.original_url: result.cdn_url
        for result in image_results
        if result.status in (ImageUploadStatus.UPLOADED, ImageUploadStatus.CACHED)
        and result.cdn_url
    }

    # Collect successfully extracted values, rewriting image URLs on the way.
    data: dict = {}
    body_markdown = ""

    for field in extraction_result.extracted_fields:
        if not field.extraction_success:
            continue
        value = field.processed_value

        if field.content_type == "image_url" and isinstance(value, str):
            if value in cdn_map:
                value = cdn_map[value]
            elif value:
                # Previously this message was appended to a local list that
                # was never read; log it so failed uploads are visible.
                log.warning(
                    "Image upload failed for %s; keeping original URL",
                    field.field_name,
                )

        data[field.field_name] = value

    # Markdown entries carry their body outside of the data dict.
    fmt = _resolve_format(extraction_result.component_type, settings)
    if fmt == CollectionFormat.MARKDOWN:
        for body_field in ("body", "content", "description"):
            candidate = data.get(body_field)
            # Only strings longer than 50 characters are promoted to the body
            # (presumably so a short blurb stays in the metadata).
            if isinstance(candidate, str) and len(candidate) > 50:
                body_markdown = candidate
                del data[body_field]
                break

        # Otherwise fall back to the first successful markdown-typed field.
        if not body_markdown:
            for field in extraction_result.extracted_fields:
                if field.content_type == "markdown" and field.extraction_success:
                    body_markdown = str(field.processed_value or "")
                    data.pop(field.field_name, None)
                    break

    # Title resolution: extracted fields first, then page title, then slug.
    title = (
        data.get("title")
        or data.get("name")
        or data.get("heading")
        or page_title
        or _slug_to_title(slug)
    )

    draft = len(extraction_result.missing_required_fields) > 0

    return CollectionEntry(
        slug=slug,
        collection_name=collection_name,
        format=fmt,
        data=data,
        draft=draft,
        title=str(title),
        body_markdown=body_markdown,
        source_segment_id=extraction_result.segment_id,
        source_page_url=extraction_result.page_url,
        generated_url=generated_url,
        original_url=original_url,
    )
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def _resolve_format(component_type: str, settings: HydrationSettings) -> CollectionFormat:
    """Choose the output format for a component type.

    Component types starting with one of the content prefixes are written as
    Markdown; everything else is written as JSON.

    The prefixes come from ``settings.content_type_prefixes`` when that
    attribute is present (its declared default matches the built-in list
    below), so a configured override is honoured instead of being ignored.
    """
    default_prefixes = ("content.", "collection.blog", "collection.news")
    configured = getattr(settings, "content_type_prefixes", None)
    # An explicitly empty list is respected: nothing is treated as Markdown.
    prefixes = default_prefixes if configured is None else tuple(configured)

    # str.startswith accepts a tuple, so one call covers every prefix.
    if component_type.startswith(prefixes):
        return CollectionFormat.MARKDOWN
    return CollectionFormat.JSON
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _slug_to_title(slug: str) -> str:
|
|
98
|
+
return slug.replace("-", " ").title()
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import re
|
|
3
|
+
import yaml
|
|
4
|
+
from data_hydrator.models import CollectionEntry, CollectionFormat
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def serialize(entry: CollectionEntry) -> str:
    """Render an entry as text: Markdown with frontmatter, or JSON otherwise."""
    is_markdown = entry.format == CollectionFormat.MARKDOWN
    writer = _serialize_markdown if is_markdown else _serialize_json
    return writer(entry)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _serialize_json(entry: CollectionEntry) -> str:
|
|
14
|
+
payload: dict = {}
|
|
15
|
+
payload["draft"] = entry.draft
|
|
16
|
+
payload["title"] = entry.title
|
|
17
|
+
payload["generatedUrl"] = entry.generated_url
|
|
18
|
+
for k, v in entry.data.items():
|
|
19
|
+
payload[_snake_to_camel(k)] = v
|
|
20
|
+
return json.dumps(payload, indent=2, ensure_ascii=False)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _serialize_markdown(entry: CollectionEntry) -> str:
    """Serialize an entry as a Markdown document with YAML frontmatter.

    The frontmatter holds ``draft``, ``title``, ``generatedUrl`` and the
    entry's data (keys converted to camelCase); the Markdown body follows the
    closing ``---`` after a blank line.
    """
    frontmatter: dict = {
        "draft": entry.draft,
        "title": entry.title,
        "generatedUrl": entry.generated_url,
    }
    for key, value in entry.data.items():
        frontmatter[_snake_to_camel(key)] = value

    header = f"---\n{_to_yaml_frontmatter(frontmatter)}---\n"
    body = entry.body_markdown if entry.body_markdown else ""
    return f"{header}\n{body}\n"
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _to_yaml_frontmatter(data: dict) -> str:
    """Dump a mapping as block-style YAML, keeping non-ASCII characters."""
    # NOTE(review): yaml.dump (unlike yaml.safe_dump) writes Python-specific
    # tags for values that are not plain YAML types - confirm that entry data
    # only ever contains primitives, lists and dicts.
    dump_options = {"default_flow_style": False, "allow_unicode": True}
    return yaml.dump(data, **dump_options)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _snake_to_camel(s: str) -> str:
|
|
41
|
+
parts = s.split("_")
|
|
42
|
+
return parts[0] + "".join(p.capitalize() for p in parts[1:])
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from collections import defaultdict
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class SlugGenerator:
    """Generate URL slugs that are unique within each collection.

    Uniqueness is tracked in memory per generator instance: the first use of a
    slug in a collection is returned as-is, later uses get ``-2``, ``-3``, ...
    """

    def __init__(self, max_length: int = 80):
        self._max_length = max_length
        # collection name -> slugs already handed out for that collection
        self._used: dict[str, set[str]] = defaultdict(set)

    def generate(self, title: str, collection: str) -> str:
        """Return a slug for *title* that is unused so far in *collection*."""
        return self._make_unique(self._slugify(title), collection)

    def _slugify(self, text: str) -> str:
        """Lower-case *text* and collapse non-alphanumeric runs to hyphens.

        Accented letters are reduced to their base letter ("Café" -> "cafe")
        instead of being dropped. The result never starts or ends with a
        hyphen, even after truncation to ``max_length``. Empty results fall
        back to ``"untitled"``.
        """
        # Function-scope import: the module only imports re and defaultdict.
        import unicodedata

        if not text:
            return "untitled"
        decomposed = unicodedata.normalize("NFKD", text)
        base_chars = "".join(
            ch for ch in decomposed if not unicodedata.combining(ch)
        )
        slug = re.sub(r"[^a-z0-9]+", "-", base_chars.lower()).strip("-")
        # Truncation can cut right after a hyphen; strip it again.
        return slug[: self._max_length].rstrip("-") or "untitled"

    def _make_unique(self, slug: str, collection: str) -> str:
        """Reserve and return *slug*, or the first free ``slug-N`` variant.

        When adding the numeric suffix would exceed ``max_length`` the base is
        shortened so the suffixed slug still fits.
        """
        used = self._used[collection]
        if slug not in used:
            used.add(slug)
            return slug

        counter = 2
        while True:
            suffix = f"-{counter}"
            # Always keep at least one character of the base slug.
            room = max(self._max_length - len(suffix), 1)
            candidate = slug[:room].rstrip("-") + suffix
            if candidate not in used:
                used.add(candidate)
                return candidate
            counter += 1
|
data_hydrator/config.py
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
from pydantic import BaseModel
|
|
2
|
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class PipelineDirsConfig(BaseModel):
    """Directory paths used by the pipeline.

    All defaults are relative to the current working directory.
    """

    # NOTE(review): presumably one directory per pipeline phase (scrape input,
    # then segmented / classified / mapped intermediates, then staged output);
    # confirm against the phase modules.
    scrape_dir: str = "./scrape_output"
    segmented_dir: str = "./segmented"
    classified_dir: str = "./classified"
    mapped_dir: str = "./mapped"
    staging_dir: str = "./staging"
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ResumeConfig(BaseModel):
    """Flags for skipping phases whose results are already cached.

    All three default to True.
    """

    # NOTE(review): the consumer of these flags is not visible here;
    # presumably the pipeline checks the phase cache before re-running.
    skip_segmentation_if_cached: bool = True
    skip_classification_if_cached: bool = True
    skip_mapping_if_cached: bool = True
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class SegmenterConfig(BaseModel):
    """Settings for the segmentation phase."""

    use_llm: bool = True
    # Model identifier in "provider/model" form.
    llm_model: str = "anthropic/claude-haiku-4-5"
    # NOTE(review): presumably a 0-1 confidence cut-off; confirm in phase 2.
    llm_confidence_threshold: float = 0.7
    max_concurrent_pages: int = 10
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class ClassifierConfig(BaseModel):
    """Settings for the classification phase."""

    # NOTE(review): presumably the confidence at or above which a rule-based
    # result is accepted without further classification; confirm in phase 3.
    rule_based_confidence_threshold: float = 0.90
    # Cache file locations, relative to the working directory by default.
    l1_cache_path: str = ".cache/l1_fingerprints.json"
    l2_cache_path: str = ".cache/l2_clusters.json"
    litellm_batch_size: int = 20
    max_concurrent_batches: int = 5
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class MapperConfig(BaseModel):
    """Settings for the mapping phase."""

    # NOTE(review): accepted transport values are not visible here.
    mcp_transport: str = "stdio"
    # Cache file locations, relative to the working directory by default.
    signature_index_cache_path: str = ".cache/signature_index.json"
    custom_registry_path: str = ".cache/custom_registry.json"
    litellm_batch_size: int = 15
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class SchemaGenConfig(BaseModel):
    """Settings for extraction-schema generation."""

    # Model identifiers in "provider/model" form.
    model: str = "anthropic/claude-sonnet-4-5"
    # NOTE(review): presumably the model used for "unknown" component types
    # (see extraction/unknown_extractor.py); confirm.
    unknown_model: str = "anthropic/claude-haiku-4-5"
    representatives_per_type: int = 3
    unknown_batch_size: int = 20
    max_concurrent_unknown_batches: int = 5
    timeout_seconds: int = 60
    # The default has no file extension, so this is presumably a directory.
    schema_cache_path: str = ".cache/extraction_schemas"
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class ContentExtractionConfig(BaseModel):
    """Settings for HTML-to-Markdown content extraction."""

    # NOTE(review): content_extractor imports markdownify directly; whether
    # this value is read anywhere is not visible here.
    markdown_library: str = "markdownify"
    # Passed by extract_content to markdownify as its ``convert`` option.
    allowed_tags: list[str] = [
        "p", "strong", "em", "b", "i",
        "ul", "ol", "li", "h1", "h2", "h3", "h4", "h5",
        "a", "blockquote", "code", "pre",
        "table", "thead", "tbody", "tr", "th", "td", "img",
    ]
    # Elements with these tag names are removed (including their contents) by
    # extract_content before conversion.
    always_strip_tags: list[str] = [
        "script", "style", "iframe", "noscript",
        "nav", "header", "footer", "aside",
    ]
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
class URLGenConfig(BaseModel):
    """Settings for generating new URLs."""

    # Component type -> base path of the generated URL.
    type_base_paths: dict[str, str] = {
        "collection.product_card": "/products",
        "collection.product_list": "/products",
        "collection.blog_card": "/blog",
        "collection.blog_list": "/blog",
        "collection.news_item": "/news",
        "content.article": "/articles",
    }
    # NOTE(review): presumably how many URL path segments are inspected and
    # how often a segment must occur to count as a category; confirm in
    # urls/category_inferrer.py.
    category_detection_depth: int = 3
    category_min_occurrence: int = 2
    # Same default as SlugGenerator's ``max_length``.
    max_slug_length: int = 80
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class StagingConfig(BaseModel):
    """Settings for writing staged output files."""

    # NOTE(review): presumably path prefixes inside the staging directory
    # that mirror an Astro project layout; confirm in io/staging_writer.py.
    astro_content_prefix: str = "src/content"
    astro_pages_prefix: str = "src/pages/_data"
    create_backup_on_overwrite: bool = True
    # File names of the generated manifest and redirect files.
    manifest_filename: str = "hydration_manifest.json"
    redirects_json_filename: str = "redirects.json"
    redirects_csv_filename: str = "redirects.csv"
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
class R2Config(BaseModel):
    """Object-storage settings for image uploads.

    Bucket, endpoint, credentials and public base URL all default to the
    empty string, so they must be supplied by configuration before uploads
    can work.
    """

    bucket_name: str = ""
    endpoint_url: str = ""
    access_key_id: str = ""
    # Stored as a plain ``str``, so it is not masked when the model is
    # printed or dumped.
    secret_access_key: str = ""
    # NOTE(review): presumably the prefix of the CDN URLs written into
    # entries; confirm in media/r2_client.py.
    public_base_url: str = ""
    key_prefix: str = "images"
    max_concurrent_uploads: int = 10
    max_concurrent_downloads: int = 20
    download_timeout_seconds: int = 30
    upload_timeout_seconds: int = 60
    url_cache_path: str = ".cache/image_url_cache.json"
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
class HydrationSettings(BaseSettings):
    """Top-level settings, loaded from the environment and a ``.env`` file.

    Environment variables use the ``HYDRATOR_`` prefix. Fields of the nested
    sections can be set individually with a double-underscore delimiter, e.g.
    ``HYDRATOR_R2__BUCKET_NAME`` or ``HYDRATOR_SEGMENTER__USE_LLM``; supplying
    a whole section as JSON (``HYDRATOR_R2='{"bucket_name": "..."}'``) keeps
    working as before.
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        env_prefix="HYDRATOR_",
        # Without a delimiter a nested section could only be overridden as one
        # JSON blob, which is awkward for secrets such as the R2 credentials.
        env_nested_delimiter="__",
    )

    site_id: str = ""

    # One nested section per pipeline concern.
    dirs: PipelineDirsConfig = PipelineDirsConfig()
    resume: ResumeConfig = ResumeConfig()
    segmenter: SegmenterConfig = SegmenterConfig()
    classifier: ClassifierConfig = ClassifierConfig()
    mapper: MapperConfig = MapperConfig()
    schema_gen: SchemaGenConfig = SchemaGenConfig()
    content: ContentExtractionConfig = ContentExtractionConfig()
    url_gen: URLGenConfig = URLGenConfig()
    staging: StagingConfig = StagingConfig()
    r2: R2Config = R2Config()

    # NOTE(review): presumably the component-type prefixes that select
    # Markdown output; confirm against entry_builder._resolve_format.
    content_type_prefixes: list[str] = [
        "content.", "collection.blog", "collection.news",
    ]
    max_concurrent_segments: int = 50
|
|
File without changes
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import re
|
|
3
|
+
from urllib.parse import urljoin
|
|
4
|
+
from bs4 import BeautifulSoup
|
|
5
|
+
import markdownify
|
|
6
|
+
from data_hydrator.models import ExtractionSchema, ExtractedField
|
|
7
|
+
from data_hydrator.config import ContentExtractionConfig
|
|
8
|
+
from data_hydrator.extraction.field_extractor import extract_fields
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def extract_content(
    segment_html: str,
    schema: ExtractionSchema,
    page_url: str,
    settings: ContentExtractionConfig,
) -> tuple[list[ExtractedField], str, list[str]]:
    """Extract metadata fields and a Markdown body from a content segment.

    Returns ``(metadata_fields, body_markdown, missing_required_fields)``.
    Never raises: if the body conversion fails, the error is logged and the
    body is returned as ``""`` while the metadata extracted so far is kept.

    Args:
        segment_html: HTML of the segment.
        schema: Provides the field selectors, the optional
            ``content_selector`` and the ``exclude_selectors``.
        page_url: Base URL used to resolve relative links and images.
        settings: Supplies ``always_strip_tags`` and ``allowed_tags``.
    """
    metadata_fields: list[ExtractedField] = []
    missing_required: list[str] = []
    try:
        # Step 1: metadata fields via the schema's field selectors.
        metadata_fields, missing_required = extract_fields(segment_html, schema, page_url)

        # Step 2: parse again for body extraction; the tree is mutated below.
        soup = BeautifulSoup(segment_html, "lxml")

        # Step 3: locate the content container, defaulting to the whole segment.
        container = soup
        if schema.content_selector:
            selected = soup.select_one(schema.content_selector)
            if selected is None:
                logger.warning(
                    "content_selector '%s' found nothing; using full segment",
                    schema.content_selector,
                )
            else:
                container = selected

        # Step 4: drop schema-specific noise elements.
        _remove_noise(container, schema.exclude_selectors)

        # Step 5: drop always-stripped tags together with their contents.
        for tag_name in settings.always_strip_tags:
            for el in container.find_all(tag_name):
                el.decompose()

        # Step 6: convert to Markdown. markdownify raises ValueError when both
        # ``strip`` and ``convert`` are given; the strip list is already
        # handled by step 5, so only ``convert`` is passed.
        md = markdownify.markdownify(
            str(container),
            convert=settings.allowed_tags,
            heading_style="ATX",
        )

        # Post-process: collapse blank-line runs, trim, absolutize URLs.
        md = re.sub(r"\n{3,}", "\n\n", md)
        md = md.strip()
        md = _resolve_relative_urls(md, page_url)

        return metadata_fields, md, missing_required

    except Exception as exc:
        logger.error("Content extraction failed: %s", exc)
        # Keep whatever metadata was extracted before the failure.
        return metadata_fields, "", missing_required
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _remove_noise(soup: BeautifulSoup, exclude_selectors: list[str]) -> None:
|
|
69
|
+
for selector in exclude_selectors:
|
|
70
|
+
try:
|
|
71
|
+
for el in soup.select(selector):
|
|
72
|
+
el.decompose()
|
|
73
|
+
except Exception as exc:
|
|
74
|
+
logger.warning("Exclude selector '%s' failed: %s", selector, exc)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _resolve_relative_urls(markdown: str, base_url: str) -> str:
|
|
78
|
+
if not base_url:
|
|
79
|
+
return markdown
|
|
80
|
+
|
|
81
|
+
def replace_image(m: re.Match) -> str:
|
|
82
|
+
alt, url = m.group(1), m.group(2)
|
|
83
|
+
resolved = urljoin(base_url, url)
|
|
84
|
+
return f""
|
|
85
|
+
|
|
86
|
+
def replace_link(m: re.Match) -> str:
|
|
87
|
+
text, url = m.group(1), m.group(2)
|
|
88
|
+
resolved = urljoin(base_url, url)
|
|
89
|
+
return f"[{text}]({resolved})"
|
|
90
|
+
|
|
91
|
+
# Images first (more specific pattern)
|
|
92
|
+
markdown = re.sub(r"!\[([^\]]*)\]\(([^)]+)\)", replace_image, markdown)
|
|
93
|
+
# Then links
|
|
94
|
+
markdown = re.sub(r"\[([^\]]+)\]\(([^)]+)\)", replace_link, markdown)
|
|
95
|
+
return markdown
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import re
|
|
3
|
+
from urllib.parse import urljoin
|
|
4
|
+
from bs4 import BeautifulSoup
|
|
5
|
+
from data_hydrator.models import ExtractionSchema, ExtractedField, FieldSelector
|
|
6
|
+
|
|
7
|
+
logger = logging.getLogger(__name__)
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def extract_fields(
    segment_html: str,
    schema: ExtractionSchema,
    page_url: str,
) -> tuple[list[ExtractedField], list[str]]:
    """Run every field selector of *schema* against the segment HTML.

    Returns ``(fields, missing_required_field_names)`` and never raises. If
    the HTML cannot be parsed, no fields are returned and every required
    field is reported as missing.
    """
    try:
        soup = BeautifulSoup(segment_html, "lxml")
    except Exception as exc:
        logger.error("Failed to parse segment HTML: %s", exc)
        return [], [fs.field_name for fs in schema.field_selectors if fs.required]

    # Pair each selector with its result so both output lists keep schema order.
    outcomes = [
        (sel, _extract_one(soup, sel, page_url)) for sel in schema.field_selectors
    ]
    fields: list[ExtractedField] = [extracted for _, extracted in outcomes]
    missing_required: list[str] = [
        sel.field_name
        for sel, extracted in outcomes
        if not extracted.extraction_success and sel.required
    ]
    return fields, missing_required
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _extract_one(
    soup: BeautifulSoup,
    selector: FieldSelector,
    page_url: str,
) -> ExtractedField:
    """Extract a single field described by *selector* from *soup*.

    Handling depends on ``selector.content_type``:

    * ``list`` (or any type with ``selector.multiple``): one value per matched
      element. ``image_url`` / ``url`` fields yield absolute URLs (empty ones
      dropped); every other type yields the element text.
    * ``boolean``: True when the selector matches anything; always a success.
    * ``text``: stripped element text.
    * ``image_url``: image source via ``_get_image_src``, made absolute.
    * ``url``: the selected attribute (default ``href``), made absolute.
    * anything else: treated as text.

    Never raises; any exception is logged and returned as a failed field with
    the exception text as its warning.
    """
    css, attr_override = _parse_selector_attr(selector.css_selector)
    # An explicit attribute on the selector wins over the "[attr]" suffix.
    attribute = selector.attribute or attr_override
    kind = selector.content_type

    def make(raw, processed, success: bool, **extra) -> ExtractedField:
        # Single construction point; ``extra`` carries the optional warning.
        return ExtractedField(
            field_name=selector.field_name,
            raw_value=raw,
            processed_value=processed,
            content_type=kind,
            extraction_success=success,
            **extra,
        )

    try:
        if kind == "list" or selector.multiple:
            elements = soup.select(css)
            if not elements:
                return make(
                    None, [], False,
                    warning=f"No elements matched selector '{css}'",
                )
            if kind == "image_url":
                # Images carry no text, so read their source attributes.
                sources = (_get_image_src(el, attribute) for el in elements)
                values = [urljoin(page_url, src) for src in sources if src]
            elif kind == "url":
                link_attr = attribute or "href"
                targets = (el.get(link_attr, "") for el in elements)
                values = [urljoin(page_url, href) for href in targets if href]
            else:
                values = [el.get_text(strip=True) for el in elements]
            if not values:
                return make(
                    None, [], False,
                    warning=f"No usable values at selector '{css}'",
                )
            return make(values, values, True)

        element = soup.select_one(css)

        if kind == "boolean":
            # Presence of the element is the value; absence is not a failure.
            present = element is not None
            return make(present, present, True)

        if element is None:
            return make(
                None, None, False,
                warning=f"No element matched selector '{css}'",
            )

        if kind == "text":
            text = element.get_text(strip=True)
            return make(
                text, text, bool(text),
                warning="" if text else f"Empty text at selector '{css}'",
            )

        if kind == "image_url":
            src = _get_image_src(element, attribute)
            if src:
                src = urljoin(page_url, src)
            return make(
                src, src, bool(src),
                warning="" if src else f"Could not extract image URL at '{css}'",
            )

        if kind == "url":
            attr = attribute or "href"
            href = element.get(attr, "")
            if href:
                href = urljoin(page_url, href)
            return make(
                href, href, bool(href),
                warning="" if href else f"No href at selector '{css}'",
            )

        # Fallback: treat as text
        text = element.get_text(strip=True)
        return make(text, text, bool(text))

    except Exception as exc:
        logger.warning("Field extraction failed for %s: %s", selector.field_name, exc)
        return make(None, None, False, warning=str(exc))
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def _get_image_src(element, attribute: str | None) -> str:
|
|
147
|
+
"""Try multiple src attributes for lazy-loaded images."""
|
|
148
|
+
candidates = [attribute] if attribute else []
|
|
149
|
+
candidates += ["src", "data-src", "data-lazy-src"]
|
|
150
|
+
for attr in candidates:
|
|
151
|
+
if attr and element.get(attr):
|
|
152
|
+
return element[attr]
|
|
153
|
+
|
|
154
|
+
# Try background-image from style
|
|
155
|
+
style = element.get("style", "")
|
|
156
|
+
if "background-image" in style:
|
|
157
|
+
match = re.search(r"url\(['\"]?([^'\")\s]+)['\"]?\)", style)
|
|
158
|
+
if match:
|
|
159
|
+
return match.group(1)
|
|
160
|
+
return ""
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def _parse_selector_attr(segment_field: str) -> tuple[str, str | None]:
|
|
164
|
+
"""Split 'img[src]' → ('img', 'src'), 'h2.name' → ('h2.name', None)."""
|
|
165
|
+
match = re.match(r"^(.*?)\[([^\]]+)\]$", segment_field)
|
|
166
|
+
if match:
|
|
167
|
+
return match.group(1), match.group(2)
|
|
168
|
+
return segment_field, None
|