data-hydrator 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. data_hydrator/__init__.py +4 -0
  2. data_hydrator/collection/__init__.py +0 -0
  3. data_hydrator/collection/entry_builder.py +98 -0
  4. data_hydrator/collection/serializer.py +42 -0
  5. data_hydrator/collection/slug_generator.py +33 -0
  6. data_hydrator/config.py +121 -0
  7. data_hydrator/extraction/__init__.py +0 -0
  8. data_hydrator/extraction/content_extractor.py +95 -0
  9. data_hydrator/extraction/field_extractor.py +168 -0
  10. data_hydrator/extraction/unknown_extractor.py +213 -0
  11. data_hydrator/io/__init__.py +0 -0
  12. data_hydrator/io/page_loader.py +74 -0
  13. data_hydrator/io/phase_cache.py +100 -0
  14. data_hydrator/io/staging_writer.py +98 -0
  15. data_hydrator/manifest/__init__.py +0 -0
  16. data_hydrator/manifest/manifest_builder.py +67 -0
  17. data_hydrator/media/__init__.py +0 -0
  18. data_hydrator/media/image_processor.py +102 -0
  19. data_hydrator/media/r2_client.py +70 -0
  20. data_hydrator/media/url_cache.py +57 -0
  21. data_hydrator/models.py +261 -0
  22. data_hydrator/phases/__init__.py +0 -0
  23. data_hydrator/phases/phase2_segmentation.py +103 -0
  24. data_hydrator/phases/phase3_classification.py +170 -0
  25. data_hydrator/phases/phase4_mapping.py +164 -0
  26. data_hydrator/phases/phase5_hydration.py +348 -0
  27. data_hydrator/pipeline.py +138 -0
  28. data_hydrator/schema/__init__.py +0 -0
  29. data_hydrator/schema/representative_picker.py +67 -0
  30. data_hydrator/schema/schema_cache.py +72 -0
  31. data_hydrator/schema/schema_generator.py +159 -0
  32. data_hydrator/urls/__init__.py +0 -0
  33. data_hydrator/urls/category_inferrer.py +72 -0
  34. data_hydrator/urls/redirect_builder.py +65 -0
  35. data_hydrator/urls/url_generator.py +83 -0
  36. data_hydrator-0.2.0.dist-info/METADATA +421 -0
  37. data_hydrator-0.2.0.dist-info/RECORD +39 -0
  38. data_hydrator-0.2.0.dist-info/WHEEL +4 -0
  39. data_hydrator-0.2.0.dist-info/licenses/LICENSE +661 -0
@@ -0,0 +1,4 @@
1
+ from data_hydrator.pipeline import HydrationPipeline
2
+ from data_hydrator.config import HydrationSettings
3
+
4
+ __all__ = ["HydrationPipeline", "HydrationSettings"]
File without changes
@@ -0,0 +1,98 @@
1
+ from data_hydrator.models import (
2
+ CollectionEntry, CollectionFormat, EntryStatus,
3
+ SegmentExtractionResult, ImageUploadResult, ExtractionTrack,
4
+ ImageUploadStatus,
5
+ )
6
+ from data_hydrator.config import HydrationSettings
7
+
8
+
9
def build_entry(
    extraction_result: SegmentExtractionResult,
    image_results: list[ImageUploadResult],
    slug: str,
    collection_name: str,
    generated_url: str,
    original_url: str,
    page_title: str,
    settings: HydrationSettings,
) -> CollectionEntry:
    """Assemble a ``CollectionEntry`` from one segment's extraction result.

    Args:
        extraction_result: Extracted fields plus the segment/page identifiers.
        image_results: Upload outcomes; only UPLOADED/CACHED results with a
            non-empty ``cdn_url`` are used to rewrite image URLs.
        slug: Entry slug; also the last-resort source for the title.
        collection_name: Name of the target collection.
        generated_url: New URL of the entry.
        original_url: URL the content was taken from.
        page_title: Title fallback when no title-like field was extracted.
        settings: Pipeline settings, forwarded to ``_resolve_format``.

    Returns:
        The entry. ``draft`` is True when any required field is missing.
    """
    # Local import: this module has no top-level logging import.
    import logging

    log = logging.getLogger(__name__)

    # Original image URL -> CDN URL, for images that are actually available.
    usable_statuses = (ImageUploadStatus.UPLOADED, ImageUploadStatus.CACHED)
    cdn_map: dict[str, str] = {
        r.original_url: r.cdn_url
        for r in image_results
        if r.status in usable_statuses and r.cdn_url
    }

    # Collect the successfully extracted fields.
    data: dict = {}
    for field in extraction_result.extracted_fields:
        if not field.extraction_success:
            continue
        value = field.processed_value

        if field.content_type == "image_url" and isinstance(value, str):
            if value in cdn_map:
                value = cdn_map[value]
            elif value:
                # Previously this warning was appended to a list that was
                # never read; log it so the failed upload is visible.
                log.warning(
                    "Image upload failed for %s; keeping original URL",
                    field.field_name,
                )

        data[field.field_name] = value

    # Markdown entries carry their body outside the data dict.
    body_markdown = ""
    fmt = _resolve_format(extraction_result.component_type, settings)
    if fmt == CollectionFormat.MARKDOWN:
        for body_field in ("body", "content", "description"):
            if body_field in data:
                candidate = data[body_field]
                # Short strings are treated as metadata, not as the body.
                if isinstance(candidate, str) and len(candidate) > 50:
                    body_markdown = candidate
                    del data[body_field]
                    break
        # Otherwise fall back to the first successful markdown-typed field.
        if not body_markdown:
            for field in extraction_result.extracted_fields:
                if field.content_type == "markdown" and field.extraction_success:
                    body_markdown = str(field.processed_value or "")
                    data.pop(field.field_name, None)
                    break

    # Title: first non-empty of the title-like fields, page title, slug.
    title = (
        data.get("title")
        or data.get("name")
        or data.get("heading")
        or page_title
        or _slug_to_title(slug)
    )

    draft = len(extraction_result.missing_required_fields) > 0

    return CollectionEntry(
        slug=slug,
        collection_name=collection_name,
        format=fmt,
        data=data,
        draft=draft,
        title=str(title),
        body_markdown=body_markdown,
        source_segment_id=extraction_result.segment_id,
        source_page_url=extraction_result.page_url,
        generated_url=generated_url,
        original_url=original_url,
    )
88
+
89
+
90
def _resolve_format(component_type: str, settings: HydrationSettings) -> CollectionFormat:
    """Pick the output format for a component type.

    Component types starting with one of ``settings.content_type_prefixes``
    are written as Markdown; everything else as JSON. The built-in prefixes
    are used only when the settings object does not provide the attribute,
    so the result is unchanged for default settings.
    """
    prefixes = getattr(settings, "content_type_prefixes", None)
    if prefixes is None:
        prefixes = ("content.", "collection.blog", "collection.news")
    # str.startswith accepts a tuple; an empty tuple matches nothing.
    if component_type.startswith(tuple(prefixes)):
        return CollectionFormat.MARKDOWN
    return CollectionFormat.JSON
95
+
96
+
97
+ def _slug_to_title(slug: str) -> str:
98
+ return slug.replace("-", " ").title()
@@ -0,0 +1,42 @@
1
+ import json
2
+ import re
3
+ import yaml
4
+ from data_hydrator.models import CollectionEntry, CollectionFormat
5
+
6
+
7
def serialize(entry: CollectionEntry) -> str:
    """Render an entry as Markdown with frontmatter, or as JSON otherwise."""
    is_markdown = entry.format == CollectionFormat.MARKDOWN
    render = _serialize_markdown if is_markdown else _serialize_json
    return render(entry)
11
+
12
+
13
def _serialize_json(entry: CollectionEntry) -> str:
    """Render the entry as an indented JSON document.

    The fixed keys come first; data keys are camel-cased and, on a name
    clash, replace the fixed value while keeping its position.
    """
    document: dict = {
        "draft": entry.draft,
        "title": entry.title,
        "generatedUrl": entry.generated_url,
    }
    document.update(
        (_snake_to_camel(key), val) for key, val in entry.data.items()
    )
    return json.dumps(document, indent=2, ensure_ascii=False)
21
+
22
+
23
def _serialize_markdown(entry: CollectionEntry) -> str:
    """Render the entry as a Markdown file with a YAML frontmatter block."""
    meta: dict = {
        "draft": entry.draft,
        "title": entry.title,
        "generatedUrl": entry.generated_url,
    }
    # Data keys are camel-cased and may replace the fixed keys above.
    meta.update((_snake_to_camel(key), val) for key, val in entry.data.items())

    front = _to_yaml_frontmatter(meta)
    text = entry.body_markdown if entry.body_markdown else ""
    return f"---\n{front}---\n\n{text}\n"
34
+
35
+
36
def _to_yaml_frontmatter(data: dict) -> str:
    """Dump ``data`` as block-style YAML, keeping the dict's key order.

    ``yaml.dump`` sorts keys alphabetically by default, which would scatter
    the leading draft/title/generatedUrl keys among the data keys and make
    the frontmatter order differ from the JSON serialization.
    """
    return yaml.dump(
        data,
        default_flow_style=False,
        allow_unicode=True,
        sort_keys=False,
    )
38
+
39
+
40
+ def _snake_to_camel(s: str) -> str:
41
+ parts = s.split("_")
42
+ return parts[0] + "".join(p.capitalize() for p in parts[1:])
@@ -0,0 +1,33 @@
1
+ import re
2
+ from collections import defaultdict
3
+
4
+
5
class SlugGenerator:
    """Generate URL slugs that are unique within each collection.

    Slugs handed out are remembered per collection name for the lifetime of
    the instance, so repeated titles get ``-2``, ``-3``, ... suffixes.
    """

    def __init__(self, max_length: int = 80):
        self._max_length = max_length
        # collection name -> slugs already handed out
        self._used: dict[str, set[str]] = defaultdict(set)

    def generate(self, title: str, collection: str) -> str:
        """Return a slug for ``title`` not yet used in ``collection``."""
        base = self._slugify(title)
        return self._make_unique(base, collection)

    def _slugify(self, text: str) -> str:
        """Lower-case ``text`` and reduce it to ``[a-z0-9-]``.

        Returns ``"untitled"`` when nothing usable remains.
        """
        import unicodedata

        if not text:
            return "untitled"
        # Fold accented letters to their ASCII base ("Café" -> "Cafe")
        # instead of dropping them as separators.
        folded = (
            unicodedata.normalize("NFKD", text)
            .encode("ascii", "ignore")
            .decode("ascii")
        )
        slug = re.sub(r"[^a-z0-9]+", "-", folded.lower()).strip("-")
        # Truncation may cut right after a separator; strip it again.
        slug = slug[: self._max_length].strip("-")
        return slug or "untitled"

    def _make_unique(self, slug: str, collection: str) -> str:
        """Reserve and return ``slug``, or a numbered variant if it is taken."""
        taken = self._used[collection]
        if slug not in taken:
            taken.add(slug)
            return slug
        counter = 2
        while True:
            suffix = f"-{counter}"
            # Shorten the stem so the suffixed slug stays within max_length.
            room = max(self._max_length - len(suffix), 1)
            candidate = slug[:room].rstrip("-") + suffix
            if candidate not in taken:
                taken.add(candidate)
                return candidate
            counter += 1
@@ -0,0 +1,121 @@
1
+ from pydantic import BaseModel
2
+ from pydantic_settings import BaseSettings, SettingsConfigDict
3
+
4
+
5
class PipelineDirsConfig(BaseModel):
    """Directory path settings, exposed as ``HydrationSettings.dirs``."""

    # All defaults are paths relative to the current working directory.
    scrape_dir: str = "./scrape_output"
    segmented_dir: str = "./segmented"
    classified_dir: str = "./classified"
    mapped_dir: str = "./mapped"
    staging_dir: str = "./staging"
11
+
12
+
13
class ResumeConfig(BaseModel):
    """Resume flags, exposed as ``HydrationSettings.resume``."""

    # One flag per phase; all default to True.
    # NOTE(review): presumably a phase is skipped when its cached output
    # exists -- confirm against the phase implementations.
    skip_segmentation_if_cached: bool = True
    skip_classification_if_cached: bool = True
    skip_mapping_if_cached: bool = True
17
+
18
+
19
class SegmenterConfig(BaseModel):
    """Segmenter settings, exposed as ``HydrationSettings.segmenter``."""

    use_llm: bool = True
    # Model identifier in "provider/model" form.
    llm_model: str = "anthropic/claude-haiku-4-5"
    llm_confidence_threshold: float = 0.7
    max_concurrent_pages: int = 10
24
+
25
+
26
class ClassifierConfig(BaseModel):
    """Classifier settings, exposed as ``HydrationSettings.classifier``."""

    rule_based_confidence_threshold: float = 0.90
    # Cache file locations; defaults are relative paths under ".cache/".
    l1_cache_path: str = ".cache/l1_fingerprints.json"
    l2_cache_path: str = ".cache/l2_clusters.json"
    litellm_batch_size: int = 20
    max_concurrent_batches: int = 5
32
+
33
+
34
class MapperConfig(BaseModel):
    """Mapper settings, exposed as ``HydrationSettings.mapper``."""

    mcp_transport: str = "stdio"
    # Cache file locations; defaults are relative paths under ".cache/".
    signature_index_cache_path: str = ".cache/signature_index.json"
    custom_registry_path: str = ".cache/custom_registry.json"
    litellm_batch_size: int = 15
39
+
40
+
41
class SchemaGenConfig(BaseModel):
    """Schema generation settings, exposed as ``HydrationSettings.schema_gen``."""

    # Model identifiers in "provider/model" form.
    model: str = "anthropic/claude-sonnet-4-5"
    unknown_model: str = "anthropic/claude-haiku-4-5"
    representatives_per_type: int = 3
    unknown_batch_size: int = 20
    max_concurrent_unknown_batches: int = 5
    timeout_seconds: int = 60
    # Unlike the other cache paths in this module, the default has no file
    # extension.
    schema_cache_path: str = ".cache/extraction_schemas"
49
+
50
+
51
class ContentExtractionConfig(BaseModel):
    """Content extraction settings, exposed as ``HydrationSettings.content``."""

    markdown_library: str = "markdownify"
    # HTML tag names; extract_content passes this list to markdownify as
    # its ``convert`` option.
    allowed_tags: list[str] = [
        "p", "strong", "em", "b", "i",
        "ul", "ol", "li", "h1", "h2", "h3", "h4", "h5",
        "a", "blockquote", "code", "pre",
        "table", "thead", "tbody", "tr", "th", "td", "img",
    ]
    # HTML tag names; extract_content removes every matching element from
    # the content container before the Markdown conversion.
    always_strip_tags: list[str] = [
        "script", "style", "iframe", "noscript",
        "nav", "header", "footer", "aside",
    ]
63
+
64
+
65
class URLGenConfig(BaseModel):
    """URL generation settings, exposed as ``HydrationSettings.url_gen``."""

    # Component type string -> URL base path. Several types may share a path.
    type_base_paths: dict[str, str] = {
        "collection.product_card": "/products",
        "collection.product_list": "/products",
        "collection.blog_card": "/blog",
        "collection.blog_list": "/blog",
        "collection.news_item": "/news",
        "content.article": "/articles",
    }
    category_detection_depth: int = 3
    category_min_occurrence: int = 2
    # Same default as SlugGenerator's ``max_length`` parameter (80).
    max_slug_length: int = 80
77
+
78
+
79
class StagingConfig(BaseModel):
    """Staging output settings, exposed as ``HydrationSettings.staging``."""

    # Relative path prefixes.
    astro_content_prefix: str = "src/content"
    astro_pages_prefix: str = "src/pages/_data"
    create_backup_on_overwrite: bool = True
    # File names only, no directory part.
    manifest_filename: str = "hydration_manifest.json"
    redirects_json_filename: str = "redirects.json"
    redirects_csv_filename: str = "redirects.csv"
86
+
87
+
88
class R2Config(BaseModel):
    """R2 storage settings, exposed as ``HydrationSettings.r2``."""

    # Connection values and credentials default to empty strings, so they
    # have to be supplied through configuration to be usable.
    bucket_name: str = ""
    endpoint_url: str = ""
    access_key_id: str = ""
    secret_access_key: str = ""
    public_base_url: str = ""
    key_prefix: str = "images"
    max_concurrent_uploads: int = 10
    max_concurrent_downloads: int = 20
    download_timeout_seconds: int = 30
    upload_timeout_seconds: int = 60
    url_cache_path: str = ".cache/image_url_cache.json"
100
+
101
+
102
class HydrationSettings(BaseSettings):
    """Top-level pipeline settings, loaded from the environment and ``.env``.

    Top-level fields are read from ``HYDRATOR_<FIELD>`` variables. Fields of
    the nested groups can be set individually with a double underscore, e.g.
    ``HYDRATOR_R2__BUCKET_NAME``; a whole group can still be given as JSON
    (``HYDRATOR_R2='{"bucket_name": "..."}'``).
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        env_prefix="HYDRATOR_",
        # Without a delimiter, nested groups can only be set as JSON blobs.
        env_nested_delimiter="__",
        # A shared .env usually holds keys for other tools as well; ignore
        # them instead of failing validation on unknown entries.
        extra="ignore",
    )

    site_id: str = ""

    dirs: PipelineDirsConfig = PipelineDirsConfig()
    resume: ResumeConfig = ResumeConfig()
    segmenter: SegmenterConfig = SegmenterConfig()
    classifier: ClassifierConfig = ClassifierConfig()
    mapper: MapperConfig = MapperConfig()
    schema_gen: SchemaGenConfig = SchemaGenConfig()
    content: ContentExtractionConfig = ContentExtractionConfig()
    url_gen: URLGenConfig = URLGenConfig()
    staging: StagingConfig = StagingConfig()
    r2: R2Config = R2Config()

    # Component-type prefixes; the defaults match the prefixes that select
    # Markdown output in collection/entry_builder.py.
    content_type_prefixes: list[str] = [
        "content.", "collection.blog", "collection.news",
    ]
    max_concurrent_segments: int = 50
File without changes
@@ -0,0 +1,95 @@
1
+ import logging
2
+ import re
3
+ from urllib.parse import urljoin
4
+ from bs4 import BeautifulSoup
5
+ import markdownify
6
+ from data_hydrator.models import ExtractionSchema, ExtractedField
7
+ from data_hydrator.config import ContentExtractionConfig
8
+ from data_hydrator.extraction.field_extractor import extract_fields
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
def extract_content(
    segment_html: str,
    schema: ExtractionSchema,
    page_url: str,
    settings: ContentExtractionConfig,
) -> tuple[list[ExtractedField], str, list[str]]:
    """Extract metadata fields and a Markdown body from one segment.

    Args:
        segment_html: HTML of the segment.
        schema: Field selectors, optional ``content_selector`` and the
            ``exclude_selectors`` to drop before conversion.
        page_url: Base URL used to absolutize links and image sources.
        settings: Tag lists controlling stripping and conversion.

    Returns:
        ``(metadata_fields, body_markdown, missing_required_fields)``.
        Never raises: if the body conversion fails, the body is ``""`` and
        whatever metadata was already extracted is still returned.
    """
    metadata_fields: list[ExtractedField] = []
    missing_required: list[str] = []
    try:
        # Step 1: metadata fields via the schema's field selectors.
        metadata_fields, missing_required = extract_fields(segment_html, schema, page_url)

        # Step 2: parse again for the body; steps 4-5 mutate this tree.
        soup = BeautifulSoup(segment_html, "lxml")

        # Step 3: locate the content container, defaulting to the full segment.
        container = soup
        if schema.content_selector:
            found = soup.select_one(schema.content_selector)
            if found is None:
                logger.warning(
                    "content_selector '%s' found nothing; using full segment",
                    schema.content_selector,
                )
            else:
                container = found

        # Step 4: remove schema-specific noise elements.
        _remove_noise(container, schema.exclude_selectors)

        # Step 5: remove always-strip tags together with their content.
        for tag_name in settings.always_strip_tags:
            for el in container.find_all(tag_name):
                el.decompose()

        # Step 6: convert to Markdown. markdownify rejects `strip` combined
        # with `convert` (ValueError), and the always-strip tags are already
        # gone after step 5, so only `convert` is passed.
        md = markdownify.markdownify(
            str(container),
            convert=settings.allowed_tags,
            heading_style="ATX",
        )

        # Post-process: collapse blank-line runs, trim, absolutize URLs.
        md = re.sub(r"\n{3,}", "\n\n", md)
        md = md.strip()
        md = _resolve_relative_urls(md, page_url)

        return metadata_fields, md, missing_required

    except Exception as exc:
        logger.exception("Content extraction failed: %s", exc)
        # Keep the metadata and missing-required names gathered before the
        # failure so the entry is not silently treated as complete.
        return metadata_fields, "", missing_required
66
+
67
+
68
def _remove_noise(soup: BeautifulSoup, exclude_selectors: list[str]) -> None:
    """Delete every element matching any of ``exclude_selectors``, in place.

    A selector that fails is logged and skipped; the remaining selectors
    are still applied.
    """
    for css in exclude_selectors:
        try:
            matches = soup.select(css)
            for node in matches:
                node.decompose()
        except Exception as exc:
            logger.warning("Exclude selector '%s' failed: %s", css, exc)
75
+
76
+
77
+ def _resolve_relative_urls(markdown: str, base_url: str) -> str:
78
+ if not base_url:
79
+ return markdown
80
+
81
+ def replace_image(m: re.Match) -> str:
82
+ alt, url = m.group(1), m.group(2)
83
+ resolved = urljoin(base_url, url)
84
+ return f"![{alt}]({resolved})"
85
+
86
+ def replace_link(m: re.Match) -> str:
87
+ text, url = m.group(1), m.group(2)
88
+ resolved = urljoin(base_url, url)
89
+ return f"[{text}]({resolved})"
90
+
91
+ # Images first (more specific pattern)
92
+ markdown = re.sub(r"!\[([^\]]*)\]\(([^)]+)\)", replace_image, markdown)
93
+ # Then links
94
+ markdown = re.sub(r"\[([^\]]+)\]\(([^)]+)\)", replace_link, markdown)
95
+ return markdown
@@ -0,0 +1,168 @@
1
+ import logging
2
+ import re
3
+ from urllib.parse import urljoin
4
+ from bs4 import BeautifulSoup
5
+ from data_hydrator.models import ExtractionSchema, ExtractedField, FieldSelector
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+
10
def extract_fields(
    segment_html: str,
    schema: ExtractionSchema,
    page_url: str,
) -> tuple[list[ExtractedField], list[str]]:
    """Run every field selector of ``schema`` against ``segment_html``.

    Returns ``(fields, missing_required_field_names)``. If the HTML cannot
    be parsed, no fields are returned and every required field is reported
    as missing.
    """
    try:
        soup = BeautifulSoup(segment_html, "lxml")
    except Exception as exc:
        logger.error("Failed to parse segment HTML: %s", exc)
        return [], [fs.field_name for fs in schema.field_selectors if fs.required]

    selectors = schema.field_selectors
    fields: list[ExtractedField] = [
        _extract_one(soup, fs, page_url) for fs in selectors
    ]
    # A required selector whose extraction failed counts as missing.
    missing_required: list[str] = [
        fs.field_name
        for fs, extracted in zip(selectors, fields)
        if fs.required and not extracted.extraction_success
    ]
    return fields, missing_required
33
+
34
+
35
def _make_field(
    selector: FieldSelector,
    raw,
    processed,
    success: bool,
    warning: str | None = None,
) -> ExtractedField:
    """Build an ExtractedField for ``selector``; omit ``warning`` when None."""
    extra = {} if warning is None else {"warning": warning}
    return ExtractedField(
        field_name=selector.field_name,
        raw_value=raw,
        processed_value=processed,
        content_type=selector.content_type,
        extraction_success=success,
        **extra,
    )


def _element_url(element, content_type: str, attribute: str | None, page_url: str):
    """Read an image source or link target from ``element``, made absolute."""
    if content_type == "image_url":
        raw = _get_image_src(element, attribute)
    else:
        raw = element.get(attribute or "href", "")
    return urljoin(page_url, raw) if raw else raw


def _extract_one(
    soup: BeautifulSoup,
    selector: FieldSelector,
    page_url: str,
) -> ExtractedField:
    """Extract a single field according to ``selector``.

    Handles, in order: multi-value selectors (``content_type == "list"`` or
    ``selector.multiple``), booleans (element presence), image URLs, link
    URLs, and text (also the fallback for unknown content types).

    Never raises: any error yields a failed ExtractedField whose warning
    holds the error message.
    """
    try:
        # Parsed inside the try so a malformed selector cannot escape.
        css, attr_override = _parse_selector_attr(selector.css_selector)
        attribute = selector.attribute or attr_override
        content_type = selector.content_type

        if content_type == "list" or selector.multiple:
            elements = soup.select(css)
            if not elements:
                return _make_field(
                    selector, None, [], False,
                    f"No elements matched selector '{css}'",
                )
            if content_type in ("image_url", "url"):
                # URL-typed multi-value fields need the attribute value;
                # the element text of an <img> or icon link is empty.
                urls = [
                    url
                    for url in (
                        _element_url(el, content_type, attribute, page_url)
                        for el in elements
                    )
                    if url
                ]
                return _make_field(
                    selector, urls, urls, bool(urls),
                    None if urls else f"No URLs found at selector '{css}'",
                )
            values = [el.get_text(strip=True) for el in elements]
            return _make_field(selector, values, values, True)

        element = soup.select_one(css)

        if content_type == "boolean":
            # Presence of the element is the value; absence is a valid False.
            present = element is not None
            return _make_field(selector, present, present, True)

        if element is None:
            return _make_field(
                selector, None, None, False,
                f"No element matched selector '{css}'",
            )

        if content_type == "image_url":
            src = _element_url(element, content_type, attribute, page_url)
            return _make_field(
                selector, src, src, bool(src),
                "" if src else f"Could not extract image URL at '{css}'",
            )

        if content_type == "url":
            href = _element_url(element, content_type, attribute, page_url)
            return _make_field(
                selector, href, href, bool(href),
                "" if href else f"No href at selector '{css}'",
            )

        text = element.get_text(strip=True)
        if content_type == "text":
            return _make_field(
                selector, text, text, bool(text),
                "" if text else f"Empty text at selector '{css}'",
            )

        # Fallback: unknown content types are treated as text. A failure now
        # carries a warning, like the explicit text branch.
        return _make_field(
            selector, text, text, bool(text),
            None if text else f"Empty text at selector '{css}'",
        )

    except Exception as exc:
        logger.warning("Field extraction failed for %s: %s", selector.field_name, exc)
        return _make_field(selector, None, None, False, str(exc))
144
+
145
+
146
+ def _get_image_src(element, attribute: str | None) -> str:
147
+ """Try multiple src attributes for lazy-loaded images."""
148
+ candidates = [attribute] if attribute else []
149
+ candidates += ["src", "data-src", "data-lazy-src"]
150
+ for attr in candidates:
151
+ if attr and element.get(attr):
152
+ return element[attr]
153
+
154
+ # Try background-image from style
155
+ style = element.get("style", "")
156
+ if "background-image" in style:
157
+ match = re.search(r"url\(['\"]?([^'\")\s]+)['\"]?\)", style)
158
+ if match:
159
+ return match.group(1)
160
+ return ""
161
+
162
+
163
+ def _parse_selector_attr(segment_field: str) -> tuple[str, str | None]:
164
+ """Split 'img[src]' → ('img', 'src'), 'h2.name' → ('h2.name', None)."""
165
+ match = re.match(r"^(.*?)\[([^\]]+)\]$", segment_field)
166
+ if match:
167
+ return match.group(1), match.group(2)
168
+ return segment_field, None