corp-extractor 0.5.0__py3-none-any.whl → 0.9.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/METADATA +228 -30
  2. corp_extractor-0.9.3.dist-info/RECORD +79 -0
  3. statement_extractor/__init__.py +1 -1
  4. statement_extractor/cli.py +2030 -24
  5. statement_extractor/data/statement_taxonomy.json +6949 -1159
  6. statement_extractor/database/__init__.py +52 -0
  7. statement_extractor/database/embeddings.py +186 -0
  8. statement_extractor/database/hub.py +428 -0
  9. statement_extractor/database/importers/__init__.py +32 -0
  10. statement_extractor/database/importers/companies_house.py +559 -0
  11. statement_extractor/database/importers/companies_house_officers.py +431 -0
  12. statement_extractor/database/importers/gleif.py +561 -0
  13. statement_extractor/database/importers/sec_edgar.py +392 -0
  14. statement_extractor/database/importers/sec_form4.py +512 -0
  15. statement_extractor/database/importers/wikidata.py +1120 -0
  16. statement_extractor/database/importers/wikidata_dump.py +1951 -0
  17. statement_extractor/database/importers/wikidata_people.py +1130 -0
  18. statement_extractor/database/models.py +254 -0
  19. statement_extractor/database/resolver.py +245 -0
  20. statement_extractor/database/store.py +3034 -0
  21. statement_extractor/document/__init__.py +62 -0
  22. statement_extractor/document/chunker.py +410 -0
  23. statement_extractor/document/context.py +171 -0
  24. statement_extractor/document/deduplicator.py +171 -0
  25. statement_extractor/document/html_extractor.py +246 -0
  26. statement_extractor/document/loader.py +303 -0
  27. statement_extractor/document/pipeline.py +388 -0
  28. statement_extractor/document/summarizer.py +195 -0
  29. statement_extractor/extractor.py +1 -1
  30. statement_extractor/models/__init__.py +19 -3
  31. statement_extractor/models/canonical.py +44 -1
  32. statement_extractor/models/document.py +308 -0
  33. statement_extractor/models/labels.py +47 -18
  34. statement_extractor/models/qualifiers.py +51 -3
  35. statement_extractor/models/statement.py +39 -15
  36. statement_extractor/models.py +1 -1
  37. statement_extractor/pipeline/config.py +6 -11
  38. statement_extractor/pipeline/context.py +5 -5
  39. statement_extractor/pipeline/orchestrator.py +90 -121
  40. statement_extractor/pipeline/registry.py +52 -46
  41. statement_extractor/plugins/__init__.py +20 -8
  42. statement_extractor/plugins/base.py +348 -78
  43. statement_extractor/plugins/extractors/gliner2.py +38 -28
  44. statement_extractor/plugins/labelers/taxonomy.py +18 -5
  45. statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
  46. statement_extractor/plugins/pdf/__init__.py +10 -0
  47. statement_extractor/plugins/pdf/pypdf.py +291 -0
  48. statement_extractor/plugins/qualifiers/__init__.py +11 -0
  49. statement_extractor/plugins/qualifiers/companies_house.py +14 -3
  50. statement_extractor/plugins/qualifiers/embedding_company.py +422 -0
  51. statement_extractor/plugins/qualifiers/gleif.py +14 -3
  52. statement_extractor/plugins/qualifiers/person.py +588 -14
  53. statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
  54. statement_extractor/plugins/scrapers/__init__.py +10 -0
  55. statement_extractor/plugins/scrapers/http.py +236 -0
  56. statement_extractor/plugins/splitters/t5_gemma.py +176 -75
  57. statement_extractor/plugins/taxonomy/embedding.py +193 -46
  58. statement_extractor/plugins/taxonomy/mnli.py +16 -4
  59. statement_extractor/scoring.py +8 -8
  60. corp_extractor-0.5.0.dist-info/RECORD +0 -55
  61. statement_extractor/plugins/canonicalizers/__init__.py +0 -17
  62. statement_extractor/plugins/canonicalizers/base.py +0 -9
  63. statement_extractor/plugins/canonicalizers/location.py +0 -219
  64. statement_extractor/plugins/canonicalizers/organization.py +0 -230
  65. statement_extractor/plugins/canonicalizers/person.py +0 -242
  66. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/WHEEL +0 -0
  67. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/entry_points.txt +0 -0
statement_extractor/document/deduplicator.py
@@ -0,0 +1,171 @@
+ """
+ StatementDeduplicator - Hash-based deduplication for statements.
+
+ Removes duplicate statements across chunks using normalized hashing.
+ Works with Stage 2+ output (PipelineStatement, LabeledStatement) which
+ have subject-predicate-object structure.
+ """
+
+ import hashlib
+ import logging
+ from typing import TypeVar, Union
+
+ from ..models.labels import LabeledStatement
+ from ..models.statement import PipelineStatement
+
+ logger = logging.getLogger(__name__)
+
+ # Type variable for generic deduplication
+ T = TypeVar("T", PipelineStatement, LabeledStatement)
+
+
+ class StatementDeduplicator:
+     """
+     Deduplicates statements using normalized hash comparison.
+
+     Uses a hash of normalized (subject, predicate, object) to identify
+     duplicates. Keeps the first occurrence of each unique statement.
+
+     Works with PipelineStatement (Stage 2) and LabeledStatement (Stage 4).
+     """
+
+     def __init__(self):
+         """Initialize the deduplicator."""
+         self._seen_hashes: set[str] = set()
+
+     def reset(self) -> None:
+         """Reset the deduplicator state, clearing all seen hashes."""
+         self._seen_hashes.clear()
+         logger.debug("Deduplicator state reset")
+
+     def _normalize_text(self, text: str) -> str:
+         """
+         Normalize text for comparison.
+
+         - Lowercase
+         - Strip whitespace
+         - Collapse multiple spaces
+         """
+         return " ".join(text.lower().strip().split())
+
+     def _get_triple_parts(
+         self,
+         stmt: Union[PipelineStatement, LabeledStatement],
+     ) -> tuple[str, str, str]:
+         """
+         Extract (subject, predicate, object) from a statement.
+
+         Handles different statement types consistently.
+         """
+         if isinstance(stmt, LabeledStatement):
+             return (
+                 stmt.statement.subject.text,
+                 stmt.statement.predicate,
+                 stmt.statement.object.text,
+             )
+         else:
+             # PipelineStatement
+             return (
+                 stmt.subject.text,
+                 stmt.predicate,
+                 stmt.object.text,
+             )
+
+     def _hash_triple(
+         self,
+         stmt: Union[PipelineStatement, LabeledStatement],
+     ) -> str:
+         """
+         Generate a hash for a statement triple.
+
+         Uses normalized text to catch near-duplicates with different
+         casing or whitespace.
+         """
+         subj, pred, obj = self._get_triple_parts(stmt)
+
+         key = (
+             self._normalize_text(subj),
+             self._normalize_text(pred),
+             self._normalize_text(obj),
+         )
+
+         # Use sha256 and truncate to 16 chars for reasonable uniqueness
+         return hashlib.sha256(str(key).encode()).hexdigest()[:16]
+
+     def is_duplicate(
+         self,
+         stmt: Union[PipelineStatement, LabeledStatement],
+     ) -> bool:
+         """
+         Check if a statement is a duplicate.
+
+         Also marks the statement as seen if it's not a duplicate.
+
+         Args:
+             stmt: Statement to check
+
+         Returns:
+             True if this is a duplicate of a previously seen statement
+         """
+         hash_value = self._hash_triple(stmt)
+
+         if hash_value in self._seen_hashes:
+             return True
+
+         self._seen_hashes.add(hash_value)
+         return False
+
+     def filter_duplicates(self, statements: list[T]) -> list[T]:
+         """
+         Filter out duplicate statements from a list.
+
+         Preserves order and keeps the first occurrence of each unique statement.
+
+         Args:
+             statements: List of statements to deduplicate
+
+         Returns:
+             List with duplicates removed
+         """
+         if not statements:
+             return []
+
+         original_count = len(statements)
+         result = []
+
+         for stmt in statements:
+             if not self.is_duplicate(stmt):
+                 result.append(stmt)
+
+         removed = original_count - len(result)
+         if removed > 0:
+             logger.info(f"Deduplication removed {removed} statements ({len(result)} remaining)")
+
+         return result
+
+     def deduplicate_batch(
+         self,
+         statements: list[T],
+         reset_first: bool = True,
+     ) -> list[T]:
+         """
+         Deduplicate a batch of statements.
+
+         Optionally resets state before processing to ensure clean deduplication.
+
+         Args:
+             statements: List of statements to deduplicate
+             reset_first: Whether to reset seen hashes before processing
+
+         Returns:
+             Deduplicated list of statements
+         """
+         if reset_first:
+             self.reset()
+
+         return self.filter_duplicates(statements)
+
+     @property
+     def seen_count(self) -> int:
+         """Get the number of unique statements seen."""
+         return len(self._seen_hashes)
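For reference, a minimal usage sketch of the new StatementDeduplicator. Here `statements` is assumed to be a list of PipelineStatement or LabeledStatement objects produced by an earlier pipeline stage, and `chunks` and `process()` are hypothetical placeholders, not names from the package.

    from statement_extractor.document.deduplicator import StatementDeduplicator

    dedup = StatementDeduplicator()

    # Batch mode: reset state, then drop duplicates while preserving order.
    unique = dedup.deduplicate_batch(statements, reset_first=True)
    print(f"kept {dedup.seen_count} unique statements")

    # Streaming mode: reuse one instance across chunks so duplicates that
    # recur in later chunks are skipped as well.
    for chunk_statements in chunks:      # chunks: assumed iterable of statement lists
        for stmt in chunk_statements:
            if not dedup.is_duplicate(stmt):
                process(stmt)            # process(): placeholder for downstream handling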
statement_extractor/document/html_extractor.py
@@ -0,0 +1,246 @@
+ """
+ HTML text extraction utilities.
+
+ Extracts clean text content from HTML pages, prioritizing article content
+ and removing navigation, headers, footers, and other non-content elements.
+ """
+
+ import logging
+ import re
+ from typing import Optional
+
+ logger = logging.getLogger(__name__)
+
+
+ def extract_text_from_html(html: str) -> tuple[str, Optional[str]]:
+     """
+     Extract clean text and title from HTML.
+
+     Removes scripts, styles, navigation, and other non-content elements.
+     Prioritizes article or main content areas.
+
+     Args:
+         html: Raw HTML string
+
+     Returns:
+         Tuple of (extracted_text, title or None)
+     """
+     try:
+         from bs4 import BeautifulSoup
+     except ImportError:
+         raise ImportError(
+             "BeautifulSoup is required for HTML extraction. "
+             "Install with: pip install beautifulsoup4"
+         )
+
+     soup = BeautifulSoup(html, "html.parser")
+
+     # Remove unwanted elements
+     for tag_name in [
+         "script",
+         "style",
+         "nav",
+         "footer",
+         "header",
+         "aside",
+         "noscript",
+         "iframe",
+         "form",
+         "button",
+         "input",
+         "select",
+         "textarea",
+     ]:
+         for tag in soup.find_all(tag_name):
+             tag.decompose()
+
+     # Remove elements with common non-content class/id patterns
+     non_content_patterns = [
+         "nav",
+         "menu",
+         "sidebar",
+         "footer",
+         "header",
+         "comment",
+         "advertisement",
+         "ad-",
+         "social",
+         "share",
+         "related",
+         "recommended",
+         "popup",
+         "modal",
+         "cookie",
+         "banner",
+         "promo",
+     ]
+
+     # Collect elements to remove first, then decompose
+     # (decomposing while iterating can cause issues)
+     elements_to_remove = []
+
+     for element in soup.find_all(class_=True):
+         if element.attrs is None:
+             continue
+         classes = element.get("class", [])
+         if classes:
+             class_str = " ".join(classes).lower()
+             if any(pattern in class_str for pattern in non_content_patterns):
+                 elements_to_remove.append(element)
+
+     for element in soup.find_all(id=True):
+         if element.attrs is None:
+             continue
+         element_id = element.get("id", "")
+         if element_id and any(pattern in element_id.lower() for pattern in non_content_patterns):
+             elements_to_remove.append(element)
+
+     for element in elements_to_remove:
+         try:
+             element.decompose()
+         except Exception:
+             pass # Element may already be decomposed
+
+     # Get title
+     title = None
+     if soup.title and soup.title.string:
+         title = soup.title.string.strip()
+         # Clean up common title patterns (e.g., "Article Title | Site Name")
+         title = re.split(r"\s*[|—\-]\s*", title)[0].strip()
+
+     # Find main content area
+     content = None
+
+     # Priority: article > main > [role="main"] > body
+     for selector in ["article", "main", "[role='main']", ".content", "#content"]:
+         content = soup.select_one(selector)
+         if content and len(content.get_text(strip=True)) > 100:
+             break
+
+     if not content:
+         content = soup.body or soup
+
+     # Extract text using BeautifulSoup's get_text with newline separator
+     text = content.get_text(separator="\n", strip=True)
+
+     # Clean up whitespace
+     text = _clean_whitespace(text)
+
+     logger.debug(f"Extracted {len(text)} chars from HTML (title: {title})")
+
+     return text, title
+
+
+ def _clean_whitespace(text: str) -> str:
+     """
+     Clean up whitespace while preserving paragraph structure.
+
+     Args:
+         text: Raw extracted text
+
+     Returns:
+         Cleaned text
+     """
+     # Normalize line breaks
+     text = re.sub(r"\r\n?", "\n", text)
+
+     # Collapse multiple spaces (but not newlines)
+     text = re.sub(r"[^\S\n]+", " ", text)
+
+     # Collapse multiple newlines to max 2
+     text = re.sub(r"\n{3,}", "\n\n", text)
+
+     # Remove leading/trailing whitespace from each line
+     lines = [line.strip() for line in text.split("\n")]
+     text = "\n".join(lines)
+
+     # Remove empty lines at start/end
+     text = text.strip()
+
+     return text
+
+
+ def extract_article_content(html: str) -> tuple[str, dict]:
+     """
+     Extract article content with metadata.
+
+     Attempts to extract structured article data including:
+     - Title
+     - Author
+     - Published date
+     - Main content
+
+     Args:
+         html: Raw HTML string
+
+     Returns:
+         Tuple of (content, metadata dict)
+     """
+     try:
+         from bs4 import BeautifulSoup
+     except ImportError:
+         raise ImportError(
+             "BeautifulSoup is required for HTML extraction. "
+             "Install with: pip install beautifulsoup4"
+         )
+
+     soup = BeautifulSoup(html, "html.parser")
+
+     metadata = {}
+
+     # Extract title
+     title = None
+     # Try og:title first
+     og_title = soup.find("meta", property="og:title")
+     if og_title and og_title.get("content"):
+         title = og_title["content"].strip()
+     elif soup.title and soup.title.string:
+         title = soup.title.string.strip()
+
+     if title:
+         metadata["title"] = title
+
+     # Extract author
+     author = None
+     author_meta = soup.find("meta", attrs={"name": "author"})
+     if author_meta and author_meta.get("content"):
+         author = author_meta["content"].strip()
+     else:
+         # Try common author class patterns
+         author_elem = soup.select_one(".author, .byline, [rel='author']")
+         if author_elem:
+             author = author_elem.get_text(strip=True)
+
+     if author:
+         metadata["author"] = author
+
+     # Extract published date
+     date = None
+     date_meta = soup.find("meta", property="article:published_time")
+     if date_meta and date_meta.get("content"):
+         date = date_meta["content"]
+     else:
+         date_elem = soup.select_one("time[datetime], .date, .published")
+         if date_elem:
+             date = date_elem.get("datetime") or date_elem.get_text(strip=True)
+
+     if date:
+         metadata["published_date"] = date
+
+     # Extract description
+     description = None
+     desc_meta = soup.find("meta", attrs={"name": "description"})
+     if desc_meta and desc_meta.get("content"):
+         description = desc_meta["content"].strip()
+
+     if description:
+         metadata["description"] = description
+
+     # Extract main content
+     content, extracted_title = extract_text_from_html(html)
+
+     # Use extracted title if we didn't find one
+     if not title and extracted_title:
+         metadata["title"] = extracted_title
+
+     return content, metadata
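For reference, a small self-contained sketch exercising the new HTML helpers on an inline document; the markup and values are invented for illustration, and only beautifulsoup4 is required beyond the package itself.

    from statement_extractor.document.html_extractor import (
        extract_article_content,
        extract_text_from_html,
    )

    html = (
        "<html><head>"
        "<title>Acme Corp results | Example News</title>"
        '<meta name="author" content="Jane Doe">'
        "</head><body>"
        "<nav>Home | Markets</nav>"
        "<article><p>Acme Corp reported record revenue this quarter.</p></article>"
        "<footer>Contact us</footer>"
        "</body></html>"
    )

    text, title = extract_text_from_html(html)
    print(title)  # "Acme Corp results" (site suffix split off the <title>)
    print(text)   # article text only; <nav> and <footer> are decomposed

    content, metadata = extract_article_content(html)
    print(metadata["author"])  # "Jane Doe"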
statement_extractor/document/loader.py
@@ -0,0 +1,303 @@
+ """
+ URL loader for fetching and parsing web content.
+
+ Orchestrates scraper and PDF parser plugins to load documents from URLs.
+ """
+
+ import asyncio
+ import logging
+ from typing import Optional
+
+ from pydantic import BaseModel, Field
+
+ from ..models.document import Document
+ from ..pipeline.registry import PluginRegistry
+ from ..plugins.base import (
+     BaseScraperPlugin,
+     BasePDFParserPlugin,
+     ContentType,
+     ScraperResult,
+ )
+ from .html_extractor import extract_text_from_html, extract_article_content
+
+ logger = logging.getLogger(__name__)
+
+
+ class URLLoaderConfig(BaseModel):
+     """Configuration for URL loading."""
+
+     timeout: float = Field(
+         default=30.0,
+         description="Request timeout in seconds"
+     )
+     use_ocr: bool = Field(
+         default=False,
+         description="Force OCR for PDF parsing"
+     )
+     max_pdf_pages: int = Field(
+         default=500,
+         description="Maximum pages to extract from PDFs"
+     )
+     scraper_plugin: Optional[str] = Field(
+         default=None,
+         description="Specific scraper plugin to use (None = auto-select)"
+     )
+     pdf_parser_plugin: Optional[str] = Field(
+         default=None,
+         description="Specific PDF parser plugin to use (None = auto-select)"
+     )
+     extract_metadata: bool = Field(
+         default=True,
+         description="Extract article metadata from HTML pages"
+     )
+
+
+ class URLLoader:
+     """
+     Loads documents from URLs using scraper and PDF parser plugins.
+
+     Orchestrates the content acquisition process:
+     1. Fetch content using a scraper plugin
+     2. Detect content type (HTML vs PDF)
+     3. Parse content using appropriate parser
+     4. Create a Document object
+
+     Example:
+         >>> loader = URLLoader()
+         >>> document = await loader.load("https://example.com/article")
+         >>> print(document.title)
+
+         >>> # Synchronous usage
+         >>> document = loader.load_sync("https://example.com/report.pdf")
+     """
+
+     def __init__(self, config: Optional[URLLoaderConfig] = None):
+         """
+         Initialize the URL loader.
+
+         Args:
+             config: Loader configuration
+         """
+         self.config = config or URLLoaderConfig()
+         self._scraper: Optional[BaseScraperPlugin] = None
+         self._pdf_parser: Optional[BasePDFParserPlugin] = None
+
+     def _get_scraper(self) -> BaseScraperPlugin:
+         """Get the scraper plugin to use."""
+         if self._scraper is not None:
+             return self._scraper
+
+         scrapers = PluginRegistry.get_scrapers()
+         if not scrapers:
+             raise RuntimeError(
+                 "No scraper plugins registered. "
+                 "Ensure plugins are loaded via 'from statement_extractor import plugins'"
+             )
+
+         if self.config.scraper_plugin:
+             for scraper in scrapers:
+                 if scraper.name == self.config.scraper_plugin:
+                     self._scraper = scraper
+                     return scraper
+             raise ValueError(f"Scraper plugin not found: {self.config.scraper_plugin}")
+
+         # Use first available (highest priority)
+         self._scraper = scrapers[0]
+         return self._scraper
+
+     def _get_pdf_parser(self) -> BasePDFParserPlugin:
+         """Get the PDF parser plugin to use."""
+         if self._pdf_parser is not None:
+             return self._pdf_parser
+
+         parsers = PluginRegistry.get_pdf_parsers()
+         if not parsers:
+             raise RuntimeError(
+                 "No PDF parser plugins registered. "
+                 "Ensure plugins are loaded via 'from statement_extractor import plugins'"
+             )
+
+         if self.config.pdf_parser_plugin:
+             for parser in parsers:
+                 if parser.name == self.config.pdf_parser_plugin:
+                     self._pdf_parser = parser
+                     return parser
+             raise ValueError(f"PDF parser plugin not found: {self.config.pdf_parser_plugin}")
+
+         # Use first available (highest priority)
+         self._pdf_parser = parsers[0]
+         return self._pdf_parser
+
+     async def load(self, url: str) -> Document:
+         """
+         Load a URL and return a Document.
+
+         Args:
+             url: URL to load
+
+         Returns:
+             Document with extracted content
+
+         Raises:
+             ValueError: If URL cannot be fetched or parsed
+         """
+         logger.info(f"Loading URL: {url}")
+
+         # 1. Fetch content
+         scraper = self._get_scraper()
+         result = await scraper.fetch(url, self.config.timeout)
+
+         if not result.ok:
+             raise ValueError(f"Failed to fetch {url}: {result.error}")
+
+         logger.debug(f"Fetched {len(result.content)} bytes, type: {result.content_type}")
+
+         # 2. Process based on content type
+         if result.content_type == ContentType.PDF:
+             return self._process_pdf(result)
+         elif result.content_type == ContentType.HTML:
+             return self._process_html(result)
+         else:
+             # Try to guess based on content
+             if result.content[:5] == b"%PDF-":
+                 return self._process_pdf(result)
+             # Default to HTML
+             return self._process_html(result)
+
+     def load_sync(self, url: str) -> Document:
+         """
+         Synchronous wrapper for load().
+
+         Args:
+             url: URL to load
+
+         Returns:
+             Document with extracted content
+         """
+         return asyncio.run(self.load(url))
+
+     def _process_pdf(self, result: ScraperResult) -> Document:
+         """
+         Convert PDF to Document with pages.
+
+         Args:
+             result: ScraperResult containing PDF bytes
+
+         Returns:
+             Document with PDF content
+         """
+         logger.info(f"Processing PDF from {result.final_url}")
+
+         parser = self._get_pdf_parser()
+         parse_result = parser.parse(
+             result.content,
+             max_pages=self.config.max_pdf_pages,
+             use_ocr=self.config.use_ocr,
+         )
+
+         if not parse_result.ok:
+             raise ValueError(f"Failed to parse PDF: {parse_result.error}")
+
+         logger.info(f"Extracted {len(parse_result.pages)} pages from PDF")
+
+         # Create Document from pages
+         kwargs = {
+             "pages": parse_result.pages,
+             "title": parse_result.metadata.get("title"),
+             "source_type": "pdf",
+             "url": result.final_url,
+         }
+         author = parse_result.metadata.get("author")
+         if author:
+             kwargs["authors"] = [author]
+
+         return Document.from_pages(**kwargs)
+
+     def _process_html(self, result: ScraperResult) -> Document:
+         """
+         Convert HTML to Document (single page).
+
+         Args:
+             result: ScraperResult containing HTML bytes
+
+         Returns:
+             Document with HTML content
+         """
+         logger.info(f"Processing HTML from {result.final_url}")
+
+         # Decode HTML
+         try:
+             html = result.content.decode("utf-8", errors="replace")
+         except Exception as e:
+             raise ValueError(f"Failed to decode HTML: {e}")
+
+         # Extract text and metadata
+         if self.config.extract_metadata:
+             text, metadata = extract_article_content(html)
+             title = metadata.get("title")
+             author = metadata.get("author")
+             # Log extracted metadata
+             logger.debug(f"Extracted metadata: {metadata}")
+         else:
+             text, title = extract_text_from_html(html)
+             author = None
+             metadata = {}
+
+         if not text or len(text.strip()) < 50:
+             raise ValueError("No meaningful content extracted from HTML")
+
+         logger.info(f"Extracted {len(text)} chars from HTML")
+         if title:
+             logger.info(f" Title: {title}")
+         if author:
+             logger.info(f" Author: {author}")
+         if metadata.get("published_date"):
+             logger.info(f" Published: {metadata.get('published_date')}")
+
+         # Create Document using from_pages since from_text forces source_type="text"
+         kwargs = {
+             "pages": [text],
+             "title": title,
+             "source_type": "webpage",
+             "url": result.final_url,
+         }
+         if author:
+             kwargs["authors"] = [author]
+
+         return Document.from_pages(**kwargs)
+
+
+ async def load_url(
+     url: str,
+     config: Optional[URLLoaderConfig] = None,
+ ) -> Document:
+     """
+     Convenience function to load a URL.
+
+     Args:
+         url: URL to load
+
+     Returns:
+         Document with extracted content
+     """
+     loader = URLLoader(config)
+     return await loader.load(url)
+
+
+ def load_url_sync(
+     url: str,
+     config: Optional[URLLoaderConfig] = None,
+ ) -> Document:
+     """
+     Convenience function to load a URL synchronously.
+
+     Args:
+         url: URL to load
+
+     Returns:
+         Document with extracted content
+     """
+     loader = URLLoader(config)
+     return loader.load_sync(url)
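Finally, a sketch of the new loader entry points. The URLs are placeholders; network access and at least one registered scraper and PDF parser plugin are assumed (the loader's own error message points at importing the plugins subpackage to register them).

    from statement_extractor import plugins  # noqa: F401  (registers scraper/PDF parser plugins)
    from statement_extractor.document.loader import (
        URLLoader,
        URLLoaderConfig,
        load_url_sync,
    )

    # One-shot synchronous load with the defaults.
    doc = load_url_sync("https://example.com/report.pdf")
    print(doc.title)

    # Reusable loader with explicit limits; async callers would use
    # `await load_url(url, config)` or `await URLLoader(config).load(url)`.
    config = URLLoaderConfig(timeout=15.0, max_pdf_pages=50, extract_metadata=True)
    loader = URLLoader(config)
    doc = loader.load_sync("https://example.com/article")
    print(doc.title)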