corp-extractor 0.5.0-py3-none-any.whl → 0.9.3-py3-none-any.whl
This diff shows the contents of publicly released package versions as they appear in their respective public registries and is provided for informational purposes only.
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/METADATA +228 -30
- corp_extractor-0.9.3.dist-info/RECORD +79 -0
- statement_extractor/__init__.py +1 -1
- statement_extractor/cli.py +2030 -24
- statement_extractor/data/statement_taxonomy.json +6949 -1159
- statement_extractor/database/__init__.py +52 -0
- statement_extractor/database/embeddings.py +186 -0
- statement_extractor/database/hub.py +428 -0
- statement_extractor/database/importers/__init__.py +32 -0
- statement_extractor/database/importers/companies_house.py +559 -0
- statement_extractor/database/importers/companies_house_officers.py +431 -0
- statement_extractor/database/importers/gleif.py +561 -0
- statement_extractor/database/importers/sec_edgar.py +392 -0
- statement_extractor/database/importers/sec_form4.py +512 -0
- statement_extractor/database/importers/wikidata.py +1120 -0
- statement_extractor/database/importers/wikidata_dump.py +1951 -0
- statement_extractor/database/importers/wikidata_people.py +1130 -0
- statement_extractor/database/models.py +254 -0
- statement_extractor/database/resolver.py +245 -0
- statement_extractor/database/store.py +3034 -0
- statement_extractor/document/__init__.py +62 -0
- statement_extractor/document/chunker.py +410 -0
- statement_extractor/document/context.py +171 -0
- statement_extractor/document/deduplicator.py +171 -0
- statement_extractor/document/html_extractor.py +246 -0
- statement_extractor/document/loader.py +303 -0
- statement_extractor/document/pipeline.py +388 -0
- statement_extractor/document/summarizer.py +195 -0
- statement_extractor/extractor.py +1 -1
- statement_extractor/models/__init__.py +19 -3
- statement_extractor/models/canonical.py +44 -1
- statement_extractor/models/document.py +308 -0
- statement_extractor/models/labels.py +47 -18
- statement_extractor/models/qualifiers.py +51 -3
- statement_extractor/models/statement.py +39 -15
- statement_extractor/models.py +1 -1
- statement_extractor/pipeline/config.py +6 -11
- statement_extractor/pipeline/context.py +5 -5
- statement_extractor/pipeline/orchestrator.py +90 -121
- statement_extractor/pipeline/registry.py +52 -46
- statement_extractor/plugins/__init__.py +20 -8
- statement_extractor/plugins/base.py +348 -78
- statement_extractor/plugins/extractors/gliner2.py +38 -28
- statement_extractor/plugins/labelers/taxonomy.py +18 -5
- statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
- statement_extractor/plugins/pdf/__init__.py +10 -0
- statement_extractor/plugins/pdf/pypdf.py +291 -0
- statement_extractor/plugins/qualifiers/__init__.py +11 -0
- statement_extractor/plugins/qualifiers/companies_house.py +14 -3
- statement_extractor/plugins/qualifiers/embedding_company.py +422 -0
- statement_extractor/plugins/qualifiers/gleif.py +14 -3
- statement_extractor/plugins/qualifiers/person.py +588 -14
- statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
- statement_extractor/plugins/scrapers/__init__.py +10 -0
- statement_extractor/plugins/scrapers/http.py +236 -0
- statement_extractor/plugins/splitters/t5_gemma.py +176 -75
- statement_extractor/plugins/taxonomy/embedding.py +193 -46
- statement_extractor/plugins/taxonomy/mnli.py +16 -4
- statement_extractor/scoring.py +8 -8
- corp_extractor-0.5.0.dist-info/RECORD +0 -55
- statement_extractor/plugins/canonicalizers/__init__.py +0 -17
- statement_extractor/plugins/canonicalizers/base.py +0 -9
- statement_extractor/plugins/canonicalizers/location.py +0 -219
- statement_extractor/plugins/canonicalizers/organization.py +0 -230
- statement_extractor/plugins/canonicalizers/person.py +0 -242
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/WHEEL +0 -0
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/entry_points.txt +0 -0

statement_extractor/document/deduplicator.py (new file)
@@ -0,0 +1,171 @@
"""
StatementDeduplicator - Hash-based deduplication for statements.

Removes duplicate statements across chunks using normalized hashing.
Works with Stage 2+ output (PipelineStatement, LabeledStatement) which
have subject-predicate-object structure.
"""

import hashlib
import logging
from typing import TypeVar, Union

from ..models.labels import LabeledStatement
from ..models.statement import PipelineStatement

logger = logging.getLogger(__name__)

# Type variable for generic deduplication
T = TypeVar("T", PipelineStatement, LabeledStatement)


class StatementDeduplicator:
    """
    Deduplicates statements using normalized hash comparison.

    Uses a hash of normalized (subject, predicate, object) to identify
    duplicates. Keeps the first occurrence of each unique statement.

    Works with PipelineStatement (Stage 2) and LabeledStatement (Stage 4).
    """

    def __init__(self):
        """Initialize the deduplicator."""
        self._seen_hashes: set[str] = set()

    def reset(self) -> None:
        """Reset the deduplicator state, clearing all seen hashes."""
        self._seen_hashes.clear()
        logger.debug("Deduplicator state reset")

    def _normalize_text(self, text: str) -> str:
        """
        Normalize text for comparison.

        - Lowercase
        - Strip whitespace
        - Collapse multiple spaces
        """
        return " ".join(text.lower().strip().split())

    def _get_triple_parts(
        self,
        stmt: Union[PipelineStatement, LabeledStatement],
    ) -> tuple[str, str, str]:
        """
        Extract (subject, predicate, object) from a statement.

        Handles different statement types consistently.
        """
        if isinstance(stmt, LabeledStatement):
            return (
                stmt.statement.subject.text,
                stmt.statement.predicate,
                stmt.statement.object.text,
            )
        else:
            # PipelineStatement
            return (
                stmt.subject.text,
                stmt.predicate,
                stmt.object.text,
            )

    def _hash_triple(
        self,
        stmt: Union[PipelineStatement, LabeledStatement],
    ) -> str:
        """
        Generate a hash for a statement triple.

        Uses normalized text to catch near-duplicates with different
        casing or whitespace.
        """
        subj, pred, obj = self._get_triple_parts(stmt)

        key = (
            self._normalize_text(subj),
            self._normalize_text(pred),
            self._normalize_text(obj),
        )

        # Use sha256 and truncate to 16 chars for reasonable uniqueness
        return hashlib.sha256(str(key).encode()).hexdigest()[:16]

    def is_duplicate(
        self,
        stmt: Union[PipelineStatement, LabeledStatement],
    ) -> bool:
        """
        Check if a statement is a duplicate.

        Also marks the statement as seen if it's not a duplicate.

        Args:
            stmt: Statement to check

        Returns:
            True if this is a duplicate of a previously seen statement
        """
        hash_value = self._hash_triple(stmt)

        if hash_value in self._seen_hashes:
            return True

        self._seen_hashes.add(hash_value)
        return False

    def filter_duplicates(self, statements: list[T]) -> list[T]:
        """
        Filter out duplicate statements from a list.

        Preserves order and keeps the first occurrence of each unique statement.

        Args:
            statements: List of statements to deduplicate

        Returns:
            List with duplicates removed
        """
        if not statements:
            return []

        original_count = len(statements)
        result = []

        for stmt in statements:
            if not self.is_duplicate(stmt):
                result.append(stmt)

        removed = original_count - len(result)
        if removed > 0:
            logger.info(f"Deduplication removed {removed} statements ({len(result)} remaining)")

        return result

    def deduplicate_batch(
        self,
        statements: list[T],
        reset_first: bool = True,
    ) -> list[T]:
        """
        Deduplicate a batch of statements.

        Optionally resets state before processing to ensure clean deduplication.

        Args:
            statements: List of statements to deduplicate
            reset_first: Whether to reset seen hashes before processing

        Returns:
            Deduplicated list of statements
        """
        if reset_first:
            self.reset()

        return self.filter_duplicates(statements)

    @property
    def seen_count(self) -> int:
        """Get the number of unique statements seen."""
        return len(self._seen_hashes)
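
For orientation, a minimal usage sketch of the deduplicator above. The import path is assumed from the file listing (statement_extractor/document/deduplicator.py); the SimpleNamespace helper is a hypothetical stand-in shaped like a PipelineStatement (subject.text / predicate / object.text), used only because the statement constructors are not shown in this diff.

from types import SimpleNamespace
from statement_extractor.document.deduplicator import StatementDeduplicator

# Hypothetical stand-in objects shaped like PipelineStatement for illustration only.
def stmt(subj, pred, obj):
    return SimpleNamespace(
        subject=SimpleNamespace(text=subj),
        predicate=pred,
        object=SimpleNamespace(text=obj),
    )

statements = [
    stmt("Acme Corp", "acquired", "Widget Ltd"),
    stmt("acme corp", "acquired", "widget  ltd"),  # same triple after normalization
    stmt("Acme Corp", "appointed", "Jane Doe"),
]

dedup = StatementDeduplicator()
# reset_first=True clears any previously seen hashes before filtering the batch.
unique = dedup.deduplicate_batch(statements, reset_first=True)
print(len(unique), dedup.seen_count)  # 2 unique triples kept

The normalized hashing is what collapses the second entry: casing and extra whitespace are stripped before the (subject, predicate, object) key is hashed.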

statement_extractor/document/html_extractor.py (new file)
@@ -0,0 +1,246 @@
"""
HTML text extraction utilities.

Extracts clean text content from HTML pages, prioritizing article content
and removing navigation, headers, footers, and other non-content elements.
"""

import logging
import re
from typing import Optional

logger = logging.getLogger(__name__)


def extract_text_from_html(html: str) -> tuple[str, Optional[str]]:
    """
    Extract clean text and title from HTML.

    Removes scripts, styles, navigation, and other non-content elements.
    Prioritizes article or main content areas.

    Args:
        html: Raw HTML string

    Returns:
        Tuple of (extracted_text, title or None)
    """
    try:
        from bs4 import BeautifulSoup
    except ImportError:
        raise ImportError(
            "BeautifulSoup is required for HTML extraction. "
            "Install with: pip install beautifulsoup4"
        )

    soup = BeautifulSoup(html, "html.parser")

    # Remove unwanted elements
    for tag_name in [
        "script",
        "style",
        "nav",
        "footer",
        "header",
        "aside",
        "noscript",
        "iframe",
        "form",
        "button",
        "input",
        "select",
        "textarea",
    ]:
        for tag in soup.find_all(tag_name):
            tag.decompose()

    # Remove elements with common non-content class/id patterns
    non_content_patterns = [
        "nav",
        "menu",
        "sidebar",
        "footer",
        "header",
        "comment",
        "advertisement",
        "ad-",
        "social",
        "share",
        "related",
        "recommended",
        "popup",
        "modal",
        "cookie",
        "banner",
        "promo",
    ]

    # Collect elements to remove first, then decompose
    # (decomposing while iterating can cause issues)
    elements_to_remove = []

    for element in soup.find_all(class_=True):
        if element.attrs is None:
            continue
        classes = element.get("class", [])
        if classes:
            class_str = " ".join(classes).lower()
            if any(pattern in class_str for pattern in non_content_patterns):
                elements_to_remove.append(element)

    for element in soup.find_all(id=True):
        if element.attrs is None:
            continue
        element_id = element.get("id", "")
        if element_id and any(pattern in element_id.lower() for pattern in non_content_patterns):
            elements_to_remove.append(element)

    for element in elements_to_remove:
        try:
            element.decompose()
        except Exception:
            pass  # Element may already be decomposed

    # Get title
    title = None
    if soup.title and soup.title.string:
        title = soup.title.string.strip()
        # Clean up common title patterns (e.g., "Article Title | Site Name")
        title = re.split(r"\s*[|—\-]\s*", title)[0].strip()

    # Find main content area
    content = None

    # Priority: article > main > [role="main"] > body
    for selector in ["article", "main", "[role='main']", ".content", "#content"]:
        content = soup.select_one(selector)
        if content and len(content.get_text(strip=True)) > 100:
            break

    if not content:
        content = soup.body or soup

    # Extract text using BeautifulSoup's get_text with newline separator
    text = content.get_text(separator="\n", strip=True)

    # Clean up whitespace
    text = _clean_whitespace(text)

    logger.debug(f"Extracted {len(text)} chars from HTML (title: {title})")

    return text, title


def _clean_whitespace(text: str) -> str:
    """
    Clean up whitespace while preserving paragraph structure.

    Args:
        text: Raw extracted text

    Returns:
        Cleaned text
    """
    # Normalize line breaks
    text = re.sub(r"\r\n?", "\n", text)

    # Collapse multiple spaces (but not newlines)
    text = re.sub(r"[^\S\n]+", " ", text)

    # Collapse multiple newlines to max 2
    text = re.sub(r"\n{3,}", "\n\n", text)

    # Remove leading/trailing whitespace from each line
    lines = [line.strip() for line in text.split("\n")]
    text = "\n".join(lines)

    # Remove empty lines at start/end
    text = text.strip()

    return text


def extract_article_content(html: str) -> tuple[str, dict]:
    """
    Extract article content with metadata.

    Attempts to extract structured article data including:
    - Title
    - Author
    - Published date
    - Main content

    Args:
        html: Raw HTML string

    Returns:
        Tuple of (content, metadata dict)
    """
    try:
        from bs4 import BeautifulSoup
    except ImportError:
        raise ImportError(
            "BeautifulSoup is required for HTML extraction. "
            "Install with: pip install beautifulsoup4"
        )

    soup = BeautifulSoup(html, "html.parser")

    metadata = {}

    # Extract title
    title = None
    # Try og:title first
    og_title = soup.find("meta", property="og:title")
    if og_title and og_title.get("content"):
        title = og_title["content"].strip()
    elif soup.title and soup.title.string:
        title = soup.title.string.strip()

    if title:
        metadata["title"] = title

    # Extract author
    author = None
    author_meta = soup.find("meta", attrs={"name": "author"})
    if author_meta and author_meta.get("content"):
        author = author_meta["content"].strip()
    else:
        # Try common author class patterns
        author_elem = soup.select_one(".author, .byline, [rel='author']")
        if author_elem:
            author = author_elem.get_text(strip=True)

    if author:
        metadata["author"] = author

    # Extract published date
    date = None
    date_meta = soup.find("meta", property="article:published_time")
    if date_meta and date_meta.get("content"):
        date = date_meta["content"]
    else:
        date_elem = soup.select_one("time[datetime], .date, .published")
        if date_elem:
            date = date_elem.get("datetime") or date_elem.get_text(strip=True)

    if date:
        metadata["published_date"] = date

    # Extract description
    description = None
    desc_meta = soup.find("meta", attrs={"name": "description"})
    if desc_meta and desc_meta.get("content"):
        description = desc_meta["content"].strip()

    if description:
        metadata["description"] = description

    # Extract main content
    content, extracted_title = extract_text_from_html(html)

    # Use extracted title if we didn't find one
    if not title and extracted_title:
        metadata["title"] = extracted_title

    return content, metadata
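
A short sketch of how these helpers might be called. The module path is assumed from the file listing (statement_extractor/document/html_extractor.py), and the inline HTML snippet is illustrative only; BeautifulSoup (beautifulsoup4) must be installed, as the functions themselves note.

from statement_extractor.document.html_extractor import (
    extract_article_content,
    extract_text_from_html,
)

html = (
    "<html><head><title>Acme Corp | Newsroom</title></head>"
    "<body><article><p>Acme Corp announced a new subsidiary today.</p></article></body></html>"
)

# Plain text plus title; the "| Site Name" suffix is stripped from the title.
text, title = extract_text_from_html(html)
print(title)  # "Acme Corp"

# Text plus whatever metadata could be found (title/author/published_date/description).
content, meta = extract_article_content(html)
print(meta)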

statement_extractor/document/loader.py (new file)
@@ -0,0 +1,303 @@
"""
URL loader for fetching and parsing web content.

Orchestrates scraper and PDF parser plugins to load documents from URLs.
"""

import asyncio
import logging
from typing import Optional

from pydantic import BaseModel, Field

from ..models.document import Document
from ..pipeline.registry import PluginRegistry
from ..plugins.base import (
    BaseScraperPlugin,
    BasePDFParserPlugin,
    ContentType,
    ScraperResult,
)
from .html_extractor import extract_text_from_html, extract_article_content

logger = logging.getLogger(__name__)


class URLLoaderConfig(BaseModel):
    """Configuration for URL loading."""

    timeout: float = Field(
        default=30.0,
        description="Request timeout in seconds"
    )
    use_ocr: bool = Field(
        default=False,
        description="Force OCR for PDF parsing"
    )
    max_pdf_pages: int = Field(
        default=500,
        description="Maximum pages to extract from PDFs"
    )
    scraper_plugin: Optional[str] = Field(
        default=None,
        description="Specific scraper plugin to use (None = auto-select)"
    )
    pdf_parser_plugin: Optional[str] = Field(
        default=None,
        description="Specific PDF parser plugin to use (None = auto-select)"
    )
    extract_metadata: bool = Field(
        default=True,
        description="Extract article metadata from HTML pages"
    )


class URLLoader:
    """
    Loads documents from URLs using scraper and PDF parser plugins.

    Orchestrates the content acquisition process:
    1. Fetch content using a scraper plugin
    2. Detect content type (HTML vs PDF)
    3. Parse content using appropriate parser
    4. Create a Document object

    Example:
        >>> loader = URLLoader()
        >>> document = await loader.load("https://example.com/article")
        >>> print(document.title)

        >>> # Synchronous usage
        >>> document = loader.load_sync("https://example.com/report.pdf")
    """

    def __init__(self, config: Optional[URLLoaderConfig] = None):
        """
        Initialize the URL loader.

        Args:
            config: Loader configuration
        """
        self.config = config or URLLoaderConfig()
        self._scraper: Optional[BaseScraperPlugin] = None
        self._pdf_parser: Optional[BasePDFParserPlugin] = None

    def _get_scraper(self) -> BaseScraperPlugin:
        """Get the scraper plugin to use."""
        if self._scraper is not None:
            return self._scraper

        scrapers = PluginRegistry.get_scrapers()
        if not scrapers:
            raise RuntimeError(
                "No scraper plugins registered. "
                "Ensure plugins are loaded via 'from statement_extractor import plugins'"
            )

        if self.config.scraper_plugin:
            for scraper in scrapers:
                if scraper.name == self.config.scraper_plugin:
                    self._scraper = scraper
                    return scraper
            raise ValueError(f"Scraper plugin not found: {self.config.scraper_plugin}")

        # Use first available (highest priority)
        self._scraper = scrapers[0]
        return self._scraper

    def _get_pdf_parser(self) -> BasePDFParserPlugin:
        """Get the PDF parser plugin to use."""
        if self._pdf_parser is not None:
            return self._pdf_parser

        parsers = PluginRegistry.get_pdf_parsers()
        if not parsers:
            raise RuntimeError(
                "No PDF parser plugins registered. "
                "Ensure plugins are loaded via 'from statement_extractor import plugins'"
            )

        if self.config.pdf_parser_plugin:
            for parser in parsers:
                if parser.name == self.config.pdf_parser_plugin:
                    self._pdf_parser = parser
                    return parser
            raise ValueError(f"PDF parser plugin not found: {self.config.pdf_parser_plugin}")

        # Use first available (highest priority)
        self._pdf_parser = parsers[0]
        return self._pdf_parser

    async def load(self, url: str) -> Document:
        """
        Load a URL and return a Document.

        Args:
            url: URL to load

        Returns:
            Document with extracted content

        Raises:
            ValueError: If URL cannot be fetched or parsed
        """
        logger.info(f"Loading URL: {url}")

        # 1. Fetch content
        scraper = self._get_scraper()
        result = await scraper.fetch(url, self.config.timeout)

        if not result.ok:
            raise ValueError(f"Failed to fetch {url}: {result.error}")

        logger.debug(f"Fetched {len(result.content)} bytes, type: {result.content_type}")

        # 2. Process based on content type
        if result.content_type == ContentType.PDF:
            return self._process_pdf(result)
        elif result.content_type == ContentType.HTML:
            return self._process_html(result)
        else:
            # Try to guess based on content
            if result.content[:5] == b"%PDF-":
                return self._process_pdf(result)
            # Default to HTML
            return self._process_html(result)

    def load_sync(self, url: str) -> Document:
        """
        Synchronous wrapper for load().

        Args:
            url: URL to load

        Returns:
            Document with extracted content
        """
        return asyncio.run(self.load(url))

    def _process_pdf(self, result: ScraperResult) -> Document:
        """
        Convert PDF to Document with pages.

        Args:
            result: ScraperResult containing PDF bytes

        Returns:
            Document with PDF content
        """
        logger.info(f"Processing PDF from {result.final_url}")

        parser = self._get_pdf_parser()
        parse_result = parser.parse(
            result.content,
            max_pages=self.config.max_pdf_pages,
            use_ocr=self.config.use_ocr,
        )

        if not parse_result.ok:
            raise ValueError(f"Failed to parse PDF: {parse_result.error}")

        logger.info(f"Extracted {len(parse_result.pages)} pages from PDF")

        # Create Document from pages
        kwargs = {
            "pages": parse_result.pages,
            "title": parse_result.metadata.get("title"),
            "source_type": "pdf",
            "url": result.final_url,
        }
        author = parse_result.metadata.get("author")
        if author:
            kwargs["authors"] = [author]

        return Document.from_pages(**kwargs)

    def _process_html(self, result: ScraperResult) -> Document:
        """
        Convert HTML to Document (single page).

        Args:
            result: ScraperResult containing HTML bytes

        Returns:
            Document with HTML content
        """
        logger.info(f"Processing HTML from {result.final_url}")

        # Decode HTML
        try:
            html = result.content.decode("utf-8", errors="replace")
        except Exception as e:
            raise ValueError(f"Failed to decode HTML: {e}")

        # Extract text and metadata
        if self.config.extract_metadata:
            text, metadata = extract_article_content(html)
            title = metadata.get("title")
            author = metadata.get("author")
            # Log extracted metadata
            logger.debug(f"Extracted metadata: {metadata}")
        else:
            text, title = extract_text_from_html(html)
            author = None
            metadata = {}

        if not text or len(text.strip()) < 50:
            raise ValueError("No meaningful content extracted from HTML")

        logger.info(f"Extracted {len(text)} chars from HTML")
        if title:
            logger.info(f" Title: {title}")
        if author:
            logger.info(f" Author: {author}")
        if metadata.get("published_date"):
            logger.info(f" Published: {metadata.get('published_date')}")

        # Create Document using from_pages since from_text forces source_type="text"
        kwargs = {
            "pages": [text],
            "title": title,
            "source_type": "webpage",
            "url": result.final_url,
        }
        if author:
            kwargs["authors"] = [author]

        return Document.from_pages(**kwargs)


async def load_url(
    url: str,
    config: Optional[URLLoaderConfig] = None,
) -> Document:
    """
    Convenience function to load a URL.

    Args:
        url: URL to load
        config: Optional loader configuration

    Returns:
        Document with extracted content
    """
    loader = URLLoader(config)
    return await loader.load(url)


def load_url_sync(
    url: str,
    config: Optional[URLLoaderConfig] = None,
) -> Document:
    """
    Convenience function to load a URL synchronously.

    Args:
        url: URL to load
        config: Optional loader configuration

    Returns:
        Document with extracted content
    """
    loader = URLLoader(config)
    return loader.load_sync(url)
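
A minimal sketch of the loader entry points, adapted from the class docstring above. The module path is assumed from the file listing (statement_extractor/document/loader.py); importing statement_extractor.plugins to register scraper and PDF parser plugins follows the hint in the RuntimeError messages, and the URLs are placeholders.

import asyncio

from statement_extractor import plugins  # noqa: F401  (registers scraper/PDF parser plugins)
from statement_extractor.document.loader import URLLoader, URLLoaderConfig, load_url_sync

# One-off synchronous fetch using defaults.
doc = load_url_sync("https://example.com/article")
print(doc.title)

# Reusable loader with explicit settings, driven asynchronously.
config = URLLoaderConfig(timeout=60.0, max_pdf_pages=100, use_ocr=False)
loader = URLLoader(config)
pdf_doc = asyncio.run(loader.load("https://example.com/report.pdf"))

The loader picks the highest-priority registered scraper and PDF parser unless scraper_plugin or pdf_parser_plugin is set in the config, and falls back to sniffing the %PDF- magic bytes when the scraper cannot determine the content type.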