corp-extractor 0.5.0-py3-none-any.whl → 0.9.0-py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as published in their public registries.
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA +191 -24
- corp_extractor-0.9.0.dist-info/RECORD +76 -0
- statement_extractor/__init__.py +1 -1
- statement_extractor/cli.py +1227 -10
- statement_extractor/data/statement_taxonomy.json +6949 -1159
- statement_extractor/database/__init__.py +52 -0
- statement_extractor/database/embeddings.py +186 -0
- statement_extractor/database/hub.py +520 -0
- statement_extractor/database/importers/__init__.py +24 -0
- statement_extractor/database/importers/companies_house.py +545 -0
- statement_extractor/database/importers/gleif.py +538 -0
- statement_extractor/database/importers/sec_edgar.py +375 -0
- statement_extractor/database/importers/wikidata.py +1012 -0
- statement_extractor/database/importers/wikidata_people.py +632 -0
- statement_extractor/database/models.py +230 -0
- statement_extractor/database/resolver.py +245 -0
- statement_extractor/database/store.py +1609 -0
- statement_extractor/document/__init__.py +62 -0
- statement_extractor/document/chunker.py +410 -0
- statement_extractor/document/context.py +171 -0
- statement_extractor/document/deduplicator.py +173 -0
- statement_extractor/document/html_extractor.py +246 -0
- statement_extractor/document/loader.py +303 -0
- statement_extractor/document/pipeline.py +388 -0
- statement_extractor/document/summarizer.py +195 -0
- statement_extractor/models/__init__.py +16 -1
- statement_extractor/models/canonical.py +44 -1
- statement_extractor/models/document.py +308 -0
- statement_extractor/models/labels.py +47 -18
- statement_extractor/models/qualifiers.py +51 -3
- statement_extractor/models/statement.py +26 -0
- statement_extractor/pipeline/config.py +6 -11
- statement_extractor/pipeline/orchestrator.py +80 -111
- statement_extractor/pipeline/registry.py +52 -46
- statement_extractor/plugins/__init__.py +20 -8
- statement_extractor/plugins/base.py +334 -64
- statement_extractor/plugins/extractors/gliner2.py +10 -0
- statement_extractor/plugins/labelers/taxonomy.py +18 -5
- statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
- statement_extractor/plugins/pdf/__init__.py +10 -0
- statement_extractor/plugins/pdf/pypdf.py +291 -0
- statement_extractor/plugins/qualifiers/__init__.py +11 -0
- statement_extractor/plugins/qualifiers/companies_house.py +14 -3
- statement_extractor/plugins/qualifiers/embedding_company.py +420 -0
- statement_extractor/plugins/qualifiers/gleif.py +14 -3
- statement_extractor/plugins/qualifiers/person.py +578 -14
- statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
- statement_extractor/plugins/scrapers/__init__.py +10 -0
- statement_extractor/plugins/scrapers/http.py +236 -0
- statement_extractor/plugins/splitters/t5_gemma.py +158 -53
- statement_extractor/plugins/taxonomy/embedding.py +193 -46
- statement_extractor/plugins/taxonomy/mnli.py +16 -4
- statement_extractor/scoring.py +8 -8
- corp_extractor-0.5.0.dist-info/RECORD +0 -55
- statement_extractor/plugins/canonicalizers/__init__.py +0 -17
- statement_extractor/plugins/canonicalizers/base.py +0 -9
- statement_extractor/plugins/canonicalizers/location.py +0 -219
- statement_extractor/plugins/canonicalizers/organization.py +0 -230
- statement_extractor/plugins/canonicalizers/person.py +0 -242
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/WHEEL +0 -0
- {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/entry_points.txt +0 -0
statement_extractor/document/deduplicator.py (new file)

@@ -0,0 +1,173 @@
+"""
+StatementDeduplicator - Hash-based deduplication for statements.
+
+Removes duplicate statements across chunks using normalized hashing.
+"""
+
+import hashlib
+import logging
+from typing import TypeVar, Union
+
+from ..models.labels import LabeledStatement
+from ..models.statement import PipelineStatement, RawTriple
+
+logger = logging.getLogger(__name__)
+
+# Type variable for generic deduplication
+T = TypeVar("T", RawTriple, PipelineStatement, LabeledStatement)
+
+
+class StatementDeduplicator:
+    """
+    Deduplicates statements using normalized hash comparison.
+
+    Uses a hash of normalized (subject, predicate, object) to identify
+    duplicates. Keeps the first occurrence of each unique statement.
+    """
+
+    def __init__(self):
+        """Initialize the deduplicator."""
+        self._seen_hashes: set[str] = set()
+
+    def reset(self) -> None:
+        """Reset the deduplicator state, clearing all seen hashes."""
+        self._seen_hashes.clear()
+        logger.debug("Deduplicator state reset")
+
+    def _normalize_text(self, text: str) -> str:
+        """
+        Normalize text for comparison.
+
+        - Lowercase
+        - Strip whitespace
+        - Collapse multiple spaces
+        """
+        return " ".join(text.lower().strip().split())
+
+    def _get_triple_parts(
+        self,
+        stmt: Union[RawTriple, PipelineStatement, LabeledStatement],
+    ) -> tuple[str, str, str]:
+        """
+        Extract (subject, predicate, object) from a statement.
+
+        Handles different statement types consistently.
+        """
+        if isinstance(stmt, RawTriple):
+            return (
+                stmt.subject_text,
+                stmt.predicate_text,
+                stmt.object_text,
+            )
+        elif isinstance(stmt, LabeledStatement):
+            return (
+                stmt.statement.subject.text,
+                stmt.statement.predicate,
+                stmt.statement.object.text,
+            )
+        else:
+            # PipelineStatement
+            return (
+                stmt.subject.text,
+                stmt.predicate,
+                stmt.object.text,
+            )
+
+    def _hash_triple(
+        self,
+        stmt: Union[RawTriple, PipelineStatement, LabeledStatement],
+    ) -> str:
+        """
+        Generate a hash for a statement triple.
+
+        Uses normalized text to catch near-duplicates with different
+        casing or whitespace.
+        """
+        subj, pred, obj = self._get_triple_parts(stmt)
+
+        key = (
+            self._normalize_text(subj),
+            self._normalize_text(pred),
+            self._normalize_text(obj),
+        )
+
+        # Use sha256 and truncate to 16 chars for reasonable uniqueness
+        return hashlib.sha256(str(key).encode()).hexdigest()[:16]
+
+    def is_duplicate(
+        self,
+        stmt: Union[RawTriple, PipelineStatement, LabeledStatement],
+    ) -> bool:
+        """
+        Check if a statement is a duplicate.
+
+        Also marks the statement as seen if it's not a duplicate.
+
+        Args:
+            stmt: Statement to check
+
+        Returns:
+            True if this is a duplicate of a previously seen statement
+        """
+        hash_value = self._hash_triple(stmt)
+
+        if hash_value in self._seen_hashes:
+            return True
+
+        self._seen_hashes.add(hash_value)
+        return False
+
+    def filter_duplicates(self, statements: list[T]) -> list[T]:
+        """
+        Filter out duplicate statements from a list.
+
+        Preserves order and keeps the first occurrence of each unique statement.
+
+        Args:
+            statements: List of statements to deduplicate
+
+        Returns:
+            List with duplicates removed
+        """
+        if not statements:
+            return []
+
+        original_count = len(statements)
+        result = []
+
+        for stmt in statements:
+            if not self.is_duplicate(stmt):
+                result.append(stmt)
+
+        removed = original_count - len(result)
+        if removed > 0:
+            logger.info(f"Deduplication removed {removed} statements ({len(result)} remaining)")
+
+        return result
+
+    def deduplicate_batch(
+        self,
+        statements: list[T],
+        reset_first: bool = True,
+    ) -> list[T]:
+        """
+        Deduplicate a batch of statements.
+
+        Optionally resets state before processing to ensure clean deduplication.
+
+        Args:
+            statements: List of statements to deduplicate
+            reset_first: Whether to reset seen hashes before processing
+
+        Returns:
+            Deduplicated list of statements
+        """
+        if reset_first:
+            self.reset()
+
+        return self.filter_duplicates(statements)
+
+    @property
+    def seen_count(self) -> int:
+        """Get the number of unique statements seen."""
+        return len(self._seen_hashes)
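The deduplicator keys each statement by a truncated SHA-256 of the normalized (subject, predicate, object) triple, so re-extracted triples that differ only in casing or whitespace collapse to a single entry. A minimal usage sketch follows; it is illustrative only and assumes RawTriple can be constructed directly with subject_text/predicate_text/object_text keyword arguments, which the diff itself does not show.

# Sketch, not part of the package: assumes RawTriple accepts these kwargs.
from statement_extractor.document.deduplicator import StatementDeduplicator
from statement_extractor.models.statement import RawTriple

triples = [
    RawTriple(subject_text="Acme Corp", predicate_text="acquired", object_text="Widget Ltd"),
    RawTriple(subject_text="ACME CORP ", predicate_text="acquired", object_text="widget ltd"),
]

dedup = StatementDeduplicator()
unique = dedup.deduplicate_batch(triples)  # resets state, then filters in order
assert len(unique) == 1                    # second triple hashes identically after normalization
print(dedup.seen_count)                    # 1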
statement_extractor/document/html_extractor.py (new file)

@@ -0,0 +1,246 @@
+"""
+HTML text extraction utilities.
+
+Extracts clean text content from HTML pages, prioritizing article content
+and removing navigation, headers, footers, and other non-content elements.
+"""
+
+import logging
+import re
+from typing import Optional
+
+logger = logging.getLogger(__name__)
+
+
+def extract_text_from_html(html: str) -> tuple[str, Optional[str]]:
+    """
+    Extract clean text and title from HTML.
+
+    Removes scripts, styles, navigation, and other non-content elements.
+    Prioritizes article or main content areas.
+
+    Args:
+        html: Raw HTML string
+
+    Returns:
+        Tuple of (extracted_text, title or None)
+    """
+    try:
+        from bs4 import BeautifulSoup
+    except ImportError:
+        raise ImportError(
+            "BeautifulSoup is required for HTML extraction. "
+            "Install with: pip install beautifulsoup4"
+        )
+
+    soup = BeautifulSoup(html, "html.parser")
+
+    # Remove unwanted elements
+    for tag_name in [
+        "script",
+        "style",
+        "nav",
+        "footer",
+        "header",
+        "aside",
+        "noscript",
+        "iframe",
+        "form",
+        "button",
+        "input",
+        "select",
+        "textarea",
+    ]:
+        for tag in soup.find_all(tag_name):
+            tag.decompose()
+
+    # Remove elements with common non-content class/id patterns
+    non_content_patterns = [
+        "nav",
+        "menu",
+        "sidebar",
+        "footer",
+        "header",
+        "comment",
+        "advertisement",
+        "ad-",
+        "social",
+        "share",
+        "related",
+        "recommended",
+        "popup",
+        "modal",
+        "cookie",
+        "banner",
+        "promo",
+    ]
+
+    # Collect elements to remove first, then decompose
+    # (decomposing while iterating can cause issues)
+    elements_to_remove = []
+
+    for element in soup.find_all(class_=True):
+        if element.attrs is None:
+            continue
+        classes = element.get("class", [])
+        if classes:
+            class_str = " ".join(classes).lower()
+            if any(pattern in class_str for pattern in non_content_patterns):
+                elements_to_remove.append(element)
+
+    for element in soup.find_all(id=True):
+        if element.attrs is None:
+            continue
+        element_id = element.get("id", "")
+        if element_id and any(pattern in element_id.lower() for pattern in non_content_patterns):
+            elements_to_remove.append(element)
+
+    for element in elements_to_remove:
+        try:
+            element.decompose()
+        except Exception:
+            pass  # Element may already be decomposed
+
+    # Get title
+    title = None
+    if soup.title and soup.title.string:
+        title = soup.title.string.strip()
+        # Clean up common title patterns (e.g., "Article Title | Site Name")
+        title = re.split(r"\s*[|—\-]\s*", title)[0].strip()
+
+    # Find main content area
+    content = None
+
+    # Priority: article > main > [role="main"] > body
+    for selector in ["article", "main", "[role='main']", ".content", "#content"]:
+        content = soup.select_one(selector)
+        if content and len(content.get_text(strip=True)) > 100:
+            break
+
+    if not content:
+        content = soup.body or soup
+
+    # Extract text using BeautifulSoup's get_text with newline separator
+    text = content.get_text(separator="\n", strip=True)
+
+    # Clean up whitespace
+    text = _clean_whitespace(text)
+
+    logger.debug(f"Extracted {len(text)} chars from HTML (title: {title})")
+
+    return text, title
+
+
+def _clean_whitespace(text: str) -> str:
+    """
+    Clean up whitespace while preserving paragraph structure.
+
+    Args:
+        text: Raw extracted text
+
+    Returns:
+        Cleaned text
+    """
+    # Normalize line breaks
+    text = re.sub(r"\r\n?", "\n", text)
+
+    # Collapse multiple spaces (but not newlines)
+    text = re.sub(r"[^\S\n]+", " ", text)
+
+    # Collapse multiple newlines to max 2
+    text = re.sub(r"\n{3,}", "\n\n", text)
+
+    # Remove leading/trailing whitespace from each line
+    lines = [line.strip() for line in text.split("\n")]
+    text = "\n".join(lines)
+
+    # Remove empty lines at start/end
+    text = text.strip()
+
+    return text
+
+
+def extract_article_content(html: str) -> tuple[str, dict]:
+    """
+    Extract article content with metadata.
+
+    Attempts to extract structured article data including:
+    - Title
+    - Author
+    - Published date
+    - Main content
+
+    Args:
+        html: Raw HTML string
+
+    Returns:
+        Tuple of (content, metadata dict)
+    """
+    try:
+        from bs4 import BeautifulSoup
+    except ImportError:
+        raise ImportError(
+            "BeautifulSoup is required for HTML extraction. "
+            "Install with: pip install beautifulsoup4"
+        )
+
+    soup = BeautifulSoup(html, "html.parser")
+
+    metadata = {}
+
+    # Extract title
+    title = None
+    # Try og:title first
+    og_title = soup.find("meta", property="og:title")
+    if og_title and og_title.get("content"):
+        title = og_title["content"].strip()
+    elif soup.title and soup.title.string:
+        title = soup.title.string.strip()
+
+    if title:
+        metadata["title"] = title
+
+    # Extract author
+    author = None
+    author_meta = soup.find("meta", attrs={"name": "author"})
+    if author_meta and author_meta.get("content"):
+        author = author_meta["content"].strip()
+    else:
+        # Try common author class patterns
+        author_elem = soup.select_one(".author, .byline, [rel='author']")
+        if author_elem:
+            author = author_elem.get_text(strip=True)
+
+    if author:
+        metadata["author"] = author
+
+    # Extract published date
+    date = None
+    date_meta = soup.find("meta", property="article:published_time")
+    if date_meta and date_meta.get("content"):
+        date = date_meta["content"]
+    else:
+        date_elem = soup.select_one("time[datetime], .date, .published")
+        if date_elem:
+            date = date_elem.get("datetime") or date_elem.get_text(strip=True)
+
+    if date:
+        metadata["published_date"] = date
+
+    # Extract description
+    description = None
+    desc_meta = soup.find("meta", attrs={"name": "description"})
+    if desc_meta and desc_meta.get("content"):
+        description = desc_meta["content"].strip()
+
+    if description:
+        metadata["description"] = description
+
+    # Extract main content
+    content, extracted_title = extract_text_from_html(html)
+
+    # Use extracted title if we didn't find one
+    if not title and extracted_title:
+        metadata["title"] = extracted_title
+
+    return content, metadata
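extract_article_content() layers metadata extraction (og:title, author and description meta tags, article:published_time, common byline/date selectors) on top of extract_text_from_html(), which strips scripts, navigation, and other boilerplate before pulling text from the main content area. A rough usage sketch follows; it is illustrative only: the URL is a placeholder and fetching is done here with the requests library, whereas the package itself presumably retrieves pages through its own scraper plugin (plugins/scrapers/http.py).

# Sketch, not part of the package: placeholder URL, requests used only for illustration.
import requests

from statement_extractor.document.html_extractor import extract_article_content

html = requests.get("https://example.com/news/acme-acquires-widget", timeout=30).text
content, metadata = extract_article_content(html)

print(metadata.get("title"))           # og:title or <title>, with "| Site Name" suffixes trimmed
print(metadata.get("author"))          # meta author tag or .author/.byline element, if present
print(metadata.get("published_date"))  # article:published_time or <time datetime=...>
print(content[:300])                   # cleaned article text, nav/footer/ads stripped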