corp-extractor 0.5.0-py3-none-any.whl → 0.9.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA +191 -24
  2. corp_extractor-0.9.0.dist-info/RECORD +76 -0
  3. statement_extractor/__init__.py +1 -1
  4. statement_extractor/cli.py +1227 -10
  5. statement_extractor/data/statement_taxonomy.json +6949 -1159
  6. statement_extractor/database/__init__.py +52 -0
  7. statement_extractor/database/embeddings.py +186 -0
  8. statement_extractor/database/hub.py +520 -0
  9. statement_extractor/database/importers/__init__.py +24 -0
  10. statement_extractor/database/importers/companies_house.py +545 -0
  11. statement_extractor/database/importers/gleif.py +538 -0
  12. statement_extractor/database/importers/sec_edgar.py +375 -0
  13. statement_extractor/database/importers/wikidata.py +1012 -0
  14. statement_extractor/database/importers/wikidata_people.py +632 -0
  15. statement_extractor/database/models.py +230 -0
  16. statement_extractor/database/resolver.py +245 -0
  17. statement_extractor/database/store.py +1609 -0
  18. statement_extractor/document/__init__.py +62 -0
  19. statement_extractor/document/chunker.py +410 -0
  20. statement_extractor/document/context.py +171 -0
  21. statement_extractor/document/deduplicator.py +173 -0
  22. statement_extractor/document/html_extractor.py +246 -0
  23. statement_extractor/document/loader.py +303 -0
  24. statement_extractor/document/pipeline.py +388 -0
  25. statement_extractor/document/summarizer.py +195 -0
  26. statement_extractor/models/__init__.py +16 -1
  27. statement_extractor/models/canonical.py +44 -1
  28. statement_extractor/models/document.py +308 -0
  29. statement_extractor/models/labels.py +47 -18
  30. statement_extractor/models/qualifiers.py +51 -3
  31. statement_extractor/models/statement.py +26 -0
  32. statement_extractor/pipeline/config.py +6 -11
  33. statement_extractor/pipeline/orchestrator.py +80 -111
  34. statement_extractor/pipeline/registry.py +52 -46
  35. statement_extractor/plugins/__init__.py +20 -8
  36. statement_extractor/plugins/base.py +334 -64
  37. statement_extractor/plugins/extractors/gliner2.py +10 -0
  38. statement_extractor/plugins/labelers/taxonomy.py +18 -5
  39. statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
  40. statement_extractor/plugins/pdf/__init__.py +10 -0
  41. statement_extractor/plugins/pdf/pypdf.py +291 -0
  42. statement_extractor/plugins/qualifiers/__init__.py +11 -0
  43. statement_extractor/plugins/qualifiers/companies_house.py +14 -3
  44. statement_extractor/plugins/qualifiers/embedding_company.py +420 -0
  45. statement_extractor/plugins/qualifiers/gleif.py +14 -3
  46. statement_extractor/plugins/qualifiers/person.py +578 -14
  47. statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
  48. statement_extractor/plugins/scrapers/__init__.py +10 -0
  49. statement_extractor/plugins/scrapers/http.py +236 -0
  50. statement_extractor/plugins/splitters/t5_gemma.py +158 -53
  51. statement_extractor/plugins/taxonomy/embedding.py +193 -46
  52. statement_extractor/plugins/taxonomy/mnli.py +16 -4
  53. statement_extractor/scoring.py +8 -8
  54. corp_extractor-0.5.0.dist-info/RECORD +0 -55
  55. statement_extractor/plugins/canonicalizers/__init__.py +0 -17
  56. statement_extractor/plugins/canonicalizers/base.py +0 -9
  57. statement_extractor/plugins/canonicalizers/location.py +0 -219
  58. statement_extractor/plugins/canonicalizers/organization.py +0 -230
  59. statement_extractor/plugins/canonicalizers/person.py +0 -242
  60. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/WHEEL +0 -0
  61. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.0.dist-info}/entry_points.txt +0 -0
statement_extractor/document/deduplicator.py
@@ -0,0 +1,173 @@
+ """
+ StatementDeduplicator - Hash-based deduplication for statements.
+
+ Removes duplicate statements across chunks using normalized hashing.
+ """
+
+ import hashlib
+ import logging
+ from typing import TypeVar, Union
+
+ from ..models.labels import LabeledStatement
+ from ..models.statement import PipelineStatement, RawTriple
+
+ logger = logging.getLogger(__name__)
+
+ # Type variable for generic deduplication
+ T = TypeVar("T", RawTriple, PipelineStatement, LabeledStatement)
+
+
+ class StatementDeduplicator:
+     """
+     Deduplicates statements using normalized hash comparison.
+
+     Uses a hash of normalized (subject, predicate, object) to identify
+     duplicates. Keeps the first occurrence of each unique statement.
+     """
+
+     def __init__(self):
+         """Initialize the deduplicator."""
+         self._seen_hashes: set[str] = set()
+
+     def reset(self) -> None:
+         """Reset the deduplicator state, clearing all seen hashes."""
+         self._seen_hashes.clear()
+         logger.debug("Deduplicator state reset")
+
+     def _normalize_text(self, text: str) -> str:
+         """
+         Normalize text for comparison.
+
+         - Lowercase
+         - Strip whitespace
+         - Collapse multiple spaces
+         """
+         return " ".join(text.lower().strip().split())
+
+     def _get_triple_parts(
+         self,
+         stmt: Union[RawTriple, PipelineStatement, LabeledStatement],
+     ) -> tuple[str, str, str]:
+         """
+         Extract (subject, predicate, object) from a statement.
+
+         Handles different statement types consistently.
+         """
+         if isinstance(stmt, RawTriple):
+             return (
+                 stmt.subject_text,
+                 stmt.predicate_text,
+                 stmt.object_text,
+             )
+         elif isinstance(stmt, LabeledStatement):
+             return (
+                 stmt.statement.subject.text,
+                 stmt.statement.predicate,
+                 stmt.statement.object.text,
+             )
+         else:
+             # PipelineStatement
+             return (
+                 stmt.subject.text,
+                 stmt.predicate,
+                 stmt.object.text,
+             )
+
+     def _hash_triple(
+         self,
+         stmt: Union[RawTriple, PipelineStatement, LabeledStatement],
+     ) -> str:
+         """
+         Generate a hash for a statement triple.
+
+         Uses normalized text to catch near-duplicates with different
+         casing or whitespace.
+         """
+         subj, pred, obj = self._get_triple_parts(stmt)
+
+         key = (
+             self._normalize_text(subj),
+             self._normalize_text(pred),
+             self._normalize_text(obj),
+         )
+
+         # Use sha256 and truncate to 16 chars for reasonable uniqueness
+         return hashlib.sha256(str(key).encode()).hexdigest()[:16]
+
+     def is_duplicate(
+         self,
+         stmt: Union[RawTriple, PipelineStatement, LabeledStatement],
+     ) -> bool:
+         """
+         Check if a statement is a duplicate.
+
+         Also marks the statement as seen if it's not a duplicate.
+
+         Args:
+             stmt: Statement to check
+
+         Returns:
+             True if this is a duplicate of a previously seen statement
+         """
+         hash_value = self._hash_triple(stmt)
+
+         if hash_value in self._seen_hashes:
+             return True
+
+         self._seen_hashes.add(hash_value)
+         return False
+
+     def filter_duplicates(self, statements: list[T]) -> list[T]:
+         """
+         Filter out duplicate statements from a list.
+
+         Preserves order and keeps the first occurrence of each unique statement.
+
+         Args:
+             statements: List of statements to deduplicate
+
+         Returns:
+             List with duplicates removed
+         """
+         if not statements:
+             return []
+
+         original_count = len(statements)
+         result = []
+
+         for stmt in statements:
+             if not self.is_duplicate(stmt):
+                 result.append(stmt)
+
+         removed = original_count - len(result)
+         if removed > 0:
+             logger.info(f"Deduplication removed {removed} statements ({len(result)} remaining)")
+
+         return result
+
+     def deduplicate_batch(
+         self,
+         statements: list[T],
+         reset_first: bool = True,
+     ) -> list[T]:
+         """
+         Deduplicate a batch of statements.
+
+         Optionally resets state before processing to ensure clean deduplication.
+
+         Args:
+             statements: List of statements to deduplicate
+             reset_first: Whether to reset seen hashes before processing
+
+         Returns:
+             Deduplicated list of statements
+         """
+         if reset_first:
+             self.reset()
+
+         return self.filter_duplicates(statements)
+
+     @property
+     def seen_count(self) -> int:
+         """Get the number of unique statements seen."""
+         return len(self._seen_hashes)
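For orientation, a minimal usage sketch of the new deduplicator. This is an illustration only, not code from the package: it assumes RawTriple can be constructed directly from the three text fields shown above, and the example values are made up.

    from statement_extractor.document.deduplicator import StatementDeduplicator
    from statement_extractor.models.statement import RawTriple

    # Two hypothetical triples that differ only in casing and whitespace,
    # so their normalized hashes collide and the second one is dropped.
    triples = [
        RawTriple(subject_text="Acme Corp", predicate_text="acquired", object_text="Widget Ltd"),
        RawTriple(subject_text="ACME  CORP", predicate_text="Acquired", object_text="widget ltd"),
    ]

    dedup = StatementDeduplicator()
    unique = dedup.deduplicate_batch(triples)  # reset_first=True clears any prior state
    print(len(unique), dedup.seen_count)       # expected: 1 1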
statement_extractor/document/html_extractor.py
@@ -0,0 +1,246 @@
+ """
+ HTML text extraction utilities.
+
+ Extracts clean text content from HTML pages, prioritizing article content
+ and removing navigation, headers, footers, and other non-content elements.
+ """
+
+ import logging
+ import re
+ from typing import Optional
+
+ logger = logging.getLogger(__name__)
+
+
+ def extract_text_from_html(html: str) -> tuple[str, Optional[str]]:
+     """
+     Extract clean text and title from HTML.
+
+     Removes scripts, styles, navigation, and other non-content elements.
+     Prioritizes article or main content areas.
+
+     Args:
+         html: Raw HTML string
+
+     Returns:
+         Tuple of (extracted_text, title or None)
+     """
+     try:
+         from bs4 import BeautifulSoup
+     except ImportError:
+         raise ImportError(
+             "BeautifulSoup is required for HTML extraction. "
+             "Install with: pip install beautifulsoup4"
+         )
+
+     soup = BeautifulSoup(html, "html.parser")
+
+     # Remove unwanted elements
+     for tag_name in [
+         "script",
+         "style",
+         "nav",
+         "footer",
+         "header",
+         "aside",
+         "noscript",
+         "iframe",
+         "form",
+         "button",
+         "input",
+         "select",
+         "textarea",
+     ]:
+         for tag in soup.find_all(tag_name):
+             tag.decompose()
+
+     # Remove elements with common non-content class/id patterns
+     non_content_patterns = [
+         "nav",
+         "menu",
+         "sidebar",
+         "footer",
+         "header",
+         "comment",
+         "advertisement",
+         "ad-",
+         "social",
+         "share",
+         "related",
+         "recommended",
+         "popup",
+         "modal",
+         "cookie",
+         "banner",
+         "promo",
+     ]
+
+     # Collect elements to remove first, then decompose
+     # (decomposing while iterating can cause issues)
+     elements_to_remove = []
+
+     for element in soup.find_all(class_=True):
+         if element.attrs is None:
+             continue
+         classes = element.get("class", [])
+         if classes:
+             class_str = " ".join(classes).lower()
+             if any(pattern in class_str for pattern in non_content_patterns):
+                 elements_to_remove.append(element)
+
+     for element in soup.find_all(id=True):
+         if element.attrs is None:
+             continue
+         element_id = element.get("id", "")
+         if element_id and any(pattern in element_id.lower() for pattern in non_content_patterns):
+             elements_to_remove.append(element)
+
+     for element in elements_to_remove:
+         try:
+             element.decompose()
+         except Exception:
+             pass  # Element may already be decomposed
+
+     # Get title
+     title = None
+     if soup.title and soup.title.string:
+         title = soup.title.string.strip()
+         # Clean up common title patterns (e.g., "Article Title | Site Name")
+         title = re.split(r"\s*[|—\-]\s*", title)[0].strip()
+
+     # Find main content area
+     content = None
+
+     # Priority: article > main > [role="main"] > body
+     for selector in ["article", "main", "[role='main']", ".content", "#content"]:
+         content = soup.select_one(selector)
+         if content and len(content.get_text(strip=True)) > 100:
+             break
+
+     if not content:
+         content = soup.body or soup
+
+     # Extract text using BeautifulSoup's get_text with newline separator
+     text = content.get_text(separator="\n", strip=True)
+
+     # Clean up whitespace
+     text = _clean_whitespace(text)
+
+     logger.debug(f"Extracted {len(text)} chars from HTML (title: {title})")
+
+     return text, title
+
+
+ def _clean_whitespace(text: str) -> str:
+     """
+     Clean up whitespace while preserving paragraph structure.
+
+     Args:
+         text: Raw extracted text
+
+     Returns:
+         Cleaned text
+     """
+     # Normalize line breaks
+     text = re.sub(r"\r\n?", "\n", text)
+
+     # Collapse multiple spaces (but not newlines)
+     text = re.sub(r"[^\S\n]+", " ", text)
+
+     # Collapse multiple newlines to max 2
+     text = re.sub(r"\n{3,}", "\n\n", text)
+
+     # Remove leading/trailing whitespace from each line
+     lines = [line.strip() for line in text.split("\n")]
+     text = "\n".join(lines)
+
+     # Remove empty lines at start/end
+     text = text.strip()
+
+     return text
+
+
+ def extract_article_content(html: str) -> tuple[str, dict]:
+     """
+     Extract article content with metadata.
+
+     Attempts to extract structured article data including:
+     - Title
+     - Author
+     - Published date
+     - Main content
+
+     Args:
+         html: Raw HTML string
+
+     Returns:
+         Tuple of (content, metadata dict)
+     """
+     try:
+         from bs4 import BeautifulSoup
+     except ImportError:
+         raise ImportError(
+             "BeautifulSoup is required for HTML extraction. "
+             "Install with: pip install beautifulsoup4"
+         )
+
+     soup = BeautifulSoup(html, "html.parser")
+
+     metadata = {}
+
+     # Extract title
+     title = None
+     # Try og:title first
+     og_title = soup.find("meta", property="og:title")
+     if og_title and og_title.get("content"):
+         title = og_title["content"].strip()
+     elif soup.title and soup.title.string:
+         title = soup.title.string.strip()
+
+     if title:
+         metadata["title"] = title
+
+     # Extract author
+     author = None
+     author_meta = soup.find("meta", attrs={"name": "author"})
+     if author_meta and author_meta.get("content"):
+         author = author_meta["content"].strip()
+     else:
+         # Try common author class patterns
+         author_elem = soup.select_one(".author, .byline, [rel='author']")
+         if author_elem:
+             author = author_elem.get_text(strip=True)
+
+     if author:
+         metadata["author"] = author
+
+     # Extract published date
+     date = None
+     date_meta = soup.find("meta", property="article:published_time")
+     if date_meta and date_meta.get("content"):
+         date = date_meta["content"]
+     else:
+         date_elem = soup.select_one("time[datetime], .date, .published")
+         if date_elem:
+             date = date_elem.get("datetime") or date_elem.get_text(strip=True)
+
+     if date:
+         metadata["published_date"] = date
+
+     # Extract description
+     description = None
+     desc_meta = soup.find("meta", attrs={"name": "description"})
+     if desc_meta and desc_meta.get("content"):
+         description = desc_meta["content"].strip()
+
+     if description:
+         metadata["description"] = description
+
+     # Extract main content
+     content, extracted_title = extract_text_from_html(html)
+
+     # Use extracted title if we didn't find one
+     if not title and extracted_title:
+         metadata["title"] = extracted_title
+
+     return content, metadata
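A similar sketch for the new HTML extraction helpers. The HTML snippet is invented; only the two public functions from the diff above are assumed, and beautifulsoup4 must be installed.

    from statement_extractor.document.html_extractor import (
        extract_article_content,
        extract_text_from_html,
    )

    html = (
        "<html><head><title>Acme buys Widget | Example News</title></head>"
        "<body><nav>Home</nav>"
        "<article><p>Acme Corp acquired Widget Ltd today.</p></article>"
        "<footer>(c) Example</footer></body></html>"
    )

    # nav/footer are stripped and the title is cut at the "|" separator
    text, title = extract_text_from_html(html)     # title == "Acme buys Widget"
    content, meta = extract_article_content(html)  # meta may carry title/author/published_date/description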