corp-extractor 0.4.0-py3-none-any.whl → 0.9.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/METADATA +348 -64
  2. corp_extractor-0.9.0.dist-info/RECORD +76 -0
  3. statement_extractor/__init__.py +10 -1
  4. statement_extractor/cli.py +1663 -17
  5. statement_extractor/data/default_predicates.json +368 -0
  6. statement_extractor/data/statement_taxonomy.json +6972 -0
  7. statement_extractor/database/__init__.py +52 -0
  8. statement_extractor/database/embeddings.py +186 -0
  9. statement_extractor/database/hub.py +520 -0
  10. statement_extractor/database/importers/__init__.py +24 -0
  11. statement_extractor/database/importers/companies_house.py +545 -0
  12. statement_extractor/database/importers/gleif.py +538 -0
  13. statement_extractor/database/importers/sec_edgar.py +375 -0
  14. statement_extractor/database/importers/wikidata.py +1012 -0
  15. statement_extractor/database/importers/wikidata_people.py +632 -0
  16. statement_extractor/database/models.py +230 -0
  17. statement_extractor/database/resolver.py +245 -0
  18. statement_extractor/database/store.py +1609 -0
  19. statement_extractor/document/__init__.py +62 -0
  20. statement_extractor/document/chunker.py +410 -0
  21. statement_extractor/document/context.py +171 -0
  22. statement_extractor/document/deduplicator.py +173 -0
  23. statement_extractor/document/html_extractor.py +246 -0
  24. statement_extractor/document/loader.py +303 -0
  25. statement_extractor/document/pipeline.py +388 -0
  26. statement_extractor/document/summarizer.py +195 -0
  27. statement_extractor/extractor.py +1 -23
  28. statement_extractor/gliner_extraction.py +4 -74
  29. statement_extractor/llm.py +255 -0
  30. statement_extractor/models/__init__.py +89 -0
  31. statement_extractor/models/canonical.py +182 -0
  32. statement_extractor/models/document.py +308 -0
  33. statement_extractor/models/entity.py +102 -0
  34. statement_extractor/models/labels.py +220 -0
  35. statement_extractor/models/qualifiers.py +139 -0
  36. statement_extractor/models/statement.py +101 -0
  37. statement_extractor/models.py +4 -1
  38. statement_extractor/pipeline/__init__.py +39 -0
  39. statement_extractor/pipeline/config.py +129 -0
  40. statement_extractor/pipeline/context.py +177 -0
  41. statement_extractor/pipeline/orchestrator.py +416 -0
  42. statement_extractor/pipeline/registry.py +303 -0
  43. statement_extractor/plugins/__init__.py +55 -0
  44. statement_extractor/plugins/base.py +716 -0
  45. statement_extractor/plugins/extractors/__init__.py +13 -0
  46. statement_extractor/plugins/extractors/base.py +9 -0
  47. statement_extractor/plugins/extractors/gliner2.py +546 -0
  48. statement_extractor/plugins/labelers/__init__.py +29 -0
  49. statement_extractor/plugins/labelers/base.py +9 -0
  50. statement_extractor/plugins/labelers/confidence.py +138 -0
  51. statement_extractor/plugins/labelers/relation_type.py +87 -0
  52. statement_extractor/plugins/labelers/sentiment.py +159 -0
  53. statement_extractor/plugins/labelers/taxonomy.py +386 -0
  54. statement_extractor/plugins/labelers/taxonomy_embedding.py +477 -0
  55. statement_extractor/plugins/pdf/__init__.py +10 -0
  56. statement_extractor/plugins/pdf/pypdf.py +291 -0
  57. statement_extractor/plugins/qualifiers/__init__.py +30 -0
  58. statement_extractor/plugins/qualifiers/base.py +9 -0
  59. statement_extractor/plugins/qualifiers/companies_house.py +185 -0
  60. statement_extractor/plugins/qualifiers/embedding_company.py +420 -0
  61. statement_extractor/plugins/qualifiers/gleif.py +197 -0
  62. statement_extractor/plugins/qualifiers/person.py +785 -0
  63. statement_extractor/plugins/qualifiers/sec_edgar.py +209 -0
  64. statement_extractor/plugins/scrapers/__init__.py +10 -0
  65. statement_extractor/plugins/scrapers/http.py +236 -0
  66. statement_extractor/plugins/splitters/__init__.py +13 -0
  67. statement_extractor/plugins/splitters/base.py +9 -0
  68. statement_extractor/plugins/splitters/t5_gemma.py +293 -0
  69. statement_extractor/plugins/taxonomy/__init__.py +13 -0
  70. statement_extractor/plugins/taxonomy/embedding.py +484 -0
  71. statement_extractor/plugins/taxonomy/mnli.py +291 -0
  72. statement_extractor/scoring.py +8 -8
  73. corp_extractor-0.4.0.dist-info/RECORD +0 -12
  74. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/WHEEL +0 -0
  75. {corp_extractor-0.4.0.dist-info → corp_extractor-0.9.0.dist-info}/entry_points.txt +0 -0
statement_extractor/plugins/qualifiers/sec_edgar.py
@@ -0,0 +1,209 @@
+ """
+ SECEdgarQualifierPlugin - Qualifies US ORG entities with SEC data.
+
+ DEPRECATED: Use EmbeddingCompanyQualifier instead, which uses a local
+ embedding database with pre-loaded SEC EDGAR data for faster, offline matching.
+
+ Uses the SEC EDGAR API to:
+ - Look up CIK (Central Index Key) by company name
+ - Retrieve ticker symbol, exchange, filing history
+ """
+
+ import logging
+ import warnings
+ from typing import Optional
+
+ from ..base import BaseQualifierPlugin, PluginCapability
+ from ...pipeline.context import PipelineContext
+ from ...models import ExtractedEntity, EntityQualifiers, EntityType
+
+ logger = logging.getLogger(__name__)
+
+ # SEC EDGAR API endpoints
+ SEC_COMPANY_SEARCH = "https://efts.sec.gov/LATEST/search-index"
+ SEC_COMPANY_TICKERS = "https://www.sec.gov/files/company_tickers.json"
+
+
+ # DEPRECATED: Not auto-registered. Use EmbeddingCompanyQualifier instead.
+ class SECEdgarQualifierPlugin(BaseQualifierPlugin):
+     """
+     DEPRECATED: Use EmbeddingCompanyQualifier instead.
+
+     Qualifier plugin for US ORG entities using SEC EDGAR.
+     Provides CIK and ticker symbol for publicly traded US companies.
+     """
+
+     def __init__(
+         self,
+         timeout: int = 10,
+         cache_results: bool = True,
+     ):
+         """
+         Initialize the SEC EDGAR qualifier.
+
+         DEPRECATED: Use EmbeddingCompanyQualifier instead.
+
+         Args:
+             timeout: API request timeout in seconds
+             cache_results: Whether to cache API results
+         """
+         warnings.warn(
+             "SECEdgarQualifierPlugin is deprecated. Use EmbeddingCompanyQualifier instead.",
+             DeprecationWarning,
+             stacklevel=2,
+         )
+         self._timeout = timeout
+         self._cache_results = cache_results
+         self._cache: dict[str, Optional[dict]] = {}
+         self._ticker_cache: Optional[dict] = None
+
+     @property
+     def name(self) -> str:
+         return "sec_edgar_qualifier"
+
+     @property
+     def priority(self) -> int:
+         return 30  # Run after GLEIF and Companies House
+
+     @property
+     def capabilities(self) -> PluginCapability:
+         return PluginCapability.EXTERNAL_API | PluginCapability.CACHING
+
+     @property
+     def description(self) -> str:
+         return "Looks up SEC CIK and ticker for US public companies"
+
+     @property
+     def supported_entity_types(self) -> set[EntityType]:
+         return {EntityType.ORG}
+
+     @property
+     def supported_identifier_types(self) -> list[str]:
+         return ["sec_cik", "ticker"]  # Can look up by CIK or ticker
+
+     @property
+     def provided_identifier_types(self) -> list[str]:
+         return ["sec_cik", "ticker"]  # Provides CIK and ticker
+
+     def qualify(
+         self,
+         entity: ExtractedEntity,
+         context: PipelineContext,
+     ) -> Optional[EntityQualifiers]:
+         """
+         Qualify an ORG entity with SEC EDGAR data.
+
+         Args:
+             entity: The ORG entity to qualify
+             context: Pipeline context
+
+         Returns:
+             EntityQualifiers with CIK and ticker, or None if not found
+         """
+         if entity.type != EntityType.ORG:
+             return None
+
+         # Check cache first
+         cache_key = entity.text.lower().strip()
+         if self._cache_results and cache_key in self._cache:
+             cached = self._cache[cache_key]
+             if cached is None:
+                 return None
+             return self._data_to_qualifiers(cached)
+
+         # Search SEC
+         result = self._search_sec(entity.text)
+
+         # Cache result
+         if self._cache_results:
+             self._cache[cache_key] = result
+
+         if result:
+             return self._data_to_qualifiers(result)
+
+         return None
+
+     def _load_ticker_cache(self) -> dict:
+         """Load the SEC company tickers JSON (cached)."""
+         if self._ticker_cache is not None:
+             return self._ticker_cache
+
+         try:
+             import requests
+
+             response = requests.get(SEC_COMPANY_TICKERS, timeout=self._timeout)
+             response.raise_for_status()
+             data = response.json()
+
+             # Build lookup by company name (lowercase)
+             self._ticker_cache = {}
+             for key, company in data.items():
+                 name = company.get("title", "").lower()
+                 if name:
+                     self._ticker_cache[name] = {
+                         "cik": str(company.get("cik_str", "")),
+                         "ticker": company.get("ticker", ""),
+                         "title": company.get("title", ""),
+                     }
+
+             logger.debug(f"Loaded {len(self._ticker_cache)} SEC company tickers")
+             return self._ticker_cache
+
+         except Exception as e:
+             logger.debug(f"Failed to load SEC ticker cache: {e}")
+             self._ticker_cache = {}
+             return self._ticker_cache
+
+     def _search_sec(self, org_name: str) -> Optional[dict]:
+         """Search SEC for company information."""
+         try:
+             # Load ticker cache
+             ticker_cache = self._load_ticker_cache()
+
+             # Try exact match first
+             org_lower = org_name.lower().strip()
+             if org_lower in ticker_cache:
+                 return ticker_cache[org_lower]
+
+             # Try partial match
+             for name, data in ticker_cache.items():
+                 if org_lower in name or name in org_lower:
+                     return data
+
+             # Try matching without common legal suffixes. Only trailing
+             # suffixes are stripped; str.replace() would also hit the middle
+             # of names (e.g. " co" inside "coca cola").
+             suffixes = (" inc.", " inc", " corp.", " corp", " co.", " co", " ltd", " llc")
+
+             def strip_suffixes(value: str) -> str:
+                 for suffix in suffixes:
+                     if value.endswith(suffix):
+                         value = value[: -len(suffix)].strip()
+                 return value
+
+             clean_name = strip_suffixes(org_lower)
+
+             for name, data in ticker_cache.items():
+                 clean_cached = strip_suffixes(name)
+                 if clean_name == clean_cached or clean_name in clean_cached or clean_cached in clean_name:
+                     return data
+
+         except Exception as e:
+             logger.debug(f"SEC search error: {e}")
+
+         return None
+
+     def _data_to_qualifiers(self, data: dict) -> EntityQualifiers:
+         """Convert SEC data to EntityQualifiers."""
+         identifiers = {}
+         if data.get("cik"):
+             identifiers["sec_cik"] = data["cik"]
+         if data.get("ticker"):
+             identifiers["ticker"] = data["ticker"]
+
+         return EntityQualifiers(
+             jurisdiction="US",
+             country="US",
+             identifiers=identifiers,
+         )
+
+
+ # Allow importing without decorator for testing
+ SECEdgarQualifierPluginClass = SECEdgarQualifierPlugin
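At its core, the deprecated qualifier is a name lookup against the SEC's public company_tickers.json file. A minimal standalone sketch of that lookup, using the same endpoint and JSON keys (cik_str, ticker, title) as the plugin above; the lookup_cik helper and the User-Agent string are ours, not part of the package (the SEC asks automated clients to identify themselves, which the plugin itself does not do):

    from typing import Optional

    import requests

    SEC_COMPANY_TICKERS = "https://www.sec.gov/files/company_tickers.json"

    def lookup_cik(org_name: str, timeout: int = 10) -> Optional[dict]:
        # The file maps index keys to {"cik_str": ..., "ticker": ..., "title": ...}
        response = requests.get(
            SEC_COMPANY_TICKERS,
            timeout=timeout,
            headers={"User-Agent": "example-script admin@example.com"},  # assumed identifier
        )
        response.raise_for_status()
        by_name = {
            company["title"].lower(): {
                "cik": str(company["cik_str"]),
                "ticker": company["ticker"],
            }
            for company in response.json().values()
        }
        return by_name.get(org_name.lower().strip())

    print(lookup_cik("Apple Inc."))  # e.g. {'cik': '320193', 'ticker': 'AAPL'}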
statement_extractor/plugins/scrapers/__init__.py
@@ -0,0 +1,10 @@
+ """
+ Scraper plugins for fetching content from URLs.
+
+ Built-in scrapers:
+ - http_scraper: Default HTTP scraper using httpx with retries
+ """
+
+ from .http import HttpScraperPlugin
+
+ __all__ = ["HttpScraperPlugin"]
statement_extractor/plugins/scrapers/http.py
@@ -0,0 +1,236 @@
+ """
+ HTTP scraper plugin for fetching web content.
+
+ Uses httpx for async HTTP requests with retries, timeouts, and CAPTCHA detection.
+ """
+
+ import logging
+ from typing import Optional
+
+ from ..base import BaseScraperPlugin, ContentType, ScraperResult
+ from ...pipeline.registry import PluginRegistry
+
+ logger = logging.getLogger(__name__)
+
+
+ @PluginRegistry.scraper
+ class HttpScraperPlugin(BaseScraperPlugin):
+     """
+     Default HTTP scraper using httpx with retries and timeouts.
+
+     Features:
+     - Async HTTP requests with httpx
+     - Automatic redirect following
+     - Content type detection from headers and URL
+     - CAPTCHA page detection
+     - Configurable timeout and retries
+     """
+
+     def __init__(
+         self,
+         timeout: float = 30.0,
+         max_retries: int = 3,
+         user_agent: str = "Mozilla/5.0 (compatible; StatementExtractor/1.0; +https://github.com/corp-o-rate/statement-extractor)",
+         follow_redirects: bool = True,
+     ):
+         self._timeout = timeout
+         self._max_retries = max_retries
+         self._user_agent = user_agent
+         self._follow_redirects = follow_redirects
+
+     @property
+     def name(self) -> str:
+         return "http_scraper"
+
+     @property
+     def priority(self) -> int:
+         return 100  # Default scraper
+
+     @property
+     def description(self) -> str:
+         return "Default HTTP scraper using httpx with retries and CAPTCHA detection"
+
+     async def fetch(self, url: str, timeout: Optional[float] = None) -> ScraperResult:
+         """
+         Fetch content from a URL with retries and CAPTCHA detection.
+
+         Args:
+             url: The URL to fetch
+             timeout: Request timeout in seconds (uses instance default if None)
+
+         Returns:
+             ScraperResult with content, content type, and any errors
+         """
+         import httpx
+
+         timeout = timeout or self._timeout
+         last_error: Optional[str] = None
+
+         for attempt in range(self._max_retries):
+             try:
+                 async with httpx.AsyncClient(
+                     timeout=timeout,
+                     follow_redirects=self._follow_redirects,
+                 ) as client:
+                     logger.debug(f"Fetching URL: {url} (attempt {attempt + 1})")
+
+                     response = await client.get(
+                         url,
+                         headers={"User-Agent": self._user_agent},
+                     )
+
+                     content_type = self._detect_content_type(
+                         dict(response.headers), url
+                     )
+
+                     # Check for CAPTCHA if HTML
+                     error = None
+                     if content_type == ContentType.HTML:
+                         if self._is_captcha_page(response.content):
+                             error = "CAPTCHA or challenge page detected"
+                             logger.warning(f"CAPTCHA detected at {url}")
+
+                     return ScraperResult(
+                         url=url,
+                         final_url=str(response.url),
+                         content=response.content,
+                         content_type=content_type,
+                         headers=dict(response.headers),
+                         error=error,
+                     )
+
+             except httpx.TimeoutException as e:
+                 last_error = f"Request timed out after {timeout}s"
+                 logger.warning(f"Timeout fetching {url}: {e}")
+             except httpx.ConnectError as e:
+                 last_error = f"Connection error: {e}"
+                 logger.warning(f"Connection error fetching {url}: {e}")
+             except httpx.HTTPStatusError as e:
+                 last_error = f"HTTP {e.response.status_code}: {e.response.reason_phrase}"
+                 logger.warning(f"HTTP error fetching {url}: {e}")
+                 # Don't retry on 4xx errors
+                 if 400 <= e.response.status_code < 500:
+                     break
+             except Exception as e:
+                 last_error = f"Unexpected error: {e}"
+                 logger.exception(f"Error fetching {url}")
+
+         # All retries failed
+         return ScraperResult(
+             url=url,
+             final_url=url,
+             content=b"",
+             content_type=ContentType.UNKNOWN,
+             error=last_error or "Unknown error",
+         )
+
+     async def head(self, url: str, timeout: Optional[float] = None) -> ScraperResult:
+         """
+         Check content type without downloading the full body.
+
+         Args:
+             url: The URL to check
+             timeout: Request timeout in seconds
+
+         Returns:
+             ScraperResult with content_type populated (content is empty)
+         """
+         import httpx
+
+         timeout = timeout or self._timeout
+
+         try:
+             async with httpx.AsyncClient(
+                 timeout=timeout,
+                 follow_redirects=self._follow_redirects,
+             ) as client:
+                 response = await client.head(
+                     url,
+                     headers={"User-Agent": self._user_agent},
+                 )
+
+                 content_type = self._detect_content_type(
+                     dict(response.headers), url
+                 )
+
+                 return ScraperResult(
+                     url=url,
+                     final_url=str(response.url),
+                     content=b"",
+                     content_type=content_type,
+                     headers=dict(response.headers),
+                 )
+
+         except Exception as e:
+             logger.warning(f"HEAD request failed for {url}: {e}")
+             # Fall back to full fetch
+             return await self.fetch(url, timeout)
+
+     @staticmethod
+     def _detect_content_type(headers: dict[str, str], url: str) -> ContentType:
+         """
+         Detect content type from HTTP headers and URL.
+
+         Priority:
+         1. Content-Type header
+         2. URL file extension
+         """
+         content_type_header = headers.get("content-type", "").lower()
+
+         # Check Content-Type header
+         if "application/pdf" in content_type_header:
+             return ContentType.PDF
+         if any(mime in content_type_header for mime in [
+             "text/html",
+             "application/xhtml+xml",
+         ]):
+             return ContentType.HTML
+
+         # Check URL extension
+         url_lower = url.lower().split("?")[0]  # Remove query params
+         if url_lower.endswith(".pdf"):
+             return ContentType.PDF
+         if url_lower.endswith((".html", ".htm")):
+             return ContentType.HTML
+
+         # Default based on content-type
+         if content_type_header.startswith("text/"):
+             return ContentType.HTML
+         if content_type_header.startswith(("image/", "audio/", "video/")):
+             return ContentType.BINARY
+
+         return ContentType.UNKNOWN
+
+     @staticmethod
+     def _is_captcha_page(content: bytes) -> bool:
+         """
+         Detect CAPTCHA or challenge pages.
+
+         Checks for common CAPTCHA patterns in HTML content.
+         """
+         try:
+             html = content.decode("utf-8", errors="replace").lower()
+         except Exception:
+             return False
+
+         # Only check small pages (challenge pages are usually small)
+         if len(html) > 50000:
+             return False
+
+         # Common CAPTCHA/challenge indicators
+         captcha_patterns = [
+             "captcha",
+             "cloudflare",
+             "checking your browser",
+             "please verify you are a human",
+             "access denied",
+             "bot protection",
+             "ddos protection",
+             "just a moment",
+             "enable javascript",
+             "please enable cookies",
+             "verify you are human",
+             "security check",
+         ]
+
+         return any(pattern in html for pattern in captcha_patterns)
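Both fetch and head are coroutines, so callers drive them with asyncio. A minimal usage sketch based only on the constructor and signatures above (the URLs are placeholders):

    import asyncio

    from statement_extractor.plugins.scrapers import HttpScraperPlugin

    async def main() -> None:
        scraper = HttpScraperPlugin(timeout=15.0, max_retries=2)

        # Cheap content-type probe; falls back to a full GET if HEAD fails
        probe = await scraper.head("https://www.sec.gov/files/company_tickers.json")
        print(probe.content_type)

        result = await scraper.fetch("https://example.com/")
        if result.error:
            print(f"fetch failed or was flagged: {result.error}")
        else:
            print(f"{result.final_url}: {len(result.content)} bytes ({result.content_type})")

    asyncio.run(main())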
statement_extractor/plugins/splitters/__init__.py
@@ -0,0 +1,13 @@
+ """
+ Splitter plugins for Stage 1 (Splitting).
+
+ Splits text into atomic triples.
+ """
+
+ from .base import BaseSplitterPlugin
+ from .t5_gemma import T5GemmaSplitter
+
+ __all__ = [
+     "BaseSplitterPlugin",
+     "T5GemmaSplitter",
+ ]
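For readers new to the term: an "atomic triple" is a single subject-predicate-object statement, so one sentence can yield several. A purely illustrative example of the shape; this is not the package's own statement model (that lives in statement_extractor/models/statement.py, which this diff does not show in full):

    from dataclasses import dataclass

    @dataclass
    class Triple:  # illustrative stand-in, not the package's model
        subject: str
        predicate: str
        obj: str

    # "Acme Corp acquired Widget Co and appointed Jane Doe as CEO" splits into:
    triples = [
        Triple("Acme Corp", "acquired", "Widget Co"),
        Triple("Acme Corp", "appointed as CEO", "Jane Doe"),
    ]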
statement_extractor/plugins/splitters/base.py
@@ -0,0 +1,9 @@
+ """
+ Base class for splitter plugins.
+
+ Re-exports BaseSplitterPlugin from the main plugins module.
+ """
+
+ from ..base import BaseSplitterPlugin
+
+ __all__ = ["BaseSplitterPlugin"]