corp-extractor 0.5.0__py3-none-any.whl → 0.9.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/METADATA +228 -30
  2. corp_extractor-0.9.3.dist-info/RECORD +79 -0
  3. statement_extractor/__init__.py +1 -1
  4. statement_extractor/cli.py +2030 -24
  5. statement_extractor/data/statement_taxonomy.json +6949 -1159
  6. statement_extractor/database/__init__.py +52 -0
  7. statement_extractor/database/embeddings.py +186 -0
  8. statement_extractor/database/hub.py +428 -0
  9. statement_extractor/database/importers/__init__.py +32 -0
  10. statement_extractor/database/importers/companies_house.py +559 -0
  11. statement_extractor/database/importers/companies_house_officers.py +431 -0
  12. statement_extractor/database/importers/gleif.py +561 -0
  13. statement_extractor/database/importers/sec_edgar.py +392 -0
  14. statement_extractor/database/importers/sec_form4.py +512 -0
  15. statement_extractor/database/importers/wikidata.py +1120 -0
  16. statement_extractor/database/importers/wikidata_dump.py +1951 -0
  17. statement_extractor/database/importers/wikidata_people.py +1130 -0
  18. statement_extractor/database/models.py +254 -0
  19. statement_extractor/database/resolver.py +245 -0
  20. statement_extractor/database/store.py +3034 -0
  21. statement_extractor/document/__init__.py +62 -0
  22. statement_extractor/document/chunker.py +410 -0
  23. statement_extractor/document/context.py +171 -0
  24. statement_extractor/document/deduplicator.py +171 -0
  25. statement_extractor/document/html_extractor.py +246 -0
  26. statement_extractor/document/loader.py +303 -0
  27. statement_extractor/document/pipeline.py +388 -0
  28. statement_extractor/document/summarizer.py +195 -0
  29. statement_extractor/extractor.py +1 -1
  30. statement_extractor/models/__init__.py +19 -3
  31. statement_extractor/models/canonical.py +44 -1
  32. statement_extractor/models/document.py +308 -0
  33. statement_extractor/models/labels.py +47 -18
  34. statement_extractor/models/qualifiers.py +51 -3
  35. statement_extractor/models/statement.py +39 -15
  36. statement_extractor/models.py +1 -1
  37. statement_extractor/pipeline/config.py +6 -11
  38. statement_extractor/pipeline/context.py +5 -5
  39. statement_extractor/pipeline/orchestrator.py +90 -121
  40. statement_extractor/pipeline/registry.py +52 -46
  41. statement_extractor/plugins/__init__.py +20 -8
  42. statement_extractor/plugins/base.py +348 -78
  43. statement_extractor/plugins/extractors/gliner2.py +38 -28
  44. statement_extractor/plugins/labelers/taxonomy.py +18 -5
  45. statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
  46. statement_extractor/plugins/pdf/__init__.py +10 -0
  47. statement_extractor/plugins/pdf/pypdf.py +291 -0
  48. statement_extractor/plugins/qualifiers/__init__.py +11 -0
  49. statement_extractor/plugins/qualifiers/companies_house.py +14 -3
  50. statement_extractor/plugins/qualifiers/embedding_company.py +422 -0
  51. statement_extractor/plugins/qualifiers/gleif.py +14 -3
  52. statement_extractor/plugins/qualifiers/person.py +588 -14
  53. statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
  54. statement_extractor/plugins/scrapers/__init__.py +10 -0
  55. statement_extractor/plugins/scrapers/http.py +236 -0
  56. statement_extractor/plugins/splitters/t5_gemma.py +176 -75
  57. statement_extractor/plugins/taxonomy/embedding.py +193 -46
  58. statement_extractor/plugins/taxonomy/mnli.py +16 -4
  59. statement_extractor/scoring.py +8 -8
  60. corp_extractor-0.5.0.dist-info/RECORD +0 -55
  61. statement_extractor/plugins/canonicalizers/__init__.py +0 -17
  62. statement_extractor/plugins/canonicalizers/base.py +0 -9
  63. statement_extractor/plugins/canonicalizers/location.py +0 -219
  64. statement_extractor/plugins/canonicalizers/organization.py +0 -230
  65. statement_extractor/plugins/canonicalizers/person.py +0 -242
  66. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/WHEEL +0 -0
  67. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/entry_points.txt +0 -0
"""
SEC Edgar data importer for the company database.

Imports company data from SEC's bulk submissions.zip file
into the embedding database for company name matching.

The submissions.zip contains JSON files for ALL SEC filers (~100K+),
not just companies with ticker symbols (~10K).
"""

import json
import logging
import zipfile
from pathlib import Path
from typing import Any, Iterator, Optional

from ..models import CompanyRecord, EntityType

logger = logging.getLogger(__name__)

# SEC Edgar bulk-data endpoints.
SEC_SUBMISSIONS_URL = "https://www.sec.gov/Archives/edgar/daily-index/bulkdata/submissions.zip"
SEC_TICKERS_URL = "https://www.sec.gov/files/company_tickers.json"

# Identifying User-Agent header; the SEC requires one on automated requests.
SEC_USER_AGENT = "corp-extractor/1.0 (contact@corp-o-rate.com)"

# SEC entityType values that denote operating organizations
# (i.e. exclude individual filers and investment funds).
ORG_ENTITY_TYPES = {
    "operating",              # Operating companies
    "foreign-private-issuer", # Foreign companies
}

# SIC codes identifying funds/financial instruments; these map to
# EntityType.FUND regardless of the filer's entityType string.
FUND_SIC_CODES = {
    "6722",  # Management Investment Offices, Open-End
    "6726",  # Other Investment Offices
    "6732",  # Educational, Religious, and Charitable Trusts
    "6733",  # Trusts, Except Educational, Religious, and Charitable
    "6792",  # Oil Royalty Traders
    "6794",  # Patent Owners and Lessors
    "6795",  # Mineral Royalty Traders
    "6798",  # Real Estate Investment Trusts
    "6799",  # Investors, NEC
}

# Translation table from SEC entityType strings to our EntityType enum.
SEC_ENTITY_TYPE_MAP: dict[str, EntityType] = {
    "operating": EntityType.BUSINESS,
    "foreign-private-issuer": EntityType.BUSINESS,
    "": EntityType.UNKNOWN,
}
53
+
54
+
55
def _get_entity_type_from_sec(sec_entity_type: str, sic: str) -> EntityType:
    """Map an SEC entityType string and SIC code onto our EntityType enum."""
    # A fund-classified SIC code wins over the entityType string, since the
    # SIC code is the more specific signal.
    if sic in FUND_SIC_CODES:
        return EntityType.FUND

    normalized = sec_entity_type.lower()
    # Anything not explicitly mapped is assumed to be a business filer.
    return SEC_ENTITY_TYPE_MAP.get(normalized, EntityType.BUSINESS)
63
+
64
+
65
class SecEdgarImporter:
    """
    Importer for SEC Edgar company data.

    Uses the bulk submissions.zip file which contains all SEC filers,
    not just companies with ticker symbols.
    """

    def __init__(self):
        """Initialize the SEC Edgar importer."""
        # Lazily populated CIK -> ticker map; None means "not loaded yet".
        self._ticker_lookup: Optional[dict[str, str]] = None

    def import_from_url(
        self,
        limit: Optional[int] = None,
        download_dir: Optional[Path] = None,
    ) -> Iterator[CompanyRecord]:
        """
        Import records by downloading SEC bulk submissions.zip.

        Args:
            limit: Optional limit on number of records
            download_dir: Directory to download zip file to

        Yields:
            CompanyRecord for each company
        """
        # Download submissions.zip, then delegate to the local-file path.
        zip_path = self.download_submissions_zip(download_dir)
        yield from self.import_from_zip(zip_path, limit)

    def import_from_zip(
        self,
        zip_path: str | Path,
        limit: Optional[int] = None,
    ) -> Iterator[CompanyRecord]:
        """
        Import records from a local submissions.zip file.

        Args:
            zip_path: Path to submissions.zip
            limit: Optional limit on number of records

        Yields:
            CompanyRecord for each company

        Raises:
            FileNotFoundError: If zip_path does not exist.
        """
        zip_path = Path(zip_path)
        if not zip_path.exists():
            raise FileNotFoundError(f"SEC submissions.zip not found: {zip_path}")

        logger.info("Importing SEC Edgar data from %s", zip_path)

        # Ticker data enriches records and helps classify filers as businesses.
        self._load_ticker_lookup()

        count = 0
        with zipfile.ZipFile(zip_path, "r") as zf:
            # Per-filer submission files are named CIK##########.json.
            json_files = [n for n in zf.namelist() if n.startswith("CIK") and n.endswith(".json")]
            logger.info("Found %d submission files in archive", len(json_files))

            for filename in json_files:
                if limit and count >= limit:
                    break

                try:
                    with zf.open(filename) as f:
                        data = json.load(f)
                        record = self._parse_submission(data)
                        if record:
                            count += 1
                            yield record

                            if count % 10000 == 0:
                                logger.info("Imported %d SEC Edgar records", count)
                except Exception as e:
                    # BUGFIX: previously logged the literal text "(unknown)"
                    # instead of the member that failed to parse.
                    logger.debug("Failed to parse %s: %s", filename, e)

        logger.info("Completed SEC Edgar import: %d records", count)

    def import_from_file(
        self,
        file_path: str | Path,
        limit: Optional[int] = None,
    ) -> Iterator[CompanyRecord]:
        """
        Import records from a local file (zip or legacy tickers JSON).

        Args:
            file_path: Path to submissions.zip or company_tickers.json
            limit: Optional limit on number of records

        Yields:
            CompanyRecord for each company

        Raises:
            ValueError: If the file extension is neither .zip nor .json.
        """
        file_path = Path(file_path)

        if file_path.suffix == ".zip":
            yield from self.import_from_zip(file_path, limit)
        elif file_path.suffix == ".json":
            # Legacy support for company_tickers.json
            yield from self._import_from_tickers_json(file_path, limit)
        else:
            raise ValueError(f"Unsupported file type: {file_path.suffix}")

    def _import_from_tickers_json(
        self,
        file_path: Path,
        limit: Optional[int],
    ) -> Iterator[CompanyRecord]:
        """Legacy import from company_tickers.json."""
        logger.info("Importing from legacy tickers file: %s", file_path)

        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        count = 0
        for entry in data.values():
            if limit and count >= limit:
                break

            cik = entry.get("cik_str")
            ticker = entry.get("ticker", "")
            title = entry.get("title", "")

            # Skip malformed rows lacking a CIK or company name.
            if not cik or not title:
                continue

            # CIKs are zero-padded to 10 digits everywhere in this importer.
            cik_str = str(cik).zfill(10)
            record_data = {"cik": cik_str, "ticker": ticker, "title": title}

            yield CompanyRecord(
                name=title.strip(),
                source="sec_edgar",
                source_id=cik_str,
                region="US",  # Tickers file is US-only
                entity_type=EntityType.BUSINESS,  # Ticker file = publicly traded businesses
                record=record_data,
            )
            count += 1

        logger.info("Completed legacy SEC import: %d records", count)

    def _load_ticker_lookup(self) -> None:
        """Load ticker symbols for CIK enrichment (idempotent, best-effort)."""
        if self._ticker_lookup is not None:
            return

        self._ticker_lookup = {}
        try:
            import urllib.request

            req = urllib.request.Request(
                SEC_TICKERS_URL,
                headers={"User-Agent": SEC_USER_AGENT},
            )
            with urllib.request.urlopen(req, timeout=30) as response:
                data = json.loads(response.read().decode("utf-8"))

            for entry in data.values():
                cik = str(entry.get("cik_str", "")).zfill(10)
                ticker = entry.get("ticker", "")
                if cik and ticker:
                    self._ticker_lookup[cik] = ticker

            logger.info("Loaded %d ticker symbols", len(self._ticker_lookup))
        except Exception as e:
            # Enrichment is optional; the import still works without tickers.
            logger.warning("Failed to load ticker lookup: %s", e)

    def _parse_submission(self, data: dict[str, Any]) -> Optional[CompanyRecord]:
        """Parse a submission JSON file into a CompanyRecord.

        Returns None for individual filers and unparseable entries.
        """
        try:
            cik = str(data.get("cik", "")).zfill(10)
            name = data.get("name", "").strip()
            entity_type = data.get("entityType", "").lower()

            if not cik or not name:
                return None

            # Filter to only include organizations (exclude individuals without business indicators)
            ticker = self._ticker_lookup.get(cik, "") if self._ticker_lookup else ""
            sic = data.get("sic", "")

            # Include if: has a known org entity type, OR has a ticker (publicly traded), OR has SIC code
            is_organization = (
                entity_type in ORG_ENTITY_TYPES
                or ticker  # Has a ticker symbol = publicly traded company
                or sic  # Has SIC code = classified business
            )

            if not is_organization:
                return None

            # Determine entity type (business, fund, etc.)
            record_entity_type = _get_entity_type_from_sec(entity_type, sic)

            # Get additional fields
            sic_description = data.get("sicDescription", "")
            state = data.get("stateOfIncorporation", "")
            fiscal_year_end = data.get("fiscalYearEnd", "")

            # Get addresses
            addresses = data.get("addresses", {})
            business_addr = addresses.get("business", {})

            # Get exchange info from filings if available
            # (ticker was already resolved above for the organization filter).
            exchanges = data.get("exchanges", [])
            exchange = exchanges[0] if exchanges else ""

            # Get dates from filings history
            # Use oldest filing date as from_date (when company started filing with SEC)
            filings = data.get("filings", {})
            recent_filings = filings.get("recent", {})
            filing_dates = recent_filings.get("filingDate", [])

            # Get the oldest filing date (last in the list, as they're typically newest-first)
            from_date = None
            if filing_dates:
                # Filing dates are in YYYY-MM-DD format
                oldest_date = filing_dates[-1]
                if oldest_date and len(oldest_date) >= 10:
                    from_date = oldest_date[:10]

            # Build record
            record_data = {
                "cik": cik,
                "name": name,
                "sic": sic,
                "sic_description": sic_description,
                "entity_type": entity_type,
                "state_of_incorporation": state,
                "fiscal_year_end": fiscal_year_end,
                "ticker": ticker,
                "exchange": exchange,
                "business_address": {
                    "street": business_addr.get("street1", ""),
                    "city": business_addr.get("city", ""),
                    "state": business_addr.get("stateOrCountry", ""),
                    "zip": business_addr.get("zipCode", ""),
                },
            }
            if from_date:
                record_data["first_filing_date"] = from_date

            # Use stateOrCountry for region (2-letter US state or country code)
            region = business_addr.get("stateOrCountry", "US")

            return CompanyRecord(
                name=name,
                source="sec_edgar",
                source_id=cik,
                region=region,
                entity_type=record_entity_type,
                from_date=from_date,
                record=record_data,
            )

        except Exception as e:
            # Best-effort parsing: a bad submission file should not abort the import.
            logger.debug("Failed to parse submission: %s", e)
            return None

    def download_submissions_zip(self, output_dir: Optional[Path] = None) -> Path:
        """
        Download the SEC bulk submissions.zip file.

        Args:
            output_dir: Directory to save the file (defaults to a temp dir)

        Returns:
            Path to downloaded file
        """
        import tempfile
        import urllib.request

        if output_dir is None:
            output_dir = Path(tempfile.gettempdir()) / "sec_edgar"

        output_dir.mkdir(parents=True, exist_ok=True)
        output_path = output_dir / "submissions.zip"

        logger.info("Downloading SEC submissions.zip (~500MB)...")
        logger.info("URL: %s", SEC_SUBMISSIONS_URL)

        req = urllib.request.Request(
            SEC_SUBMISSIONS_URL,
            headers={"User-Agent": SEC_USER_AGENT},
        )

        with urllib.request.urlopen(req) as response:
            total_size = int(response.headers.get("Content-Length", 0))
            downloaded = 0
            chunk_size = 1024 * 1024  # 1MB chunks

            with open(output_path, "wb") as f:
                while True:
                    chunk = response.read(chunk_size)
                    if not chunk:
                        break
                    f.write(chunk)
                    downloaded += len(chunk)
                    if total_size > 0:
                        pct = (downloaded / total_size) * 100
                        logger.info(
                            "Downloaded %dMB / %dMB (%.1f%%)",
                            downloaded // (1024 * 1024),
                            total_size // (1024 * 1024),
                            pct,
                        )

        logger.info("Downloaded SEC submissions.zip to %s", output_path)
        return output_path

    def download_latest(self, output_path: Optional[Path] = None) -> Path:
        """
        Download the latest SEC bulk data.

        Args:
            output_path: Where to save the file (directory or file path)

        Returns:
            Path to downloaded file
        """
        if output_path is None:
            return self.download_submissions_zip()

        output_path = Path(output_path)
        if output_path.is_dir():
            return self.download_submissions_zip(output_path)
        else:
            # A file path was given; download into its parent directory.
            return self.download_submissions_zip(output_path.parent)