corp-extractor 0.5.0__py3-none-any.whl → 0.9.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/METADATA +228 -30
  2. corp_extractor-0.9.3.dist-info/RECORD +79 -0
  3. statement_extractor/__init__.py +1 -1
  4. statement_extractor/cli.py +2030 -24
  5. statement_extractor/data/statement_taxonomy.json +6949 -1159
  6. statement_extractor/database/__init__.py +52 -0
  7. statement_extractor/database/embeddings.py +186 -0
  8. statement_extractor/database/hub.py +428 -0
  9. statement_extractor/database/importers/__init__.py +32 -0
  10. statement_extractor/database/importers/companies_house.py +559 -0
  11. statement_extractor/database/importers/companies_house_officers.py +431 -0
  12. statement_extractor/database/importers/gleif.py +561 -0
  13. statement_extractor/database/importers/sec_edgar.py +392 -0
  14. statement_extractor/database/importers/sec_form4.py +512 -0
  15. statement_extractor/database/importers/wikidata.py +1120 -0
  16. statement_extractor/database/importers/wikidata_dump.py +1951 -0
  17. statement_extractor/database/importers/wikidata_people.py +1130 -0
  18. statement_extractor/database/models.py +254 -0
  19. statement_extractor/database/resolver.py +245 -0
  20. statement_extractor/database/store.py +3034 -0
  21. statement_extractor/document/__init__.py +62 -0
  22. statement_extractor/document/chunker.py +410 -0
  23. statement_extractor/document/context.py +171 -0
  24. statement_extractor/document/deduplicator.py +171 -0
  25. statement_extractor/document/html_extractor.py +246 -0
  26. statement_extractor/document/loader.py +303 -0
  27. statement_extractor/document/pipeline.py +388 -0
  28. statement_extractor/document/summarizer.py +195 -0
  29. statement_extractor/extractor.py +1 -1
  30. statement_extractor/models/__init__.py +19 -3
  31. statement_extractor/models/canonical.py +44 -1
  32. statement_extractor/models/document.py +308 -0
  33. statement_extractor/models/labels.py +47 -18
  34. statement_extractor/models/qualifiers.py +51 -3
  35. statement_extractor/models/statement.py +39 -15
  36. statement_extractor/models.py +1 -1
  37. statement_extractor/pipeline/config.py +6 -11
  38. statement_extractor/pipeline/context.py +5 -5
  39. statement_extractor/pipeline/orchestrator.py +90 -121
  40. statement_extractor/pipeline/registry.py +52 -46
  41. statement_extractor/plugins/__init__.py +20 -8
  42. statement_extractor/plugins/base.py +348 -78
  43. statement_extractor/plugins/extractors/gliner2.py +38 -28
  44. statement_extractor/plugins/labelers/taxonomy.py +18 -5
  45. statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
  46. statement_extractor/plugins/pdf/__init__.py +10 -0
  47. statement_extractor/plugins/pdf/pypdf.py +291 -0
  48. statement_extractor/plugins/qualifiers/__init__.py +11 -0
  49. statement_extractor/plugins/qualifiers/companies_house.py +14 -3
  50. statement_extractor/plugins/qualifiers/embedding_company.py +422 -0
  51. statement_extractor/plugins/qualifiers/gleif.py +14 -3
  52. statement_extractor/plugins/qualifiers/person.py +588 -14
  53. statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
  54. statement_extractor/plugins/scrapers/__init__.py +10 -0
  55. statement_extractor/plugins/scrapers/http.py +236 -0
  56. statement_extractor/plugins/splitters/t5_gemma.py +176 -75
  57. statement_extractor/plugins/taxonomy/embedding.py +193 -46
  58. statement_extractor/plugins/taxonomy/mnli.py +16 -4
  59. statement_extractor/scoring.py +8 -8
  60. corp_extractor-0.5.0.dist-info/RECORD +0 -55
  61. statement_extractor/plugins/canonicalizers/__init__.py +0 -17
  62. statement_extractor/plugins/canonicalizers/base.py +0 -9
  63. statement_extractor/plugins/canonicalizers/location.py +0 -219
  64. statement_extractor/plugins/canonicalizers/organization.py +0 -230
  65. statement_extractor/plugins/canonicalizers/person.py +0 -242
  66. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/WHEEL +0 -0
  67. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,512 @@
1
+ """
2
+ SEC Form 4 importer for the people database.
3
+
4
+ Imports insider ownership data from SEC Form 4 filings to identify
5
+ officers and directors of public companies.
6
+
7
+ Form 4 Structure (XML):
8
+ - issuer: CIK, name, ticker of the company
9
+ - reportingOwner: CIK, name, relationship (isDirector, isOfficer, officerTitle)
10
+ - transactions: Stock transactions (not used for people import)
11
+
12
+ Data Source:
13
+ - Quarterly index files at: /Archives/edgar/full-index/{year}/QTR{q}/form.idx
14
+ - Individual filings at: /Archives/edgar/data/{cik}/{accession}.txt
15
+
16
+ Resume Support:
17
+ - Progress tracked by (year, quarter, filing_index)
18
+ - Progress saved to JSON file for resume on interruption
19
+ """
20
+
21
+ import json
22
+ import logging
23
+ import re
24
+ import time
25
+ import urllib.error
26
+ import urllib.request
27
+ import xml.etree.ElementTree as ET
28
+ from dataclasses import dataclass, field
29
+ from datetime import datetime
30
+ from pathlib import Path
31
+ from typing import Callable, Iterator, Optional
32
+
33
+ from ..models import PersonRecord, PersonType
34
+
35
logger = logging.getLogger(__name__)

# Root of the SEC website and of the EDGAR quarterly full-index tree.
SEC_BASE_URL = "https://www.sec.gov"
SEC_FULL_INDEX_URL = SEC_BASE_URL + "/Archives/edgar/full-index"

# Identifying User-Agent header sent with every SEC request (required).
SEC_USER_AGENT = "corp-extractor/1.0 (contact@corp-o-rate.com)"

# Rate limiting: the SEC allows 10 requests/second; stay at 5 to be safe.
SEC_REQUEST_DELAY = 0.2  # seconds between consecutive requests (200ms)

# Progress file used when the caller does not supply a path.
DEFAULT_PROGRESS_PATH = Path.home().joinpath(".cache", "corp-extractor", "sec-form4-progress.json")
49
+
50
+
51
+ def _normalize_name(name: str) -> str:
52
+ """Normalize a person name for consistent storage."""
53
+ # Remove extra whitespace
54
+ name = " ".join(name.split())
55
+ # Title case
56
+ name = name.title()
57
+ return name
58
+
59
+
60
def _map_to_person_type(
    is_director: bool, is_officer: bool, is_ten_percent_owner: bool, officer_title: str
) -> PersonType:
    """
    Map Form 4 relationship flags to a PersonType.

    Officers and directors both map to EXECUTIVE; an owner who is only a
    10% holder maps to ENTREPRENEUR; anything else maps to UNKNOWN.
    ``officer_title`` is accepted for signature parity with
    ``_extract_officer_role`` but is not consulted.
    """
    if is_officer or is_director:
        return PersonType.EXECUTIVE
    # Significant investors with no officer/director role.
    return PersonType.ENTREPRENEUR if is_ten_percent_owner else PersonType.UNKNOWN
71
+
72
+
73
+ def _extract_officer_role(
74
+ is_director: bool, is_officer: bool, is_ten_percent_owner: bool, officer_title: str
75
+ ) -> str:
76
+ """Extract the role description from Form 4 data."""
77
+ roles = []
78
+ if is_director:
79
+ roles.append("Director")
80
+ if is_officer and officer_title:
81
+ roles.append(officer_title)
82
+ elif is_officer:
83
+ roles.append("Officer")
84
+ if is_ten_percent_owner and not is_director and not is_officer:
85
+ roles.append("Investor")
86
+ return ", ".join(roles) if roles else "Insider"
87
+
88
+
89
@dataclass
class Form4Progress:
    """
    Tracks progress through SEC Form 4 import for resume support.

    Progress is tracked by:
    - year: Current year being processed
    - quarter: Current quarter (1-4)
    - filing_index: Index within current quarter's Form 4 filings
    - total_imported: Total records imported so far
    - last_accession: Accession number stored with the latest progress update
    - started_at / updated_at: ISO-8601 timestamps from ``datetime.now()``
      (naive local time); ``updated_at`` is refreshed on every ``save()``
    """
    year: int = 0
    quarter: int = 0
    filing_index: int = 0
    total_imported: int = 0
    last_accession: str = ""
    started_at: str = field(default_factory=lambda: datetime.now().isoformat())
    updated_at: str = field(default_factory=lambda: datetime.now().isoformat())

    def save(self, path: Path = DEFAULT_PROGRESS_PATH) -> None:
        """
        Save progress to a JSON file.

        The data is written to a sibling ``<name>.tmp`` file and then moved over
        ``path``, so an interruption during the write cannot leave a truncated
        progress file behind.

        Args:
            path: Destination file; parent directories are created if missing.

        Raises:
            OSError: If the directory or file cannot be written.
        """
        path.parent.mkdir(parents=True, exist_ok=True)
        self.updated_at = datetime.now().isoformat()
        payload = {
            "year": self.year,
            "quarter": self.quarter,
            "filing_index": self.filing_index,
            "total_imported": self.total_imported,
            "last_accession": self.last_accession,
            "started_at": self.started_at,
            "updated_at": self.updated_at,
        }
        tmp_path = path.with_name(path.name + ".tmp")
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(payload, f, indent=2)
        # Path.replace overwrites the destination in a single rename step.
        tmp_path.replace(path)
        logger.debug(f"Saved progress: year={self.year}, Q{self.quarter}, index={self.filing_index}")

    @classmethod
    def load(cls, path: Path = DEFAULT_PROGRESS_PATH) -> Optional["Form4Progress"]:
        """
        Load progress from a JSON file.

        Args:
            path: File previously written by ``save()``.

        Returns:
            The stored progress, or None if the file is missing, unreadable,
            not valid JSON, not a JSON object, or has non-integer counters.
        """
        if not path.exists():
            return None
        try:
            with open(path, encoding="utf-8") as f:
                data = json.load(f)
            if not isinstance(data, dict):
                raise ValueError("progress file does not contain a JSON object")
            now = datetime.now().isoformat()
            return cls(
                # int() rejects null/garbage here instead of failing later in range().
                year=int(data.get("year", 0)),
                quarter=int(data.get("quarter", 0)),
                filing_index=int(data.get("filing_index", 0)),
                total_imported=int(data.get("total_imported", 0)),
                last_accession=data.get("last_accession", ""),
                started_at=data.get("started_at", now),
                updated_at=data.get("updated_at", now),
            )
        except (OSError, ValueError, TypeError) as e:
            # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
            logger.warning(f"Failed to load progress from {path}: {e}")
            return None

    @staticmethod
    def clear(path: Path = DEFAULT_PROGRESS_PATH) -> None:
        """Delete the progress file; do nothing if it does not exist."""
        try:
            path.unlink()
        except FileNotFoundError:
            return
        logger.info(f"Cleared progress file: {path}")
151
+
152
+
153
@dataclass
class Form4Filing:
    """One Form 4 row taken from a quarterly EDGAR form index."""

    form_type: str
    company_name: str
    cik: str
    date_filed: str
    file_path: str

    @property
    def accession_number(self) -> str:
        """Accession number parsed from ``file_path``, or "" if it has no such suffix."""
        # Expected shape: edgar/data/1084869/0001437749-25-030850.txt
        found = re.search(r"/(\d+-\d+-\d+)\.txt$", self.file_path)
        if found is None:
            return ""
        return found.group(1)

    @property
    def xml_url(self) -> str:
        """Absolute URL of the filing document under the SEC Archives tree."""
        archive_path = self.file_path
        return f"{SEC_BASE_URL}/Archives/{archive_path}"
173
+
174
+
175
class SecForm4Importer:
    """
    Importer for SEC Form 4 insider ownership filings.

    Imports officers and directors from Form 4 filings into the people database.
    Filings are located through the quarterly EDGAR ``form.idx`` files, fetched
    one at a time under a client-side rate limit, and parsed into one
    PersonRecord per reporting owner that has a director, officer or
    10%-owner relationship to the issuer.
    """

    def __init__(self):
        """Initialize the SEC Form 4 importer."""
        # Monotonic-clock reading taken at the most recent request (0 = none yet).
        self._last_request_time: float = 0

    def _rate_limit(self) -> None:
        """Sleep as needed so consecutive requests are SEC_REQUEST_DELAY apart."""
        # The monotonic clock never goes backwards, so a wall-clock adjustment
        # cannot produce a negative elapsed time and an oversized sleep.
        elapsed = time.monotonic() - self._last_request_time
        if elapsed < SEC_REQUEST_DELAY:
            time.sleep(SEC_REQUEST_DELAY - elapsed)
        self._last_request_time = time.monotonic()

    def _fetch_url(self, url: str) -> str:
        """
        Fetch URL content with proper headers and rate limiting.

        Returns:
            The response body decoded as UTF-8 (undecodable bytes replaced).

        Raises:
            urllib.error.URLError: On HTTP or network failure.
        """
        self._rate_limit()
        req = urllib.request.Request(url, headers={"User-Agent": SEC_USER_AGENT})
        with urllib.request.urlopen(req, timeout=30) as response:
            return response.read().decode("utf-8", errors="replace")

    @staticmethod
    def _parse_index_line(line: str) -> Optional[tuple[str, str, str, str, str]]:
        """
        Split one ``form.idx`` row into its five columns.

        Returns:
            (form_type, company_name, cik, date_filed, file_path) for a plain
            Form 4 row, or None for any other or malformed line.
        """
        # Form 4 rows start with "4" followed by padding; "4/A" etc. do not match.
        if not line.startswith("4 "):
            return None
        parts = line.split()
        if len(parts) < 5:
            return None
        # Only the company name may contain spaces, so the last three tokens are
        # always CIK, date and file name. Reading them from the right works for
        # CIKs of any length and for names that contain numeric tokens.
        cik, date_filed, file_path = parts[-3:]
        if not cik.isdigit():
            return None
        return parts[0], " ".join(parts[1:-3]), cik, date_filed, file_path

    @staticmethod
    def _is_true(text: Optional[str]) -> bool:
        """Return True for the XML boolean spellings "1" and "true" (any case)."""
        return (text or "").strip().lower() in ("1", "true")

    def _fetch_index(self, year: int, quarter: int) -> "list[Form4Filing]":
        """
        Fetch and parse quarterly form index for Form 4 filings.

        Args:
            year: Year (e.g., 2025)
            quarter: Quarter (1-4)

        Returns:
            List of Form4Filing objects (empty if the index does not exist)

        Raises:
            urllib.error.URLError: On any fetch failure other than HTTP 404.
        """
        url = f"{SEC_FULL_INDEX_URL}/{year}/QTR{quarter}/form.idx"
        logger.info(f"Fetching index: {url}")

        try:
            content = self._fetch_url(url)
        except urllib.error.HTTPError as e:
            if e.code == 404:
                logger.warning(f"Index not found: {year} Q{quarter}")
                return []
            raise

        filings = []
        for line in content.split("\n"):
            fields = self._parse_index_line(line)
            if fields is None:
                continue
            form_type, company_name, cik, date_filed, file_path = fields
            filings.append(Form4Filing(
                form_type=form_type,
                company_name=company_name,
                cik=cik.zfill(10),
                date_filed=date_filed,
                file_path=file_path,
            ))

        logger.info(f"Found {len(filings)} Form 4 filings for {year} Q{quarter}")
        return filings

    def _parse_form4_xml(self, content: str) -> Iterator["PersonRecord"]:
        """
        Parse Form 4 XML content and yield PersonRecord objects.

        A single Form 4 can have multiple reporting owners, each yielding a record.
        Owners without a CIK, a name, or any director/officer/10%-owner flag are
        skipped; content without a parseable ownershipDocument yields nothing.
        """
        # Extract XML from the SEC filing wrapper
        xml_match = re.search(r"<\?xml.*?</ownershipDocument>", content, re.DOTALL)
        if not xml_match:
            return

        try:
            root = ET.fromstring(xml_match.group(0))
        except ET.ParseError as e:
            logger.debug(f"Failed to parse Form 4 XML: {e}")
            return

        # Extract issuer info
        issuer = root.find("issuer")
        if issuer is None:
            return

        issuer_cik = issuer.findtext("issuerCik", "").strip().lstrip("0")
        issuer_name = issuer.findtext("issuerName", "").strip()
        issuer_ticker = issuer.findtext("issuerTradingSymbol", "").strip()

        if not issuer_cik or not issuer_name:
            return

        # Period of report as stated in the document (not the index "Date Filed")
        period_of_report = root.findtext("periodOfReport", "").strip()

        # Process each reporting owner
        for owner in root.findall("reportingOwner"):
            owner_id = owner.find("reportingOwnerId")
            if owner_id is None:
                continue

            owner_cik = owner_id.findtext("rptOwnerCik", "").strip().lstrip("0")
            owner_name = owner_id.findtext("rptOwnerName", "").strip()

            if not owner_cik or not owner_name:
                continue

            # Get relationship info
            relationship = owner.find("reportingOwnerRelationship")
            is_director = False
            is_officer = False
            officer_title = ""
            is_ten_percent_owner = False

            if relationship is not None:
                # Flags may be written as 1/0 or true/false; accept both spellings.
                is_director = self._is_true(relationship.findtext("isDirector", "0"))
                is_officer = self._is_true(relationship.findtext("isOfficer", "0"))
                officer_title = (relationship.findtext("officerTitle", "") or "").strip()
                is_ten_percent_owner = self._is_true(relationship.findtext("isTenPercentOwner", "0"))

            # Skip if no relationship at all
            if not is_director and not is_officer and not is_ten_percent_owner:
                continue

            # Map to PersonType and role
            person_type = _map_to_person_type(is_director, is_officer, is_ten_percent_owner, officer_title)
            role = _extract_officer_role(is_director, is_officer, is_ten_percent_owner, officer_title)

            # Create unique source_id from owner CIK + issuer CIK
            # This allows same person to have multiple records for different companies
            source_id = f"{owner_cik}_{issuer_cik}"

            # Build record data
            record_data = {
                "owner_cik": owner_cik,
                "issuer_cik": issuer_cik,
                "issuer_name": issuer_name,
                "issuer_ticker": issuer_ticker,
                "is_director": is_director,
                "is_officer": is_officer,
                "officer_title": officer_title,
                "period_of_report": period_of_report,
            }

            yield PersonRecord(
                name=_normalize_name(owner_name),
                source="sec_edgar",
                source_id=source_id,
                country="US",
                person_type=person_type,
                known_for_role=role,
                known_for_org=issuer_name,
                # Note: known_for_org_id will be set during import if org exists in DB
                from_date=period_of_report,
                record=record_data,
            )

    def _fetch_and_parse_filing(self, filing: "Form4Filing") -> Iterator["PersonRecord"]:
        """
        Fetch a Form 4 filing and parse it for person records.

        Best-effort: a filing that cannot be fetched or parsed is logged and
        skipped so one bad filing does not abort the whole import.
        """
        try:
            content = self._fetch_url(filing.xml_url)
        except Exception as e:
            # Logged at warning level so repeated 403/429 responses are visible.
            logger.warning(f"Failed to fetch {filing.accession_number}: {e}")
            return

        try:
            yield from self._parse_form4_xml(content)
        except Exception as e:
            logger.debug(f"Failed to fetch/parse {filing.accession_number}: {e}")

    def import_quarter(
        self,
        year: int,
        quarter: int,
        start_index: int = 0,
        limit: Optional[int] = None,
        progress_callback: Optional[Callable[[int, str, int], None]] = None,
    ) -> Iterator["PersonRecord"]:
        """
        Import Form 4 filings for a specific quarter.

        Args:
            year: Year (e.g., 2025)
            quarter: Quarter (1-4)
            start_index: Index to start from (for resume)
            limit: Optional limit on number of records (None or 0 means no limit)
            progress_callback: Optional callback(filing_index, accession, records_yielded),
                called after each filing has been processed

        Yields:
            PersonRecord for each officer/director
        """
        filings = self._fetch_index(year, quarter)

        if not filings:
            return

        count = 0
        for i, filing in enumerate(filings):
            if i < start_index:
                continue

            if limit and count >= limit:
                break

            for record in self._fetch_and_parse_filing(filing):
                yield record
                count += 1

                if limit and count >= limit:
                    break

                if count % 1000 == 0:
                    logger.info(f"Imported {count} records from {year} Q{quarter}")

            if progress_callback:
                progress_callback(i, filing.accession_number, count)

    def import_range(
        self,
        start_year: int = 2020,
        end_year: Optional[int] = None,
        limit: Optional[int] = None,
        resume: bool = False,
        progress_callback: Optional[Callable[[int, int, int, str, int], None]] = None,
    ) -> Iterator["PersonRecord"]:
        """
        Import Form 4 filings for a range of years.

        Progress is saved every 100 filings and when the limit is reached, and
        the progress file is removed once the whole range has been processed.
        The saved filing_index is the last filing processed, so a resumed run
        fetches that filing again.

        Args:
            start_year: First year to import
            end_year: Last year to import (defaults to current year)
            limit: Optional total limit on records (None or 0 means no limit)
            resume: If True, resume from saved progress
            progress_callback: Optional callback(year, quarter, filing_index, accession, total)

        Yields:
            PersonRecord for each officer/director
        """
        if end_year is None:
            end_year = datetime.now().year

        # Load or initialize progress
        progress = None
        if resume:
            progress = Form4Progress.load()
            if progress:
                logger.info(f"Resuming from {progress.year} Q{progress.quarter} index {progress.filing_index}")
                logger.info(f"Previously imported: {progress.total_imported} records")

        if progress is None:
            progress = Form4Progress(year=start_year, quarter=1)

        count = progress.total_imported

        # A resumed run that already reached the limit has nothing left to do.
        if limit and count >= limit:
            logger.info(f"Limit of {limit} records already reached; nothing to import")
            return

        # Snapshot the resume point now: `progress` is mutated by the callback
        # below, so it cannot be used to decide where later quarters start.
        if progress.year:
            resume_year = progress.year
            resume_quarter = max(progress.quarter, 1)
            resume_index = progress.filing_index
        else:
            resume_year, resume_quarter, resume_index = start_year, 1, 0

        for year in range(resume_year, end_year + 1):
            first_quarter = resume_quarter if year == resume_year else 1

            for quarter in range(first_quarter, 5):
                at_resume_point = year == resume_year and quarter == resume_quarter
                start_idx = resume_index if at_resume_point else 0

                logger.info(f"Processing {year} Q{quarter} (starting at index {start_idx})")

                # Records imported before this quarter. import_quarter reports a
                # per-quarter count, so the running total must be built on this
                # fixed base; `count` itself already grows with every record.
                quarter_base = count

                def track_progress(
                    filing_idx: int,
                    accession: str,
                    quarter_count: int,
                    year: int = year,
                    quarter: int = quarter,
                    quarter_base: int = quarter_base,
                ) -> None:
                    progress.year = year
                    progress.quarter = quarter
                    progress.filing_index = filing_idx
                    progress.total_imported = quarter_base + quarter_count
                    progress.last_accession = accession
                    # Save progress periodically
                    if filing_idx % 100 == 0:
                        progress.save()
                    if progress_callback:
                        progress_callback(year, quarter, filing_idx, accession, progress.total_imported)

                # count < limit holds here, so the per-quarter limit is positive.
                quarter_limit = limit - count if limit else None

                for record in self.import_quarter(year, quarter, start_idx, quarter_limit, track_progress):
                    yield record
                    count += 1

                    if limit and count >= limit:
                        progress.total_imported = count
                        progress.save()
                        return

                # Reset filing index for next quarter
                progress.filing_index = 0

        # Clear progress on successful completion
        Form4Progress.clear()
        logger.info(f"Completed Form 4 import: {count} total records")

    def get_available_quarters(self, start_year: int = 2020) -> list[tuple[int, int]]:
        """
        Get list of available (year, quarter) pairs.

        Covers every quarter from Q1 of start_year up to and including the
        current calendar quarter; no network request is made.

        Args:
            start_year: First year to check

        Returns:
            List of (year, quarter) tuples
        """
        # Read the clock once so year and quarter come from the same instant.
        now = datetime.now()
        current_year = now.year
        current_quarter = (now.month - 1) // 3 + 1

        quarters = []
        for year in range(start_year, current_year + 1):
            max_q = current_quarter if year == current_year else 4
            for quarter in range(1, max_q + 1):
                quarters.append((year, quarter))

        return quarters