corp-extractor 0.9.0__py3-none-any.whl → 0.9.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/METADATA +72 -11
  2. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/RECORD +34 -27
  3. statement_extractor/cli.py +1317 -101
  4. statement_extractor/database/embeddings.py +45 -0
  5. statement_extractor/database/hub.py +86 -136
  6. statement_extractor/database/importers/__init__.py +10 -2
  7. statement_extractor/database/importers/companies_house.py +16 -2
  8. statement_extractor/database/importers/companies_house_officers.py +431 -0
  9. statement_extractor/database/importers/gleif.py +23 -0
  10. statement_extractor/database/importers/import_utils.py +264 -0
  11. statement_extractor/database/importers/sec_edgar.py +17 -0
  12. statement_extractor/database/importers/sec_form4.py +512 -0
  13. statement_extractor/database/importers/wikidata.py +151 -43
  14. statement_extractor/database/importers/wikidata_dump.py +2282 -0
  15. statement_extractor/database/importers/wikidata_people.py +867 -325
  16. statement_extractor/database/migrate_v2.py +852 -0
  17. statement_extractor/database/models.py +155 -7
  18. statement_extractor/database/schema_v2.py +409 -0
  19. statement_extractor/database/seed_data.py +359 -0
  20. statement_extractor/database/store.py +3449 -233
  21. statement_extractor/document/deduplicator.py +10 -12
  22. statement_extractor/extractor.py +1 -1
  23. statement_extractor/models/__init__.py +3 -2
  24. statement_extractor/models/statement.py +15 -17
  25. statement_extractor/models.py +1 -1
  26. statement_extractor/pipeline/context.py +5 -5
  27. statement_extractor/pipeline/orchestrator.py +12 -12
  28. statement_extractor/plugins/base.py +17 -17
  29. statement_extractor/plugins/extractors/gliner2.py +28 -28
  30. statement_extractor/plugins/qualifiers/embedding_company.py +7 -5
  31. statement_extractor/plugins/qualifiers/person.py +120 -53
  32. statement_extractor/plugins/splitters/t5_gemma.py +35 -39
  33. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/WHEEL +0 -0
  34. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,431 @@
1
+ """
2
+ Companies House officers bulk data importer.
3
+
4
+ Imports officer/director data from Companies House bulk data files (Prod195).
5
+ The data is in fixed-width format with `<` as field separators.
6
+
7
+ Data format:
8
+ - Header: DDDDSNAP{product_id}{date}
9
+ - Company record (type 1): company_number + 1 + company_name
10
+ - Officer record (type 2/3): company_number + type + officer_details
11
+
12
+ Officer record structure (0-indexed, after the 8-char company number):
+ - Position 8: Record type (1=company, 2=person)
+ - Position 9: Appointment date origin
+ - Positions 10-11: Appointment type
+ - Positions 12-23: Person number
+ - Position 24: Corporate indicator ('Y' = corporate officer)
+ - Positions 32-39: Appointment date (CCYYMMDD)
+ - Positions 40-47: Resignation date (CCYYMMDD)
+ - Positions 48-55: Postcode
+ - Positions 56-63: Partial birth date (CCYYMM + 2 spaces)
+ - Positions 64-71: Full birth date (CCYYMMDD)
+ - Position 76+: `<`-separated fields: Title, Forenames, Surname, ..., Occupation, Nationality, Country
21
+
22
+ Resume support:
23
+ - Progress tracked by file index and line number
24
+ - Progress saved to JSON file for resume on interruption
25
+ """
26
+
27
import json
import logging
import zipfile
from dataclasses import asdict, dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Callable, Iterator, Optional

from ..models import PersonRecord, PersonType
36
+
37
+ logger = logging.getLogger(__name__)
38
+
39
+ # Default progress file path
40
+ DEFAULT_PROGRESS_PATH = Path.home() / ".cache" / "corp-extractor" / "ch-officers-progress.json"
41
+
42
+
43
+ def _normalize_name(name: str) -> str:
44
+ """Normalize a person name for consistent storage."""
45
+ if not name:
46
+ return ""
47
+ # Remove extra whitespace and title case
48
+ name = " ".join(name.split())
49
+ return name.title()
50
+
51
+
52
+ def _parse_date(date_str: str) -> Optional[str]:
53
+ """Parse date from YYYYMMDD or YYYYMM format to ISO format."""
54
+ date_str = date_str.strip()
55
+ if not date_str or not date_str.isdigit():
56
+ return None
57
+
58
+ if len(date_str) == 8: # YYYYMMDD
59
+ try:
60
+ return f"{date_str[:4]}-{date_str[4:6]}-{date_str[6:8]}"
61
+ except Exception:
62
+ return None
63
+ elif len(date_str) == 6: # YYYYMM
64
+ try:
65
+ return f"{date_str[:4]}-{date_str[4:6]}"
66
+ except Exception:
67
+ return None
68
+ return None
69
+
70
+
71
def _map_role_to_person_type(role: str) -> PersonType:
    """Map an officer role/occupation string to a PersonType category."""
    lowered = role.lower()

    # Keyword buckets checked in priority order; first substring match wins.
    buckets = (
        (("director", "ceo", "cfo", "cto", "coo", "chief", "managing",
          "president", "chairman"), PersonType.EXECUTIVE),
        (("secretary", "solicitor", "lawyer", "legal"), PersonType.LEGAL),
        (("accountant", "engineer", "architect", "doctor", "consultant"),
         PersonType.PROFESSIONAL),
    )
    for keywords, person_type in buckets:
        if any(keyword in lowered for keyword in keywords):
            return person_type

    # Company officers with no recognized keyword default to executive.
    return PersonType.EXECUTIVE
88
+
89
+
90
@dataclass
class CHProgress:
    """Tracks progress through Companies House bulk import.

    Persisted as JSON so an interrupted import can resume from the last
    checkpointed file index / line number.
    """
    file_index: int = 0       # index into the sorted list of .dat files
    line_number: int = 0      # next line to process within that file
    total_imported: int = 0   # running count of yielded records
    last_company: str = ""    # company number last seen (diagnostic only)
    started_at: str = field(default_factory=lambda: datetime.now().isoformat())
    updated_at: str = field(default_factory=lambda: datetime.now().isoformat())

    def save(self, path: Path = DEFAULT_PROGRESS_PATH) -> None:
        """Save progress to JSON file, creating parent dirs as needed."""
        path.parent.mkdir(parents=True, exist_ok=True)
        self.updated_at = datetime.now().isoformat()
        # asdict keeps the serialized form in sync with the dataclass fields,
        # so a newly added field cannot be silently dropped from the file
        # (the previous hand-written dict duplicated every field name).
        with open(path, "w") as f:
            json.dump(asdict(self), f, indent=2)
        logger.debug(f"Saved progress: file={self.file_index}, line={self.line_number}")

    @classmethod
    def load(cls, path: Path = DEFAULT_PROGRESS_PATH) -> Optional["CHProgress"]:
        """Load progress from JSON file; returns None if absent or unreadable."""
        if not path.exists():
            return None
        try:
            with open(path) as f:
                data = json.load(f)
            return cls(**data)
        except Exception as e:
            # A corrupt progress file should not abort the import; warn and
            # let the caller start fresh.
            logger.warning(f"Failed to load progress: {e}")
            return None

    @staticmethod
    def clear(path: Path = DEFAULT_PROGRESS_PATH) -> None:
        """Delete progress file."""
        if path.exists():
            path.unlink()
            logger.info(f"Cleared progress file: {path}")
134
+
135
+
136
class CompaniesHouseOfficersImporter:
    """
    Importer for Companies House bulk officers data.

    Parses the Prod195 fixed-width format files and extracts officer records.
    """

    def __init__(self):
        """Initialize the importer."""
        # Company context carried across lines: a type-1 record announces the
        # company; the type-2 officer records that follow belong to it.
        self._current_company_number: str = ""
        self._current_company_name: str = ""

    def _parse_officer_line(self, line: str) -> Optional[PersonRecord]:
        """
        Parse a single officer record line per CH specification.

        Returns PersonRecord or None if line is not a valid officer record.
        As a side effect, type-1 (company) lines update the current-company
        context used for subsequent officer lines.

        Fixed-width field positions (1-indexed in spec, 0-indexed here):
        - 0-7: Company Number (8)
        - 8: Record Type (1) - '1'=company, '2'=person
        - 9: App Date Origin (1)
        - 10-11: Appointment Type (2) - 00-19
        - 12-23: Person Number (12)
        - 24: Corporate Indicator (1) - 'Y'=corporate, ' '=individual
        - 25-31: Filler (7)
        - 32-39: Appointment Date (8) - CCYYMMDD
        - 40-47: Resignation Date (8) - CCYYMMDD
        - 48-55: Person Postcode (8)
        - 56-63: Partial DOB (8) - CCYYMM + 2 spaces
        - 64-71: Full DOB (8) - CCYYMMDD
        - 72-75: Variable Data Length (4)
        - 76+: Variable Data (<-delimited, 14 fields)

        Variable data fields (14 total):
        0:TITLE, 1:FORENAMES, 2:SURNAME, 3:HONOURS, 4:CARE_OF, 5:PO_BOX,
        6:ADDRESS1, 7:ADDRESS2, 8:POST_TOWN, 9:COUNTY, 10:COUNTRY,
        11:OCCUPATION, 12:NATIONALITY, 13:USUAL_RESIDENTIAL_COUNTRY
        """
        # NOTE(review): this also drops company (type-1) lines shorter than
        # 76 chars, which would lose company-name context for their officers
        # -- confirm the minimum company-record length against the spec.
        if len(line) < 76:
            return None

        # Extract fixed-width fields
        company_number = line[0:8].strip()
        record_type = line[8:9]

        # Type 1 is company record - extract company name and cache it
        if record_type == "1":
            # Company record: positions 32-35 = officer count, 36-39 = name length, 40+ = name
            name_part = line[40:].split("<")[0].strip()
            self._current_company_number = company_number
            self._current_company_name = name_part
            return None

        # Only process officer records (type 2)
        if record_type != "2":
            return None

        # Position 24: Corporate Indicator - 'Y' = corporate officer, space = individual
        corporate_indicator = line[24:25]
        if corporate_indicator == "Y":
            # Skip corporate officers (companies acting as secretary)
            return None

        # If the officer's company number doesn't match the last type-1
        # record, we have no name for the company; record it as unknown.
        if company_number != self._current_company_number:
            self._current_company_number = company_number
            self._current_company_name = ""

        # Person ID: positions 12-23 (12 chars)
        person_id = line[12:24].strip()

        # Appointment date: positions 32-39 (CCYYMMDD)
        appt_date_raw = line[32:40].strip()
        appointment_date = _parse_date(appt_date_raw)

        # Resignation date: positions 40-47 (CCYYMMDD)
        res_date_raw = line[40:48].strip()
        resignation_date = _parse_date(res_date_raw) if res_date_raw else None

        # Postcode: positions 48-55
        postcode = line[48:56].strip()

        # Partial DOB: positions 56-63 (CCYYMM + 2 spaces)
        partial_dob_raw = line[56:62].strip()
        birth_date = _parse_date(partial_dob_raw)

        # Full DOB: positions 64-71 (CCYYMMDD) - prefer this if available
        full_dob_raw = line[64:72].strip()
        if full_dob_raw:
            full_dob = _parse_date(full_dob_raw)
            if full_dob:
                birth_date = full_dob

        # Variable data starts at position 76 (after the 4-char length field)
        var_start = 76
        if len(line) <= var_start:
            return None

        # Parse `<`-separated fields (14 defined fields)
        var_section = line[var_start:]
        fields = var_section.split("<")

        # Field indices (0-based): 0=TITLE, 1=FORENAMES, 2=SURNAME, ..., 11=OCCUPATION, 12=NATIONALITY, 13=COUNTRY
        forenames = fields[1].strip() if len(fields) > 1 else ""
        surname = fields[2].strip() if len(fields) > 2 else ""

        # Build full name (without title for cleaner data)
        name_parts = []
        if forenames:
            name_parts.append(forenames)
        if surname:
            name_parts.append(surname)

        full_name = _normalize_name(" ".join(name_parts))
        if not full_name or not surname:
            return None

        # Get occupation (field 11), nationality (field 12)
        occupation = fields[11].strip() if len(fields) > 11 else ""
        nationality = fields[12].strip() if len(fields) > 12 else ""

        # Determine role from occupation; CH officers default to "Director"
        role = occupation if occupation else "Director"
        person_type = _map_role_to_person_type(role)

        # Create unique source_id from person_id + company
        source_id = f"{person_id}_{company_number}"

        # Determine if current (no resignation date)
        is_current = resignation_date is None

        # Build record data
        record_data = {
            "person_id": person_id,
            "company_number": company_number,
            "company_name": self._current_company_name,
            "appointment_date": appointment_date,
            "resignation_date": resignation_date,
            "postcode": postcode,
            "occupation": occupation,
            "nationality": nationality,
            "is_current": is_current,
        }

        return PersonRecord(
            name=full_name,
            source="companies_house",
            source_id=source_id,
            country="GB",
            person_type=person_type,
            known_for_role=role,
            known_for_org=self._current_company_name,
            from_date=appointment_date,
            to_date=resignation_date,
            birth_date=birth_date,
            record=record_data,
        )

    def import_from_zip(
        self,
        zip_path: str | Path,
        limit: Optional[int] = None,
        resume: bool = False,
        current_only: bool = True,
        progress_callback: Optional[Callable[[int, int, int], None]] = None,
    ) -> Iterator[PersonRecord]:
        """
        Import officer records from Companies House bulk zip file.

        Args:
            zip_path: Path to the Prod195 zip file
            limit: Optional limit on number of records (a resumed run counts
                previously imported records toward the limit)
            resume: If True, resume from saved progress
            current_only: If True, only import current officers (those with
                no resignation date)
            progress_callback: Optional callback(file_idx, line_num, total)

        Yields:
            PersonRecord for each officer

        Raises:
            FileNotFoundError: If zip_path does not exist.
        """
        zip_path = Path(zip_path)
        if not zip_path.exists():
            raise FileNotFoundError(f"Zip file not found: {zip_path}")

        # Load or initialize progress
        progress = CHProgress.load() if resume else None
        if progress:
            logger.info(f"Resuming from file {progress.file_index}, line {progress.line_number}")
            logger.info(f"Previously imported: {progress.total_imported}")
        else:
            progress = CHProgress()

        count = progress.total_imported

        with zipfile.ZipFile(zip_path, "r") as zf:
            # Get list of .dat files, sorted for deterministic resume order
            dat_files = sorted([n for n in zf.namelist() if n.endswith(".dat")])
            logger.info(f"Found {len(dat_files)} data files in archive")

            for file_idx, filename in enumerate(dat_files):
                # Skip files before resume point
                if file_idx < progress.file_index:
                    continue

                # Fix: previously logged a literal "(unknown)" placeholder
                # instead of the archive member being processed.
                logger.info(f"Processing file {file_idx + 1}/{len(dat_files)}: {filename}")

                start_line = progress.line_number if file_idx == progress.file_index else 0

                with zf.open(filename) as f:
                    for line_num, line_bytes in enumerate(f):
                        # Skip lines before resume point
                        if line_num < start_line:
                            continue

                        if limit and count >= limit:
                            # Checkpoint before stopping so a later resume
                            # picks up exactly here.
                            progress.file_index = file_idx
                            progress.line_number = line_num
                            progress.total_imported = count
                            progress.save()
                            return

                        # errors="replace" means decode cannot raise; the old
                        # try/except around this call was dead code.
                        line = line_bytes.decode("utf-8", errors="replace").rstrip("\n\r")

                        # Skip header
                        if line.startswith("DDDD"):
                            continue

                        # Parse officer record
                        record = self._parse_officer_line(line)
                        if record:
                            # Skip resigned officers if current_only
                            if current_only and record.to_date:
                                continue

                            yield record
                            count += 1

                            # Periodic checkpoint so an interrupted run can
                            # resume close to where it stopped.
                            if count % 10000 == 0:
                                logger.info(f"Imported {count} officers...")
                                progress.file_index = file_idx
                                progress.line_number = line_num
                                progress.total_imported = count
                                progress.last_company = self._current_company_number
                                progress.save()

                        if progress_callback and line_num % 10000 == 0:
                            progress_callback(file_idx, line_num, count)

                # Reset line counter for next file
                progress.line_number = 0

        # Clear progress on successful completion
        CHProgress.clear()
        logger.info(f"Completed CH officers import: {count} total records")

    def import_from_file(
        self,
        file_path: str | Path,
        limit: Optional[int] = None,
        current_only: bool = True,
    ) -> Iterator[PersonRecord]:
        """
        Import from a single uncompressed .dat file (no resume support).

        Args:
            file_path: Path to .dat file
            limit: Optional limit
            current_only: Only current officers (no resignation date)

        Yields:
            PersonRecord for each officer
        """
        file_path = Path(file_path)
        count = 0

        with open(file_path, "r", encoding="utf-8", errors="replace") as f:
            for line in f:
                if limit and count >= limit:
                    break

                line = line.rstrip("\n\r")
                if line.startswith("DDDD"):
                    continue

                record = self._parse_officer_line(line)
                if record:
                    if current_only and record.to_date:
                        continue
                    yield record
                    count += 1

        logger.info(f"Imported {count} officers from {file_path}")
@@ -269,6 +269,15 @@ class GleifImporter:
269
269
  if name_elem.text:
270
270
  other_names.append(name_elem.text)
271
271
 
272
+ # Get registration dates from Registration element
273
+ registration_elem = find_elem(lei_record, 'Registration')
274
+ initial_reg_date = None
275
+ if registration_elem is not None:
276
+ initial_reg_date = find_text(registration_elem, 'InitialRegistrationDate')
277
+ # Extract just the date part (YYYY-MM-DD) from ISO datetime
278
+ if initial_reg_date and "T" in initial_reg_date:
279
+ initial_reg_date = initial_reg_date.split("T")[0]
280
+
272
281
  # Build record
273
282
  name = legal_name.strip()
274
283
  record_data = {
@@ -281,6 +290,8 @@ class GleifImporter:
281
290
  "entity_category": entity_category,
282
291
  "other_names": other_names,
283
292
  }
293
+ if initial_reg_date:
294
+ record_data["initial_registration_date"] = initial_reg_date
284
295
 
285
296
  return CompanyRecord(
286
297
  name=name,
@@ -288,6 +299,7 @@ class GleifImporter:
288
299
  source_id=lei,
289
300
  region=country,
290
301
  entity_type=entity_type,
302
+ from_date=initial_reg_date,
291
303
  record=record_data,
292
304
  )
293
305
 
@@ -362,6 +374,14 @@ class GleifImporter:
362
374
  country = ""
363
375
  city = ""
364
376
 
377
+ # Get registration dates
378
+ initial_reg_date = registration.get("initialRegistrationDate")
379
+ if not initial_reg_date:
380
+ initial_reg_date = registration.get("InitialRegistrationDate")
381
+ # Extract just the date part (YYYY-MM-DD) from ISO datetime
382
+ if initial_reg_date and "T" in initial_reg_date:
383
+ initial_reg_date = initial_reg_date.split("T")[0]
384
+
365
385
  # Build record with relevant data
366
386
  record_data = {
367
387
  "lei": lei,
@@ -373,6 +393,8 @@ class GleifImporter:
373
393
  "entity_category": entity_category,
374
394
  "other_names": other_names,
375
395
  }
396
+ if initial_reg_date:
397
+ record_data["initial_registration_date"] = initial_reg_date
376
398
 
377
399
  return CompanyRecord(
378
400
  name=name,
@@ -380,6 +402,7 @@ class GleifImporter:
380
402
  source_id=lei,
381
403
  region=country,
382
404
  entity_type=entity_type,
405
+ from_date=initial_reg_date,
383
406
  record=record_data,
384
407
  )
385
408