corp-extractor 0.5.0__py3-none-any.whl → 0.9.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/METADATA +228 -30
  2. corp_extractor-0.9.3.dist-info/RECORD +79 -0
  3. statement_extractor/__init__.py +1 -1
  4. statement_extractor/cli.py +2030 -24
  5. statement_extractor/data/statement_taxonomy.json +6949 -1159
  6. statement_extractor/database/__init__.py +52 -0
  7. statement_extractor/database/embeddings.py +186 -0
  8. statement_extractor/database/hub.py +428 -0
  9. statement_extractor/database/importers/__init__.py +32 -0
  10. statement_extractor/database/importers/companies_house.py +559 -0
  11. statement_extractor/database/importers/companies_house_officers.py +431 -0
  12. statement_extractor/database/importers/gleif.py +561 -0
  13. statement_extractor/database/importers/sec_edgar.py +392 -0
  14. statement_extractor/database/importers/sec_form4.py +512 -0
  15. statement_extractor/database/importers/wikidata.py +1120 -0
  16. statement_extractor/database/importers/wikidata_dump.py +1951 -0
  17. statement_extractor/database/importers/wikidata_people.py +1130 -0
  18. statement_extractor/database/models.py +254 -0
  19. statement_extractor/database/resolver.py +245 -0
  20. statement_extractor/database/store.py +3034 -0
  21. statement_extractor/document/__init__.py +62 -0
  22. statement_extractor/document/chunker.py +410 -0
  23. statement_extractor/document/context.py +171 -0
  24. statement_extractor/document/deduplicator.py +171 -0
  25. statement_extractor/document/html_extractor.py +246 -0
  26. statement_extractor/document/loader.py +303 -0
  27. statement_extractor/document/pipeline.py +388 -0
  28. statement_extractor/document/summarizer.py +195 -0
  29. statement_extractor/extractor.py +1 -1
  30. statement_extractor/models/__init__.py +19 -3
  31. statement_extractor/models/canonical.py +44 -1
  32. statement_extractor/models/document.py +308 -0
  33. statement_extractor/models/labels.py +47 -18
  34. statement_extractor/models/qualifiers.py +51 -3
  35. statement_extractor/models/statement.py +39 -15
  36. statement_extractor/models.py +1 -1
  37. statement_extractor/pipeline/config.py +6 -11
  38. statement_extractor/pipeline/context.py +5 -5
  39. statement_extractor/pipeline/orchestrator.py +90 -121
  40. statement_extractor/pipeline/registry.py +52 -46
  41. statement_extractor/plugins/__init__.py +20 -8
  42. statement_extractor/plugins/base.py +348 -78
  43. statement_extractor/plugins/extractors/gliner2.py +38 -28
  44. statement_extractor/plugins/labelers/taxonomy.py +18 -5
  45. statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
  46. statement_extractor/plugins/pdf/__init__.py +10 -0
  47. statement_extractor/plugins/pdf/pypdf.py +291 -0
  48. statement_extractor/plugins/qualifiers/__init__.py +11 -0
  49. statement_extractor/plugins/qualifiers/companies_house.py +14 -3
  50. statement_extractor/plugins/qualifiers/embedding_company.py +422 -0
  51. statement_extractor/plugins/qualifiers/gleif.py +14 -3
  52. statement_extractor/plugins/qualifiers/person.py +588 -14
  53. statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
  54. statement_extractor/plugins/scrapers/__init__.py +10 -0
  55. statement_extractor/plugins/scrapers/http.py +236 -0
  56. statement_extractor/plugins/splitters/t5_gemma.py +176 -75
  57. statement_extractor/plugins/taxonomy/embedding.py +193 -46
  58. statement_extractor/plugins/taxonomy/mnli.py +16 -4
  59. statement_extractor/scoring.py +8 -8
  60. corp_extractor-0.5.0.dist-info/RECORD +0 -55
  61. statement_extractor/plugins/canonicalizers/__init__.py +0 -17
  62. statement_extractor/plugins/canonicalizers/base.py +0 -9
  63. statement_extractor/plugins/canonicalizers/location.py +0 -219
  64. statement_extractor/plugins/canonicalizers/organization.py +0 -230
  65. statement_extractor/plugins/canonicalizers/person.py +0 -242
  66. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/WHEEL +0 -0
  67. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/entry_points.txt +0 -0
statement_extractor/database/importers/companies_house.py (new file)
@@ -0,0 +1,559 @@
+"""
+Companies House importer for the company database.
+
+Imports UK company data from the Companies House API
+into the embedding database for company name matching.
+
+Note: The Companies House API requires a free API key for bulk access.
+Register at: https://developer.company-information.service.gov.uk/
+"""
+
+import base64
+import json
+import logging
+import os
+import time
+import urllib.error
+import urllib.parse
+import urllib.request
+from pathlib import Path
+from typing import Any, Iterator, Optional
+
+from ..models import CompanyRecord, EntityType
+
+logger = logging.getLogger(__name__)
+
+# Companies House API endpoints
+CH_API_BASE = "https://api.company-information.service.gov.uk"
+CH_SEARCH_URL = f"{CH_API_BASE}/search/companies"
+CH_COMPANY_URL = f"{CH_API_BASE}/company"
+
+# Bulk data download URL
+CH_BULK_DATA_URL = "https://download.companieshouse.gov.uk/BasicCompanyDataAsOneFile-{date}.zip"
+CH_BULK_DATA_PAGE = "https://download.companieshouse.gov.uk/en_output.html"
+
+# Company status prefixes to include (active companies)
+ACTIVE_STATUS_PREFIXES = ("active", "open", "live")
+
+# Company type prefixes that are operating companies (matched case-insensitively via startswith)
+# Values from bulk CSV data - using prefix matching to handle truncated values
+COMPANY_TYPE_PREFIXES = (
+    # Limited companies (most common - ~4.8M records)
+    "private limited company",
+    "public limited company",
+    "private unlimited company",
+    # Limited by guarantee (values get truncated in CSV due to commas)
+    "pri/ltd by guar",
+    "pri/lbg/nsc",
+    # LLPs (~46K)
+    "limited liability partnership",
+    # Community interest companies (~38K)
+    "community interest company",
+    # Charitable incorporated organisations (~45K combined)
+    "charitable incorporated organisation",
+    "scottish charitable incorporated organisation",
+    # Registered societies (~10K)
+    "registered society",
+    # Overseas entities (~18K)
+    "overseas entity",
+    # Other
+    "other company type",
+    "royal charter",
+    "old public company",
+    # Note: Excluded - "Limited Partnership" (often used for funds)
+)
+
+# Mapping from company_type prefixes to EntityType
+# Uses prefix matching since CSV values can be truncated
+COMPANY_TYPE_TO_ENTITY_TYPE: list[tuple[str, EntityType]] = [
+    # Charitable/non-profit (check these first - more specific)
+    ("charitable incorporated organisation", EntityType.NONPROFIT),
+    ("scottish charitable incorporated organisation", EntityType.NONPROFIT),
+    ("community interest company", EntityType.NONPROFIT),
+    ("pri/ltd by guar", EntityType.NONPROFIT),  # Limited by guarantee - often charities
+    ("pri/lbg/nsc", EntityType.NONPROFIT),  # Limited by guarantee, no share capital
+    ("registered society", EntityType.NONPROFIT),  # Co-ops, friendly societies
+
+    # Business entities (default for most)
+    ("private limited company", EntityType.BUSINESS),
+    ("public limited company", EntityType.BUSINESS),
+    ("private unlimited company", EntityType.BUSINESS),
+    ("limited liability partnership", EntityType.BUSINESS),
+    ("overseas entity", EntityType.BUSINESS),
+    ("old public company", EntityType.BUSINESS),
+    ("royal charter", EntityType.BUSINESS),  # Could be various, default to business
+    ("other company type", EntityType.UNKNOWN),
+]
+
+
+def _get_entity_type_from_company_type(company_type: str) -> EntityType:
+    """Determine EntityType from a Companies House company_type."""
+    company_type_lower = company_type.lower().strip()
+    for prefix, entity_type in COMPANY_TYPE_TO_ENTITY_TYPE:
+        if company_type_lower.startswith(prefix):
+            return entity_type
+    return EntityType.BUSINESS  # Default to business for unmatched types
+
+
+class CompaniesHouseImporter:
+    """
+    Importer for UK Companies House data.
+
+    Uses the Companies House API to fetch company records.
+    Requires an API key for bulk access.
+
+    Get a free API key at:
+    https://developer.company-information.service.gov.uk/
+    """
+
+    def __init__(
+        self,
+        api_key: Optional[str] = None,
+        active_only: bool = True,
+        delay_seconds: float = 0.6,  # API rate limit is 600 requests per 5 minutes
+    ):
+        """
+        Initialize the Companies House importer.
+
+        Args:
+            api_key: Companies House API key (or set COMPANIES_HOUSE_API_KEY env var)
+            active_only: Only import active companies (default True)
+            delay_seconds: Delay between requests to respect rate limits
+        """
+        self._api_key = api_key or os.environ.get("COMPANIES_HOUSE_API_KEY")
+        self._active_only = active_only
+        self._delay = delay_seconds
+
+        if not self._api_key:
+            logger.warning(
+                "No Companies House API key provided. "
+                "Set COMPANIES_HOUSE_API_KEY env var or pass api_key parameter. "
+                "Get a free key at: https://developer.company-information.service.gov.uk/"
+            )
+
+    def import_from_search(
+        self,
+        search_terms: list[str],
+        limit_per_term: int = 100,
+        total_limit: Optional[int] = None,
+    ) -> Iterator[CompanyRecord]:
+        """
+        Import companies by searching for specific terms.
+
+        This is useful for targeted imports, since the API doesn't support
+        bulk enumeration without search terms.
+
+        Args:
+            search_terms: List of search terms (e.g., ["bank", "insurance", "energy"])
+            limit_per_term: Max results per search term
+            total_limit: Optional total limit across all terms
+
+        Yields:
+            CompanyRecord for each company
+        """
+        if not self._api_key:
+            raise ValueError(
+                "Companies House API key required. "
+                "Set COMPANIES_HOUSE_API_KEY env var."
+            )
+
+        logger.info(f"Starting Companies House import for {len(search_terms)} search terms...")
+
+        total_count = 0
+        seen_ids = set()
+
+        for term in search_terms:
+            if total_limit and total_count >= total_limit:
+                break
+
+            logger.info(f"Searching Companies House for '{term}'...")
+
+            try:
+                for record in self._search_companies(term, limit_per_term):
+                    if total_limit and total_count >= total_limit:
+                        break
+
+                    if record.source_id not in seen_ids:
+                        seen_ids.add(record.source_id)
+                        total_count += 1
+                        yield record
+
+                        if total_count % 100 == 0:
+                            logger.info(f"Imported {total_count} Companies House records")
+
+            except Exception as e:
+                logger.error(f"Failed to search for '{term}': {e}")
+                continue
+
+            # Rate limiting
+            time.sleep(self._delay)
+
+        logger.info(f"Completed Companies House import: {total_count} records")
+
+    def import_from_file(
+        self,
+        file_path: str | Path,
+        limit: Optional[int] = None,
+    ) -> Iterator[CompanyRecord]:
+        """
+        Import from a local Companies House data file.
+
+        Companies House publishes bulk data products in CSV format (the basic
+        company data file is a free download; see download_bulk_data).
+        This method can parse those files.
+
+        Args:
+            file_path: Path to Companies House CSV/JSON file
+            limit: Optional limit on records
+
+        Yields:
+            CompanyRecord for each company
+        """
+        import csv
+
+        file_path = Path(file_path)
+
+        if not file_path.exists():
+            raise FileNotFoundError(f"Companies House file not found: {file_path}")
+
+        logger.info(f"Importing Companies House data from {file_path}")
+
+        count = 0
+
+        if file_path.suffix.lower() == ".csv":
+            with open(file_path, "r", encoding="utf-8-sig") as f:
+                reader = csv.DictReader(f)
+                for row in reader:
+                    if limit and count >= limit:
+                        break
+
+                    record = self._parse_csv_row(row)
+                    if record:
+                        count += 1
+                        yield record
+
+                        if count % 10000 == 0:
+                            logger.info(f"Imported {count} Companies House records")
+
+        elif file_path.suffix.lower() == ".json":
+            with open(file_path, "r", encoding="utf-8") as f:
+                data = json.load(f)
+
+            items = data if isinstance(data, list) else data.get("items", [])
+            for item in items:
+                if limit and count >= limit:
+                    break
+
+                record = self._parse_api_response(item)
+                if record:
+                    count += 1
+                    yield record
+
+        logger.info(f"Completed Companies House import: {count} records")
+
+    def _search_companies(
+        self,
+        query: str,
+        limit: int = 100,
+    ) -> Iterator[CompanyRecord]:
+        """Search for companies via the API."""
+        items_per_page = min(100, limit)  # API max is 100
+        start_index = 0
+        fetched = 0
+
+        while fetched < limit:
+            params = urllib.parse.urlencode({
+                "q": query,
+                "items_per_page": items_per_page,
+                "start_index": start_index,
+            })
+
+            url = f"{CH_SEARCH_URL}?{params}"
+            data = self._api_request(url)
+
+            items = data.get("items", [])
+            if not items:
+                break
+
+            for item in items:
+                if fetched >= limit:
+                    break
+
+                record = self._parse_api_response(item)
+                if record:
+                    fetched += 1
+                    yield record
+
+            # Check if more results available
+            total_results = data.get("total_results", 0)
+            start_index += items_per_page
+
+            if start_index >= total_results or start_index >= 400:  # API limit
+                break
+
+            time.sleep(self._delay)
+
+    def _api_request(self, url: str) -> dict[str, Any]:
+        """Make an authenticated API request."""
+        # Companies House uses HTTP Basic Auth with the API key as the username
+        auth_string = base64.b64encode(f"{self._api_key}:".encode()).decode()
+
+        req = urllib.request.Request(
+            url,
+            headers={
+                "Authorization": f"Basic {auth_string}",
+                "Accept": "application/json",
+            }
+        )
+
+        with urllib.request.urlopen(req, timeout=30) as response:
+            return json.loads(response.read().decode("utf-8"))
+
+    def _parse_api_response(self, item: dict[str, Any]) -> Optional[CompanyRecord]:
+        """Parse an API response item into a CompanyRecord."""
+        try:
+            company_number = item.get("company_number")
+            title = item.get("title") or item.get("company_name", "")
+            company_status = item.get("company_status", "").lower()
+            company_type = item.get("company_type", "").lower()
+
+            if not company_number or not title:
+                return None
+
+            # Filter by status if configured
+            if self._active_only and not company_status.startswith(ACTIVE_STATUS_PREFIXES):
+                return None
+
+            # Filter to only include actual companies (not sole traders, individuals, etc.)
+            if company_type and not company_type.startswith(COMPANY_TYPE_PREFIXES):
+                return None
+
+            # Get address info
+            address = item.get("registered_office_address") or item.get("address", {})
+            if isinstance(address, dict):
+                locality = address.get("locality", "")
+                region = address.get("region", "")
+                country = address.get("country", "United Kingdom")
+            else:
+                locality = ""
+                region = ""
+                country = "United Kingdom"
+
+            # Determine entity type from company_type
+            raw_company_type = item.get("company_type", "")
+            entity_type = _get_entity_type_from_company_type(raw_company_type)
+
+            # Get dates
+            date_of_creation = item.get("date_of_creation")
+            date_of_cessation = item.get("date_of_cessation")  # For dissolved companies
+
+            # Build record
+            record_data = {
+                "company_number": company_number,
+                "title": title,
+                "company_status": company_status,
+                "company_type": raw_company_type,
+                "date_of_creation": date_of_creation,
+                "date_of_cessation": date_of_cessation,
+                "locality": locality,
+                "region": region,
+                "country": country,
+            }
+
+            return CompanyRecord(
+                name=title.strip(),
+                source="companies_house",
+                source_id=company_number,
+                region=country,
+                entity_type=entity_type,
+                from_date=date_of_creation,
+                to_date=date_of_cessation,
+                record=record_data,
+            )
+
+        except Exception as e:
+            logger.debug(f"Failed to parse Companies House item: {e}")
+            return None
+
+    def _parse_csv_row(self, row: dict[str, Any]) -> Optional[CompanyRecord]:
+        """Parse a CSV row from a bulk data file."""
+        try:
+            # Normalize column names (strip whitespace from keys)
+            row = {k.strip(): v for k, v in row.items()}
+
+            # Companies House CSV column names
+            company_number = row.get("CompanyNumber") or row.get("company_number", "")
+            company_name = row.get("CompanyName") or row.get("company_name", "")
+            company_status = (row.get("CompanyStatus") or row.get("company_status", "")).lower().strip()
+            company_type = (row.get("CompanyCategory") or row.get("company_type", "")).lower().strip()
+
+            if not company_number or not company_name:
+                return None
+
+            # Strip whitespace from values too
+            company_number = company_number.strip()
+            company_name = company_name.strip()
+
+            if self._active_only and not company_status.startswith(ACTIVE_STATUS_PREFIXES):
+                return None
+
+            # Filter to only include actual companies (not sole traders, individuals, etc.)
+            if company_type and not company_type.startswith(COMPANY_TYPE_PREFIXES):
+                return None
+
+            # Determine entity type from company_type
+            raw_company_type = row.get("CompanyCategory", "").strip()
+            entity_type = _get_entity_type_from_company_type(raw_company_type)
+
+            # Get dates from CSV
+            date_of_creation = row.get("IncorporationDate", "").strip() or None
+            date_of_cessation = row.get("DissolutionDate", "").strip() or None
+
+            record_data = {
+                "company_number": company_number,
+                "title": company_name,
+                "company_status": company_status,
+                "company_type": raw_company_type,
+                "date_of_creation": date_of_creation,
+                "date_of_cessation": date_of_cessation,
+                "country": row.get("CountryOfOrigin", "United Kingdom").strip(),
+                "sic_code": row.get("SICCode.SicText_1", "").strip(),
+            }
+
+            # Use CountryOfOrigin for region
+            region = row.get("CountryOfOrigin", "United Kingdom").strip()
+
+            return CompanyRecord(
+                name=company_name,
+                source="companies_house",
+                source_id=company_number,
+                region=region,
+                entity_type=entity_type,
+                from_date=date_of_creation,
+                to_date=date_of_cessation,
+                record=record_data,
+            )
+
+        except Exception as e:
+            logger.debug(f"Failed to parse CSV row: {e}")
+            return None
+
+    def get_company(self, company_number: str) -> Optional[CompanyRecord]:
+        """
+        Fetch a specific company by number.
+
+        Args:
+            company_number: UK company registration number
+
+        Returns:
+            CompanyRecord or None if not found
+        """
+        if not self._api_key:
+            raise ValueError("Companies House API key required")
+
+        try:
+            url = f"{CH_COMPANY_URL}/{company_number}"
+            data = self._api_request(url)
+            return self._parse_api_response(data)
+        except urllib.error.HTTPError as e:
+            if e.code == 404:
+                return None
+            raise
+
+    def download_bulk_data(
+        self,
+        output_path: Optional[Path] = None,
+        force: bool = False,
+    ) -> Path:
+        """
+        Download the bulk company data file from Companies House.
+
+        This is a free download containing all active UK companies.
+        No API key required.
+
+        Args:
+            output_path: Where to save the CSV file (default: temp directory)
+            force: Force re-download even if cached
+
+        Returns:
+            Path to the extracted CSV file
+        """
+        import re
+        import shutil
+        import tempfile
+        import zipfile
+
+        # Find the latest file date from the download page
+        logger.info("Checking for latest Companies House bulk data...")
+
+        req = urllib.request.Request(
+            CH_BULK_DATA_PAGE,
+            headers={"User-Agent": "corp-extractor/1.0"}
+        )
+        with urllib.request.urlopen(req, timeout=30) as response:
+            html = response.read().decode("utf-8")
+
+        # Find the filename pattern: BasicCompanyDataAsOneFile-YYYY-MM-DD.zip
+        match = re.search(r'BasicCompanyDataAsOneFile-(\d{4}-\d{2}-\d{2})\.zip', html)
+        if not match:
+            raise RuntimeError("Could not find bulk data file on Companies House page")
+
+        file_date = match.group(1)
+        download_url = CH_BULK_DATA_URL.format(date=file_date)
+
+        # Set up output directory
+        if output_path is None:
+            output_dir = Path(tempfile.gettempdir()) / "companies_house"
+            output_dir.mkdir(parents=True, exist_ok=True)
+            output_path = output_dir / "BasicCompanyData.csv"
+        else:
+            output_dir = output_path.parent
+
+        # Check for cached version
+        metadata_path = output_dir / "ch_metadata.json"
+        if not force and output_path.exists() and metadata_path.exists():
+            try:
+                with open(metadata_path, "r") as f:
+                    cached_metadata = json.load(f)
+                if cached_metadata.get("file_date") == file_date:
+                    logger.info(f"Using cached Companies House data (date: {file_date})")
+                    return output_path
+            except (json.JSONDecodeError, IOError):
+                pass
+
+        # Download the ZIP file
+        logger.info(f"Downloading Companies House bulk data ({file_date})...")
+        zip_path = output_path.with_suffix(".zip")
+
+        req = urllib.request.Request(
+            download_url,
+            headers={"User-Agent": "corp-extractor/1.0"}
+        )
+        with urllib.request.urlopen(req) as response:
+            with open(zip_path, "wb") as f:
+                shutil.copyfileobj(response, f)
+
+        # Extract CSV from ZIP
+        logger.info("Extracting CSV file...")
+        with zipfile.ZipFile(zip_path, "r") as zf:
+            csv_files = [n for n in zf.namelist() if n.endswith(".csv")]
+            if not csv_files:
+                raise RuntimeError("No CSV file found in ZIP archive")
+
+            # Extract the first (usually only) CSV file
+            with zf.open(csv_files[0]) as src, open(output_path, "wb") as dst:
+                shutil.copyfileobj(src, dst)
+
+        # Clean up ZIP
+        zip_path.unlink()
+
+        # Save metadata
+        metadata = {
+            "file_date": file_date,
+            "downloaded_at": str(output_path.stat().st_mtime),
+            "output_path": str(output_path),
+        }
+        with open(metadata_path, "w") as f:
+            json.dump(metadata, f)
+
+        logger.info(f"Downloaded Companies House data to {output_path}")
+        return output_path
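
The importer above exposes two ingestion paths: a keyed REST path (import_from_search / get_company) and a free bulk path (download_bulk_data followed by import_from_file). A minimal usage sketch follows; the CompaniesHouseImporter API is taken from the diff above, while the surrounding driver script (paths, limits, and what is done with each CompanyRecord) is illustrative only and not part of the package:

    # sketch.py - illustrative driver for the new importer (assumed usage, not shipped code)
    from statement_extractor.database.importers.companies_house import CompaniesHouseImporter

    importer = CompaniesHouseImporter(active_only=True)

    # Free path: resolve the latest BasicCompanyDataAsOneFile ZIP, extract
    # the CSV, then stream filtered CompanyRecord objects out of it.
    csv_path = importer.download_bulk_data()
    for record in importer.import_from_file(csv_path, limit=1000):
        print(record.source_id, record.name, record.entity_type)

    # Keyed path: targeted search through the REST API. Requires
    # COMPANIES_HOUSE_API_KEY (or api_key=...); throttled by delay_seconds.
    api_importer = CompaniesHouseImporter(delay_seconds=0.6)
    for record in api_importer.import_from_search(["bank", "insurance"], limit_per_term=50):
        print(record.source_id, record.name)

Both paths funnel through the same prefix filters (ACTIVE_STATUS_PREFIXES and COMPANY_TYPE_PREFIXES), so sole traders and limited partnerships are dropped before a CompanyRecord is ever built.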