corp-extractor 0.9.0-py3-none-any.whl → 0.9.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.3.dist-info}/METADATA +40 -9
  2. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.3.dist-info}/RECORD +29 -26
  3. statement_extractor/cli.py +866 -77
  4. statement_extractor/database/hub.py +35 -127
  5. statement_extractor/database/importers/__init__.py +10 -2
  6. statement_extractor/database/importers/companies_house.py +16 -2
  7. statement_extractor/database/importers/companies_house_officers.py +431 -0
  8. statement_extractor/database/importers/gleif.py +23 -0
  9. statement_extractor/database/importers/sec_edgar.py +17 -0
  10. statement_extractor/database/importers/sec_form4.py +512 -0
  11. statement_extractor/database/importers/wikidata.py +151 -43
  12. statement_extractor/database/importers/wikidata_dump.py +1951 -0
  13. statement_extractor/database/importers/wikidata_people.py +823 -325
  14. statement_extractor/database/models.py +30 -6
  15. statement_extractor/database/store.py +1485 -60
  16. statement_extractor/document/deduplicator.py +10 -12
  17. statement_extractor/extractor.py +1 -1
  18. statement_extractor/models/__init__.py +3 -2
  19. statement_extractor/models/statement.py +15 -17
  20. statement_extractor/models.py +1 -1
  21. statement_extractor/pipeline/context.py +5 -5
  22. statement_extractor/pipeline/orchestrator.py +12 -12
  23. statement_extractor/plugins/base.py +17 -17
  24. statement_extractor/plugins/extractors/gliner2.py +28 -28
  25. statement_extractor/plugins/qualifiers/embedding_company.py +7 -5
  26. statement_extractor/plugins/qualifiers/person.py +11 -1
  27. statement_extractor/plugins/splitters/t5_gemma.py +35 -39
  28. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.3.dist-info}/WHEEL +0 -0
  29. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.3.dist-info}/entry_points.txt +0 -0
statement_extractor/database/hub.py

@@ -6,10 +6,8 @@ Provides functionality to:
 - Upload/publish database updates
 - Version management for database files
 - Create "lite" versions without full records for smaller downloads
-- Optional gzip compression for reduced file sizes
 """
 
-import gzip
 import logging
 import os
 import shutil
@@ -25,8 +23,6 @@ DEFAULT_REPO_ID = "Corp-o-Rate-Community/entity-references"
 DEFAULT_DB_FILENAME = "entities-lite.db" # Lite is the default (smaller download)
 DEFAULT_DB_FULL_FILENAME = "entities.db"
 DEFAULT_DB_LITE_FILENAME = "entities-lite.db"
-DEFAULT_DB_COMPRESSED_FILENAME = "entities.db.gz"
-DEFAULT_DB_LITE_COMPRESSED_FILENAME = "entities-lite.db.gz"
 
 # Local cache directory
 DEFAULT_CACHE_DIR = Path.home() / ".cache" / "corp-extractor"
@@ -139,7 +135,7 @@ def upload_database(
         commit_message=commit_message,
     )
 
-    logger.info(f"Database uploaded successfully")
+    logger.info("Database uploaded successfully")
     return result
 
 
@@ -189,6 +185,33 @@ def check_for_updates(
     return latest != current_version, latest
 
 
+def vacuum_database(db_path: str | Path) -> None:
+    """
+    VACUUM the database to reclaim space and optimize it.
+
+    Args:
+        db_path: Path to the database file
+    """
+    db_path = Path(db_path)
+    if not db_path.exists():
+        raise FileNotFoundError(f"Database not found: {db_path}")
+
+    original_size = db_path.stat().st_size
+    logger.info(f"Running VACUUM on {db_path} ({original_size / (1024*1024):.1f}MB)")
+
+    # Use isolation_level=None for autocommit (required for VACUUM)
+    conn = sqlite3.connect(str(db_path), isolation_level=None)
+    try:
+        conn.execute("VACUUM")
+    finally:
+        conn.close()
+
+    new_size = db_path.stat().st_size
+    reduction = (1 - new_size / original_size) * 100
+
+    logger.info(f"After VACUUM: {new_size / (1024*1024):.1f}MB (reduced {reduction:.1f}%)")
+
+
 def create_lite_database(
     source_db_path: str | Path,
     output_path: Optional[str | Path] = None,
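
The new vacuum_database() helper opens the connection with isolation_level=None because SQLite's VACUUM must run outside a transaction (autocommit mode). A minimal usage sketch against the module path implied by the file list; the database path is illustrative:

from statement_extractor.database.hub import vacuum_database

# Reclaims free pages and defragments the file in place before publishing.
vacuum_database("entities.db")
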
@@ -248,98 +271,20 @@ def create_lite_database(
     return output_path
 
 
-def compress_database(
-    db_path: str | Path,
-    output_path: Optional[str | Path] = None,
-) -> Path:
-    """
-    Compress a database file using gzip.
-
-    Args:
-        db_path: Path to the database file
-        output_path: Output path for compressed file (default: adds .gz suffix)
-
-    Returns:
-        Path to the compressed file
-    """
-    db_path = Path(db_path)
-    if not db_path.exists():
-        raise FileNotFoundError(f"Database not found: {db_path}")
-
-    if output_path is None:
-        output_path = db_path.with_suffix(db_path.suffix + ".gz")
-    output_path = Path(output_path)
-
-    logger.info(f"Compressing {db_path} to {output_path}")
-
-    with open(db_path, "rb") as f_in:
-        with gzip.open(output_path, "wb", compresslevel=9) as f_out:
-            shutil.copyfileobj(f_in, f_out)
-
-    # Log compression results
-    original_size = db_path.stat().st_size
-    compressed_size = output_path.stat().st_size
-    ratio = (1 - compressed_size / original_size) * 100
-
-    logger.info(f"Original: {original_size / (1024*1024):.1f}MB")
-    logger.info(f"Compressed: {compressed_size / (1024*1024):.1f}MB")
-    logger.info(f"Compression ratio: {ratio:.1f}%")
-
-    return output_path
-
-
-def decompress_database(
-    compressed_path: str | Path,
-    output_path: Optional[str | Path] = None,
-) -> Path:
-    """
-    Decompress a gzipped database file.
-
-    Args:
-        compressed_path: Path to the .gz file
-        output_path: Output path (default: removes .gz suffix)
-
-    Returns:
-        Path to the decompressed file
-    """
-    compressed_path = Path(compressed_path)
-    if not compressed_path.exists():
-        raise FileNotFoundError(f"Compressed file not found: {compressed_path}")
-
-    if output_path is None:
-        if compressed_path.suffix == ".gz":
-            output_path = compressed_path.with_suffix("")
-        else:
-            output_path = compressed_path.with_stem(compressed_path.stem + "-decompressed")
-    output_path = Path(output_path)
-
-    logger.info(f"Decompressing {compressed_path} to {output_path}")
-
-    with gzip.open(compressed_path, "rb") as f_in:
-        with open(output_path, "wb") as f_out:
-            shutil.copyfileobj(f_in, f_out)
-
-    logger.info(f"Decompressed to {output_path}")
-    return output_path
-
-
 def upload_database_with_variants(
     db_path: str | Path,
     repo_id: str = DEFAULT_REPO_ID,
     commit_message: str = "Update entity database",
     token: Optional[str] = None,
     include_lite: bool = True,
-    include_compressed: bool = True,
     include_readme: bool = True,
 ) -> dict[str, str]:
     """
-    Upload entity database with optional lite and compressed variants.
+    Upload entity database with optional lite variant.
 
-    Creates and uploads:
+    First VACUUMs the database, then creates and uploads:
     - entities.db (full database)
     - entities-lite.db (without record data, smaller)
-    - entities.db.gz (compressed full database)
-    - entities-lite.db.gz (compressed lite database)
     - README.md (dataset card from HUGGINGFACE_README.md)
 
     Args:
@@ -348,7 +293,6 @@ def upload_database_with_variants(
         commit_message: Git commit message
         token: HuggingFace API token
         include_lite: Whether to create and upload lite version
-        include_compressed: Whether to create and upload compressed versions
         include_readme: Whether to upload the README.md dataset card
 
     Returns:
@@ -383,6 +327,9 @@ def upload_database_with_variants(
     except Exception as e:
         logger.debug(f"Repo creation note: {e}")
 
+    # VACUUM the database first to optimize it
+    vacuum_database(db_path)
+
     results = {}
 
     # Create temp directory for variants
@@ -399,20 +346,6 @@
             create_lite_database(db_path, lite_path)
             files_to_upload.append((lite_path, DEFAULT_DB_LITE_FILENAME))
 
-        # Compressed versions
-        if include_compressed:
-            # Compress full database
-            compressed_path = temp_path / DEFAULT_DB_COMPRESSED_FILENAME
-            compress_database(db_path, compressed_path)
-            files_to_upload.append((compressed_path, DEFAULT_DB_COMPRESSED_FILENAME))
-
-            # Compress lite database
-            if include_lite:
-                lite_compressed_path = temp_path / DEFAULT_DB_LITE_COMPRESSED_FILENAME
-                lite_path = temp_path / DEFAULT_DB_LITE_FILENAME
-                compress_database(lite_path, lite_compressed_path)
-                files_to_upload.append((lite_compressed_path, DEFAULT_DB_LITE_COMPRESSED_FILENAME))
-
         # Copy all files to a staging directory for upload_folder
         staging_dir = temp_path / "staging"
         staging_dir.mkdir()
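
With the gzip variants removed, the 0.9.3 publish flow is: VACUUM the source database, optionally derive entities-lite.db, stage the files, and upload the folder. A hedged sketch of calling the updated entry point; reading the token from HF_TOKEN is an assumption, any HuggingFace token source works:

import os

from statement_extractor.database.hub import upload_database_with_variants

# Uploads entities.db (plus entities-lite.db and README.md by default);
# vacuum_database() now runs first, as added in this version.
results = upload_database_with_variants(
    "entities.db",
    commit_message="Update entity database",
    token=os.environ.get("HF_TOKEN"),  # assumption: token supplied via env var
    include_lite=True,
    include_readme=True,
)
print(results)  # mapping of uploaded variants, per the dict[str, str] return type
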
@@ -455,7 +388,6 @@ def download_database(
     revision: Optional[str] = None,
     cache_dir: Optional[Path] = None,
     force_download: bool = False,
-    prefer_compressed: bool = True,
 ) -> Path:
     """
     Download entity database from HuggingFace Hub.
@@ -466,10 +398,9 @@
         revision: Git revision (branch, tag, commit) or None for latest
         cache_dir: Local cache directory
         force_download: Force re-download even if cached
-        prefer_compressed: Try to download compressed version first
 
     Returns:
-        Path to the downloaded database file (decompressed if was .gz)
+        Path to the downloaded database file
     """
     try:
         from huggingface_hub import hf_hub_download
@@ -482,34 +413,11 @@
     cache_dir = cache_dir or DEFAULT_CACHE_DIR
     cache_dir.mkdir(parents=True, exist_ok=True)
 
-    # Try compressed version first if preferred
-    download_filename = filename
-    if prefer_compressed and not filename.endswith(".gz"):
-        compressed_filename = filename + ".gz"
-        try:
-            logger.info(f"Trying compressed version: {compressed_filename}")
-            local_path = hf_hub_download(
-                repo_id=repo_id,
-                filename=compressed_filename,
-                revision=revision,
-                cache_dir=str(cache_dir),
-                force_download=force_download,
-                repo_type="dataset",
-            )
-            # Decompress to final location
-            final_path = cache_dir / filename
-            decompress_database(local_path, final_path)
-            logger.info(f"Database downloaded and decompressed to {final_path}")
-            return final_path
-        except Exception as e:
-            logger.debug(f"Compressed version not available: {e}")
-
-    # Download uncompressed version
     logger.info(f"Downloading entity database from {repo_id}...")
 
     local_path = hf_hub_download(
         repo_id=repo_id,
-        filename=download_filename,
+        filename=filename,
         revision=revision,
         cache_dir=str(cache_dir),
         force_download=force_download,
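
On the read side, download_database() no longer probes for a .gz file first; it passes the requested filename straight to hf_hub_download. A sketch of a typical call, assuming repo_id and filename keep the defaults defined by the constants above (those defaults are not visible in this hunk):

from statement_extractor.database.hub import download_database

# Assumes default repo_id/filename arguments (entities-lite.db from the
# Corp-o-Rate-Community/entity-references dataset); only the keyword shown
# in the diff is passed explicitly.
db_path = download_database(force_download=False)
print(f"Entity database cached at {db_path}")
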
statement_extractor/database/importers/__init__.py

@@ -4,21 +4,29 @@ Data importers for the entity database.
 Provides importers for various data sources:
 - GLEIF: Legal Entity Identifier data
 - SEC Edgar: US SEC company data
+- SEC Form 4: US SEC insider ownership data (officers/directors)
 - Companies House: UK company data
-- Wikidata: Wikipedia/Wikidata organization data
-- Wikidata People: Notable people from Wikipedia/Wikidata
+- Wikidata: Wikipedia/Wikidata organization data (SPARQL-based, may timeout)
+- Wikidata People: Notable people from Wikipedia/Wikidata (SPARQL-based, may timeout)
+- Wikidata Dump: Bulk import from Wikidata JSON dump (recommended for large imports)
 """
 
 from .gleif import GleifImporter
 from .sec_edgar import SecEdgarImporter
+from .sec_form4 import SecForm4Importer
 from .companies_house import CompaniesHouseImporter
+from .companies_house_officers import CompaniesHouseOfficersImporter
 from .wikidata import WikidataImporter
 from .wikidata_people import WikidataPeopleImporter
+from .wikidata_dump import WikidataDumpImporter
 
 __all__ = [
     "GleifImporter",
     "SecEdgarImporter",
+    "SecForm4Importer",
     "CompaniesHouseImporter",
+    "CompaniesHouseOfficersImporter",
     "WikidataImporter",
     "WikidataPeopleImporter",
+    "WikidataDumpImporter",
 ]
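
The three new importer classes are re-exported at package level alongside the existing ones, so they can be imported the same way (constructor arguments are not part of this diff, so instantiation is omitted):

from statement_extractor.database.importers import (
    CompaniesHouseOfficersImporter,  # new: UK Companies House officer data
    SecForm4Importer,                # new: SEC Form 4 insider ownership data
    WikidataDumpImporter,            # new: bulk import from a Wikidata JSON dump
)
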
statement_extractor/database/importers/companies_house.py

@@ -342,13 +342,18 @@ class CompaniesHouseImporter:
         raw_company_type = item.get("company_type", "")
         entity_type = _get_entity_type_from_company_type(raw_company_type)
 
+        # Get dates
+        date_of_creation = item.get("date_of_creation")
+        date_of_cessation = item.get("date_of_cessation") # For dissolved companies
+
         # Build record
         record_data = {
             "company_number": company_number,
             "title": title,
             "company_status": company_status,
             "company_type": raw_company_type,
-            "date_of_creation": item.get("date_of_creation"),
+            "date_of_creation": date_of_creation,
+            "date_of_cessation": date_of_cessation,
             "locality": locality,
             "region": region,
             "country": country,
@@ -360,6 +365,8 @@
             source_id=company_number,
             region=country,
             entity_type=entity_type,
+            from_date=date_of_creation,
+            to_date=date_of_cessation,
             record=record_data,
         )
 
@@ -397,12 +404,17 @@
         raw_company_type = row.get("CompanyCategory", "").strip()
         entity_type = _get_entity_type_from_company_type(raw_company_type)
 
+        # Get dates from CSV
+        date_of_creation = row.get("IncorporationDate", "").strip() or None
+        date_of_cessation = row.get("DissolutionDate", "").strip() or None
+
         record_data = {
             "company_number": company_number,
             "title": company_name,
             "company_status": company_status,
             "company_type": raw_company_type,
-            "date_of_creation": row.get("IncorporationDate", "").strip(),
+            "date_of_creation": date_of_creation,
+            "date_of_cessation": date_of_cessation,
             "country": row.get("CountryOfOrigin", "United Kingdom").strip(),
             "sic_code": row.get("SICCode.SicText_1", "").strip(),
         }
@@ -416,6 +428,8 @@
             source_id=company_number,
             region=region,
             entity_type=entity_type,
+            from_date=date_of_creation,
+            to_date=date_of_cessation,
             record=record_data,
         )
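
Both the API and CSV code paths now normalize the creation and cessation dates once and pass them through as from_date/to_date when the entity is built. A small sketch of the CSV-side handling shown above, using a synthetic row; the surrounding entity-construction call is not part of this diff:

import csv
from io import StringIO

# Synthetic Companies House bulk-file excerpt (illustrative values only).
sample = StringIO(
    "CompanyName,CompanyNumber,IncorporationDate,DissolutionDate\n"
    "EXAMPLE LTD,01234567,1999-01-04,2015-06-30\n"
)

for row in csv.DictReader(sample):
    # Empty strings collapse to None, matching the importer's "or None" handling.
    date_of_creation = row.get("IncorporationDate", "").strip() or None
    date_of_cessation = row.get("DissolutionDate", "").strip() or None
    print(date_of_creation, date_of_cessation)  # -> 1999-01-04 2015-06-30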