corp-extractor 0.9.0__py3-none-any.whl → 0.9.3__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their public registries.
Files changed (29)
  1. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.3.dist-info}/METADATA +40 -9
  2. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.3.dist-info}/RECORD +29 -26
  3. statement_extractor/cli.py +866 -77
  4. statement_extractor/database/hub.py +35 -127
  5. statement_extractor/database/importers/__init__.py +10 -2
  6. statement_extractor/database/importers/companies_house.py +16 -2
  7. statement_extractor/database/importers/companies_house_officers.py +431 -0
  8. statement_extractor/database/importers/gleif.py +23 -0
  9. statement_extractor/database/importers/sec_edgar.py +17 -0
  10. statement_extractor/database/importers/sec_form4.py +512 -0
  11. statement_extractor/database/importers/wikidata.py +151 -43
  12. statement_extractor/database/importers/wikidata_dump.py +1951 -0
  13. statement_extractor/database/importers/wikidata_people.py +823 -325
  14. statement_extractor/database/models.py +30 -6
  15. statement_extractor/database/store.py +1485 -60
  16. statement_extractor/document/deduplicator.py +10 -12
  17. statement_extractor/extractor.py +1 -1
  18. statement_extractor/models/__init__.py +3 -2
  19. statement_extractor/models/statement.py +15 -17
  20. statement_extractor/models.py +1 -1
  21. statement_extractor/pipeline/context.py +5 -5
  22. statement_extractor/pipeline/orchestrator.py +12 -12
  23. statement_extractor/plugins/base.py +17 -17
  24. statement_extractor/plugins/extractors/gliner2.py +28 -28
  25. statement_extractor/plugins/qualifiers/embedding_company.py +7 -5
  26. statement_extractor/plugins/qualifiers/person.py +11 -1
  27. statement_extractor/plugins/splitters/t5_gemma.py +35 -39
  28. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.3.dist-info}/WHEEL +0 -0
  29. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.3.dist-info}/entry_points.txt +0 -0
@@ -439,7 +439,7 @@ def _print_pipeline_json(ctx):
439
439
  """Print pipeline results as JSON."""
440
440
  output = {
441
441
  "statement_count": ctx.statement_count,
442
- "raw_triples": [t.model_dump() for t in ctx.raw_triples],
442
+ "split_sentences": [s.model_dump() for s in ctx.split_sentences],
443
443
  "statements": [s.model_dump() for s in ctx.statements],
444
444
  "labeled_statements": [stmt.as_dict() for stmt in ctx.labeled_statements],
445
445
  "timings": ctx.stage_timings,
@@ -472,9 +472,10 @@ def _print_pipeline_triples(ctx):
472
472
  elif ctx.statements:
473
473
  for stmt in ctx.statements:
474
474
  click.echo(f"{stmt.subject.text}\t{stmt.predicate}\t{stmt.object.text}")
475
- elif ctx.raw_triples:
476
- for triple in ctx.raw_triples:
477
- click.echo(f"{triple.subject_text}\t{triple.predicate_text}\t{triple.object_text}")
475
+ elif ctx.split_sentences:
476
+ # Stage 1 only output - just show the split sentences (no triples yet)
477
+ for sentence in ctx.split_sentences:
478
+ click.echo(sentence.text)
478
479
 
479
480
 
480
481
  def _print_pipeline_table(ctx, verbose: bool):
@@ -528,20 +529,16 @@ def _print_pipeline_table(ctx, verbose: bool):
528
529
 
529
530
  click.echo("-" * 80)
530
531
 
531
- elif ctx.raw_triples:
532
- click.echo(f"\nExtracted {len(ctx.raw_triples)} raw triple(s):\n")
532
+ elif ctx.split_sentences:
533
+ click.echo(f"\nSplit into {len(ctx.split_sentences)} atomic sentence(s):\n")
533
534
  click.echo("-" * 80)
534
535
 
535
- for i, triple in enumerate(ctx.raw_triples, 1):
536
- click.echo(f"{i}. {triple.subject_text}")
537
- click.echo(f" --[{triple.predicate_text}]-->")
538
- click.echo(f" {triple.object_text}")
536
+ for i, sentence in enumerate(ctx.split_sentences, 1):
537
+ text_preview = sentence.text[:100] + "..." if len(sentence.text) > 100 else sentence.text
538
+ click.echo(f"{i}. {text_preview}")
539
539
 
540
540
  if verbose:
541
- click.echo(f" Confidence: {triple.confidence:.2f}")
542
- if triple.source_sentence:
543
- source = triple.source_sentence[:60] + "..." if len(triple.source_sentence) > 60 else triple.source_sentence
544
- click.echo(f" Source: \"{source}\"")
541
+ click.echo(f" Confidence: {sentence.confidence:.2f}")
545
542
 
546
543
  click.echo("-" * 80)
547
544
 
@@ -666,22 +663,27 @@ def db_cmd():
666
663
  Commands:
667
664
  import-gleif Import GLEIF LEI data (~3M records)
668
665
  import-sec Import SEC Edgar bulk data (~100K+ filers)
666
+ import-sec-officers Import SEC Form 4 officers/directors
667
+ import-ch-officers Import UK Companies House officers (Prod195)
669
668
  import-companies-house Import UK Companies House (~5M records)
670
- import-wikidata Import Wikidata organizations
671
- import-people Import Wikidata notable people
669
+ import-wikidata Import Wikidata organizations (SPARQL, may timeout)
670
+ import-people Import Wikidata notable people (SPARQL, may timeout)
671
+ import-wikidata-dump Import from Wikidata JSON dump (recommended)
672
+ canonicalize Link equivalent records across sources
672
673
  status Show database status
673
674
  search Search for an organization
674
675
  search-people Search for a person
675
676
  download Download database from HuggingFace
676
- upload Upload database with lite/compressed variants
677
+ upload Upload database with lite variant
677
678
  create-lite Create lite version (no record data)
678
- compress Compress database with gzip
679
679
 
680
680
  \b
681
681
  Examples:
682
682
  corp-extractor db import-sec --download
683
+ corp-extractor db import-sec-officers --start-year 2023 --limit 10000
683
684
  corp-extractor db import-gleif --download --limit 100000
684
- corp-extractor db import-people --all --limit 10000
685
+ corp-extractor db import-wikidata-dump --download --limit 50000
686
+ corp-extractor db canonicalize
685
687
  corp-extractor db status
686
688
  corp-extractor db search "Apple Inc"
687
689
  corp-extractor db search-people "Tim Cook"
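The new db sub-commands can also be exercised from Python rather than the shell, for example in a smoke test. A minimal sketch using click's test runner, assuming the top-level group in statement_extractor/cli.py is named cli (the actual name is not shown in this diff):

from click.testing import CliRunner
from statement_extractor.cli import cli  # assumed name of the root click group

runner = CliRunner()
result = runner.invoke(cli, ["db", "status"])
print(result.exit_code)
print(result.output)
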
@@ -868,6 +870,213 @@ def db_import_sec(download: bool, file_path: Optional[str], db_path: Optional[st
868
870
  database.close()
869
871
 
870
872
 
873
+ @db_cmd.command("import-sec-officers")
874
+ @click.option("--db", "db_path", type=click.Path(), help="Database path")
875
+ @click.option("--start-year", type=int, default=2020, help="Start year (default: 2020)")
876
+ @click.option("--end-year", type=int, help="End year (default: current year)")
877
+ @click.option("--limit", type=int, help="Limit number of records")
878
+ @click.option("--batch-size", type=int, default=1000, help="Batch size for commits (default: 1000)")
879
+ @click.option("--resume", is_flag=True, help="Resume from saved progress")
880
+ @click.option("--skip-existing", is_flag=True, help="Skip records that already exist")
881
+ @click.option("-v", "--verbose", is_flag=True, help="Verbose output")
882
+ def db_import_sec_officers(db_path: Optional[str], start_year: int, end_year: Optional[int], limit: Optional[int], batch_size: int, resume: bool, skip_existing: bool, verbose: bool):
883
+ """
884
+ Import SEC Form 4 insider data into the people database.
885
+
886
+ Downloads Form 4 filings from SEC EDGAR and extracts officers, directors,
887
+ and significant investors (10%+ owners) from each company.
888
+
889
+ Form 4 filings are submitted when insiders buy or sell company stock.
890
+ They contain the person's name, role (officer/director), and company.
891
+
892
+ Rate limited to 5 requests/second to comply with SEC guidelines.
893
+
894
+ \b
895
+ Examples:
896
+ corp-extractor db import-sec-officers --limit 1000
897
+ corp-extractor db import-sec-officers --start-year 2023
898
+ corp-extractor db import-sec-officers --resume
899
+ corp-extractor db import-sec-officers --skip-existing -v
900
+ """
901
+ _configure_logging(verbose)
902
+
903
+ from .database.store import get_person_database, get_database, DEFAULT_DB_PATH
904
+ from .database.embeddings import CompanyEmbedder
905
+ from .database.importers.sec_form4 import SecForm4Importer
906
+
907
+ # Default database path
908
+ if db_path is None:
909
+ db_path_obj = DEFAULT_DB_PATH
910
+ else:
911
+ db_path_obj = Path(db_path)
912
+
913
+ click.echo(f"Importing SEC Form 4 officers/directors to {db_path_obj}...", err=True)
914
+ click.echo(f"Year range: {start_year} - {end_year or 'current'}", err=True)
915
+ if resume:
916
+ click.echo("Resuming from saved progress...", err=True)
917
+
918
+ # Initialize components
919
+ database = get_person_database(db_path=db_path_obj)
920
+ org_database = get_database(db_path=db_path_obj)
921
+ embedder = CompanyEmbedder()
922
+ importer = SecForm4Importer()
923
+
924
+ # Import records in batches
925
+ records = []
926
+ count = 0
927
+ skipped_existing = 0
928
+
929
+ def progress_callback(year: int, quarter: int, filing_idx: int, accession: str, total: int) -> None:
930
+ if verbose and filing_idx % 100 == 0:
931
+ click.echo(f" {year} Q{quarter}: {filing_idx} filings, {total} records", err=True)
932
+
933
+ for record in importer.import_range(
934
+ start_year=start_year,
935
+ end_year=end_year,
936
+ limit=limit,
937
+ resume=resume,
938
+ progress_callback=progress_callback,
939
+ ):
940
+ # Skip existing records if flag is set
941
+ if skip_existing:
942
+ existing = database.get_by_source_id(record.source, record.source_id)
943
+ if existing is not None:
944
+ skipped_existing += 1
945
+ continue
946
+
947
+ # Look up org ID by CIK if available
948
+ issuer_cik = record.record.get("issuer_cik", "")
949
+ if issuer_cik:
950
+ org_id = org_database.get_id_by_source_id("sec_edgar", issuer_cik.zfill(10))
951
+ if org_id is not None:
952
+ record.known_for_org_id = org_id
953
+
954
+ records.append(record)
955
+
956
+ if len(records) >= batch_size:
957
+ embedding_texts = [r.get_embedding_text() for r in records]
958
+ embeddings = embedder.embed_batch(embedding_texts)
959
+ database.insert_batch(records, embeddings)
960
+ count += len(records)
961
+ click.echo(f"Imported {count} records...", err=True)
962
+ records = []
963
+
964
+ # Final batch
965
+ if records:
966
+ embedding_texts = [r.get_embedding_text() for r in records]
967
+ embeddings = embedder.embed_batch(embedding_texts)
968
+ database.insert_batch(records, embeddings)
969
+ count += len(records)
970
+
971
+ if skip_existing and skipped_existing > 0:
972
+ click.echo(f"\nImported {count} SEC officers/directors (skipped {skipped_existing} existing).", err=True)
973
+ else:
974
+ click.echo(f"\nImported {count} SEC officers/directors successfully.", err=True)
975
+
976
+ org_database.close()
977
+ database.close()
978
+
979
+
980
+ @db_cmd.command("import-ch-officers")
981
+ @click.option("--file", "file_path", type=click.Path(exists=True), required=True, help="Path to CH officers zip file (Prod195)")
982
+ @click.option("--db", "db_path", type=click.Path(), help="Database path")
983
+ @click.option("--limit", type=int, help="Limit number of records")
984
+ @click.option("--batch-size", type=int, default=1000, help="Batch size for commits (default: 1000)")
985
+ @click.option("--resume", is_flag=True, help="Resume from saved progress")
986
+ @click.option("--include-resigned", is_flag=True, help="Include resigned officers (default: current only)")
987
+ @click.option("--skip-existing", is_flag=True, help="Skip records that already exist")
988
+ @click.option("-v", "--verbose", is_flag=True, help="Verbose output")
989
+ def db_import_ch_officers(file_path: str, db_path: Optional[str], limit: Optional[int], batch_size: int, resume: bool, include_resigned: bool, skip_existing: bool, verbose: bool):
990
+ """
991
+ Import Companies House officers data into the people database.
992
+
993
+ Requires the Prod195 bulk officers zip file from Companies House.
994
+ Request access via BulkProducts@companieshouse.gov.uk.
995
+
996
+ \b
997
+ Examples:
998
+ corp-extractor db import-ch-officers --file officers.zip --limit 10000
999
+ corp-extractor db import-ch-officers --file officers.zip --resume
1000
+ corp-extractor db import-ch-officers --file officers.zip --include-resigned
1001
+ """
1002
+ _configure_logging(verbose)
1003
+
1004
+ from .database.store import get_person_database, get_database, DEFAULT_DB_PATH
1005
+ from .database.embeddings import CompanyEmbedder
1006
+ from .database.importers.companies_house_officers import CompaniesHouseOfficersImporter
1007
+
1008
+ # Default database path
1009
+ if db_path is None:
1010
+ db_path_obj = DEFAULT_DB_PATH
1011
+ else:
1012
+ db_path_obj = Path(db_path)
1013
+
1014
+ click.echo(f"Importing Companies House officers to {db_path_obj}...", err=True)
1015
+ if resume:
1016
+ click.echo("Resuming from saved progress...", err=True)
1017
+
1018
+ # Initialize components
1019
+ database = get_person_database(db_path=db_path_obj)
1020
+ org_database = get_database(db_path=db_path_obj)
1021
+ embedder = CompanyEmbedder()
1022
+ importer = CompaniesHouseOfficersImporter()
1023
+
1024
+ # Import records in batches
1025
+ records = []
1026
+ count = 0
1027
+ skipped_existing = 0
1028
+
1029
+ def progress_callback(file_idx: int, line_num: int, total: int) -> None:
1030
+ if verbose:
1031
+ click.echo(f" File {file_idx}: line {line_num}, {total} records", err=True)
1032
+
1033
+ for record in importer.import_from_zip(
1034
+ file_path,
1035
+ limit=limit,
1036
+ resume=resume,
1037
+ current_only=not include_resigned,
1038
+ progress_callback=progress_callback,
1039
+ ):
1040
+ # Skip existing records if flag is set
1041
+ if skip_existing:
1042
+ existing = database.get_by_source_id(record.source, record.source_id)
1043
+ if existing is not None:
1044
+ skipped_existing += 1
1045
+ continue
1046
+
1047
+ # Look up org ID by company number if available
1048
+ company_number = record.record.get("company_number", "")
1049
+ if company_number:
1050
+ org_id = org_database.get_id_by_source_id("companies_house", company_number)
1051
+ if org_id is not None:
1052
+ record.known_for_org_id = org_id
1053
+
1054
+ records.append(record)
1055
+
1056
+ if len(records) >= batch_size:
1057
+ embedding_texts = [r.get_embedding_text() for r in records]
1058
+ embeddings = embedder.embed_batch(embedding_texts)
1059
+ database.insert_batch(records, embeddings)
1060
+ count += len(records)
1061
+ click.echo(f"Imported {count} records...", err=True)
1062
+ records = []
1063
+
1064
+ # Final batch
1065
+ if records:
1066
+ embedding_texts = [r.get_embedding_text() for r in records]
1067
+ embeddings = embedder.embed_batch(embedding_texts)
1068
+ database.insert_batch(records, embeddings)
1069
+ count += len(records)
1070
+
1071
+ if skip_existing and skipped_existing > 0:
1072
+ click.echo(f"\nImported {count} CH officers (skipped {skipped_existing} existing).", err=True)
1073
+ else:
1074
+ click.echo(f"\nImported {count} CH officers successfully.", err=True)
1075
+
1076
+ org_database.close()
1077
+ database.close()
1078
+
1079
+
871
1080
  @db_cmd.command("import-wikidata")
872
1081
  @click.option("--db", "db_path", type=click.Path(), help="Database path")
873
1082
  @click.option("--limit", type=int, help="Limit number of records")
@@ -947,23 +1156,32 @@ def db_import_wikidata(db_path: Optional[str], limit: Optional[int], batch_size:
947
1156
  "academic", "scientist", "journalist", "entrepreneur", "activist"
948
1157
  ]), default="executive", help="Person type to import")
949
1158
  @click.option("--all", "import_all", is_flag=True, help="Run all person type queries sequentially")
1159
+ @click.option("--enrich", is_flag=True, help="Query individual people to get role/org data (slower, resumable)")
1160
+ @click.option("--enrich-only", is_flag=True, help="Only enrich existing people (skip bulk import)")
1161
+ @click.option("--enrich-dates", is_flag=True, help="Query individual people to get start/end dates (slower)")
1162
+ @click.option("--skip-existing", is_flag=True, help="Skip records that already exist (default: update them)")
950
1163
  @click.option("-v", "--verbose", is_flag=True, help="Verbose output")
951
- def db_import_people(db_path: Optional[str], limit: Optional[int], batch_size: int, query_type: str, import_all: bool, verbose: bool):
1164
+ def db_import_people(db_path: Optional[str], limit: Optional[int], batch_size: int, query_type: str, import_all: bool, enrich: bool, enrich_only: bool, enrich_dates: bool, skip_existing: bool, verbose: bool):
952
1165
  """
953
1166
  Import notable people data from Wikidata via SPARQL.
954
1167
 
1168
+ Uses a two-phase approach for reliability:
1169
+ 1. Bulk import: Fast fetch of QID, name, country (no timeouts)
1170
+ 2. Enrich (optional): Per-person queries for role/org/dates
1171
+
955
1172
  Imports people with English Wikipedia articles (ensures notability).
956
- Includes executives, politicians, athletes, artists, academics, and more.
957
1173
 
958
1174
  \b
959
1175
  Examples:
960
1176
  corp-extractor db import-people --type executive --limit 5000
961
1177
  corp-extractor db import-people --all --limit 10000
1178
+ corp-extractor db import-people --type executive --enrich
1179
+ corp-extractor db import-people --enrich-only --limit 100
962
1180
  corp-extractor db import-people --type politician -v
963
1181
  """
964
1182
  _configure_logging(verbose)
965
1183
 
966
- from .database.store import get_person_database, DEFAULT_DB_PATH
1184
+ from .database.store import get_person_database, get_database, DEFAULT_DB_PATH
967
1185
  from .database.embeddings import CompanyEmbedder
968
1186
  from .database.importers.wikidata_people import WikidataPeopleImporter
969
1187
 
@@ -977,35 +1195,558 @@ def db_import_people(db_path: Optional[str], limit: Optional[int], batch_size: i
977
1195
 
978
1196
  # Initialize components
979
1197
  database = get_person_database(db_path=db_path_obj)
1198
+ org_database = get_database(db_path=db_path_obj)
980
1199
  embedder = CompanyEmbedder()
981
1200
  importer = WikidataPeopleImporter(batch_size=batch_size)
982
1201
 
983
- # Batch processing
984
- records = []
985
1202
  count = 0
986
1203
 
987
- for record in importer.import_from_sparql(limit=limit, query_type=query_type, import_all=import_all):
988
- records.append(record)
1204
+ # Phase 1: Bulk import (fast, minimal data) - skip if --enrich-only
1205
+ if not enrich_only:
1206
+ records = []
1207
+ skipped_existing = 0
989
1208
 
990
- if len(records) >= batch_size:
991
- # Generate embeddings using the combined name|role|org format
1209
+ click.echo("Phase 1: Bulk import (QID, name, country)...", err=True)
1210
+
1211
+ for record in importer.import_from_sparql(limit=limit, query_type=query_type, import_all=import_all):
1212
+ # Skip existing records if flag is set
1213
+ if skip_existing:
1214
+ existing = database.get_by_source_id(record.source, record.source_id)
1215
+ if existing is not None:
1216
+ skipped_existing += 1
1217
+ continue
1218
+
1219
+ records.append(record)
1220
+
1221
+ if len(records) >= batch_size:
1222
+ # Generate embeddings (just name for now, will re-embed after enrichment)
1223
+ embedding_texts = [r.get_embedding_text() for r in records]
1224
+ embeddings = embedder.embed_batch(embedding_texts)
1225
+ database.insert_batch(records, embeddings)
1226
+ count += len(records)
1227
+
1228
+ click.echo(f" Imported {count} people...", err=True)
1229
+ records = []
1230
+
1231
+ # Final batch
1232
+ if records:
992
1233
  embedding_texts = [r.get_embedding_text() for r in records]
993
1234
  embeddings = embedder.embed_batch(embedding_texts)
994
1235
  database.insert_batch(records, embeddings)
995
1236
  count += len(records)
996
- click.echo(f" Imported {count} people...", err=True)
997
- records = []
998
1237
 
999
- # Final batch
1000
- if records:
1001
- embedding_texts = [r.get_embedding_text() for r in records]
1002
- embeddings = embedder.embed_batch(embedding_texts)
1003
- database.insert_batch(records, embeddings)
1004
- count += len(records)
1238
+ if skip_existing and skipped_existing > 0:
1239
+ click.echo(f"\nPhase 1 complete: {count} people imported (skipped {skipped_existing} existing).", err=True)
1240
+ else:
1241
+ click.echo(f"\nPhase 1 complete: {count} people imported.", err=True)
1242
+ else:
1243
+ click.echo("Skipping Phase 1 (bulk import) - using existing database records.", err=True)
1244
+ # Enable enrich if enrich_only is set
1245
+ enrich = True
1246
+
1247
+ # Phase 2: Enrich with role/org/dates (optional, slower but resumable)
1248
+ if enrich:
1249
+ click.echo("\nPhase 2: Enriching with role/org/dates (parallel queries)...", err=True)
1250
+ # Get all people without role/org
1251
+ people_to_enrich = []
1252
+ enriched_count = 0
1253
+ for record in database.iter_records():
1254
+ if not record.known_for_role and not record.known_for_org:
1255
+ people_to_enrich.append(record)
1256
+ enriched_count += 1
1257
+ # Apply limit if --enrich-only
1258
+ if enrich_only and limit and enriched_count >= limit:
1259
+ break
1260
+
1261
+ if people_to_enrich:
1262
+ click.echo(f"Found {len(people_to_enrich)} people to enrich...", err=True)
1263
+ importer.enrich_people_role_org_batch(people_to_enrich, delay_seconds=0.1, max_workers=5)
1264
+
1265
+ # Persist the enriched data and re-generate embeddings
1266
+ updated = 0
1267
+ org_count = 0
1268
+ date_count = 0
1269
+ for person in people_to_enrich:
1270
+ if person.known_for_role or person.known_for_org:
1271
+ # Look up org ID if we have org_qid
1272
+ org_qid = person.record.get("org_qid", "")
1273
+ if org_qid:
1274
+ org_id = org_database.get_id_by_source_id("wikipedia", org_qid)
1275
+ if org_id is not None:
1276
+ person.known_for_org_id = org_id
1277
+
1278
+ # Update the record with new role/org/dates and re-embed
1279
+ new_embedding_text = person.get_embedding_text()
1280
+ new_embedding = embedder.embed(new_embedding_text)
1281
+ if database.update_role_org(
1282
+ person.source, person.source_id,
1283
+ person.known_for_role, person.known_for_org,
1284
+ person.known_for_org_id, new_embedding,
1285
+ person.from_date, person.to_date,
1286
+ ):
1287
+ updated += 1
1288
+ if person.known_for_org:
1289
+ org_count += 1
1290
+ if person.from_date or person.to_date:
1291
+ date_count += 1
1292
+ if verbose:
1293
+ date_str = ""
1294
+ if person.from_date or person.to_date:
1295
+ date_str = f" ({person.from_date or '?'} - {person.to_date or '?'})"
1296
+ click.echo(f" {person.name}: {person.known_for_role} at {person.known_for_org}{date_str}", err=True)
1297
+
1298
+ click.echo(f"Updated {updated} people ({org_count} with orgs, {date_count} with dates).", err=True)
1299
+
1300
+ # Phase 3: Enrich with dates (optional, even slower)
1301
+ if enrich_dates:
1302
+ click.echo("\nPhase 3: Enriching with dates...", err=True)
1303
+ # Get all people without dates but with role (dates are associated with positions)
1304
+ people_to_enrich = []
1305
+ for record in database.iter_records():
1306
+ if not record.from_date and not record.to_date and record.known_for_role:
1307
+ people_to_enrich.append(record)
1308
+
1309
+ if people_to_enrich:
1310
+ click.echo(f"Found {len(people_to_enrich)} people to enrich with dates...", err=True)
1311
+ enriched = importer.enrich_people_batch(people_to_enrich, delay_seconds=0.3)
1312
+
1313
+ # Persist the enriched dates
1314
+ updated = 0
1315
+ for person in people_to_enrich:
1316
+ if person.from_date or person.to_date:
1317
+ if database.update_dates(person.source, person.source_id, person.from_date, person.to_date):
1318
+ updated += 1
1319
+ if verbose:
1320
+ click.echo(f" {person.name}: {person.from_date or '?'} - {person.to_date or '?'}", err=True)
1321
+
1322
+ click.echo(f"Updated {updated} people with dates.", err=True)
1323
+
1324
+ org_database.close()
1325
+ database.close()
1326
+
1327
+
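The comments in the import-people command refer to a combined "name|role|org" embedding text that is regenerated once enrichment fills in role and organization. A rough sketch of what such a composition might look like; the real PersonRecord.get_embedding_text() may format it differently:

def person_embedding_text(name: str, role: str = "", org: str = "") -> str:
    # Join only the parts that are known, in name|role|org order.
    parts = [name]
    if role:
        parts.append(role)
    if org:
        parts.append(org)
    return "|".join(parts)

print(person_embedding_text("Tim Cook", "chief executive officer", "Apple Inc"))
# -> "Tim Cook|chief executive officer|Apple Inc"
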
1328
+ @db_cmd.command("import-wikidata-dump")
1329
+ @click.option("--dump", "dump_path", type=click.Path(exists=True), help="Path to Wikidata JSON dump file (.bz2 or .gz)")
1330
+ @click.option("--download", is_flag=True, help="Download latest dump first (~100GB)")
1331
+ @click.option("--force", is_flag=True, help="Force re-download even if cached")
1332
+ @click.option("--no-aria2", is_flag=True, help="Don't use aria2c even if available (slower)")
1333
+ @click.option("--db", "db_path", type=click.Path(), help="Database path")
1334
+ @click.option("--people/--no-people", default=True, help="Import people (default: yes)")
1335
+ @click.option("--orgs/--no-orgs", default=True, help="Import organizations (default: yes)")
1336
+ @click.option("--require-enwiki", is_flag=True, help="Only import orgs with English Wikipedia articles")
1337
+ @click.option("--resume", is_flag=True, help="Resume from last position in dump file (tracks entity index)")
1338
+ @click.option("--skip-updates", is_flag=True, help="Skip Q codes already in database (no updates)")
1339
+ @click.option("--limit", type=int, help="Max records per type (people and/or orgs)")
1340
+ @click.option("--batch-size", type=int, default=10000, help="Batch size for commits (default: 10000)")
1341
+ @click.option("-v", "--verbose", is_flag=True, help="Verbose output")
1342
+ def db_import_wikidata_dump(
1343
+ dump_path: Optional[str],
1344
+ download: bool,
1345
+ force: bool,
1346
+ no_aria2: bool,
1347
+ db_path: Optional[str],
1348
+ people: bool,
1349
+ orgs: bool,
1350
+ require_enwiki: bool,
1351
+ resume: bool,
1352
+ skip_updates: bool,
1353
+ limit: Optional[int],
1354
+ batch_size: int,
1355
+ verbose: bool,
1356
+ ):
1357
+ """
1358
+ Import people and organizations from Wikidata JSON dump.
1359
+
1360
+ This uses the full Wikidata JSON dump (~100GB compressed) to import
1361
+ all humans and organizations with English Wikipedia articles. This
1362
+ avoids SPARQL query timeouts that occur with large result sets.
1363
+
1364
+ The dump is streamed line-by-line to minimize memory usage.
1365
+
1366
+ \b
1367
+ Features:
1368
+ - No timeouts (processes locally)
1369
+ - Complete coverage (all notable people/orgs)
1370
+ - Resumable with --resume (tracks position in dump file)
1371
+ - Skip existing with --skip-updates (loads existing Q codes)
1372
+ - People like Andy Burnham are captured via occupation (P106)
1373
+
1374
+ \b
1375
+ Resume options:
1376
+ - --resume: Resume from where the dump processing left off (tracks entity index).
1377
+ Progress is saved after each batch. Use this if import was interrupted.
1378
+ - --skip-updates: Skip Q codes already in database (no updates to existing records).
1379
+ Use this to add new records without re-processing existing ones.
1380
+
1381
+ \b
1382
+ Examples:
1383
+ corp-extractor db import-wikidata-dump --dump /path/to/dump.json.bz2 --limit 10000
1384
+ corp-extractor db import-wikidata-dump --download --people --no-orgs --limit 50000
1385
+ corp-extractor db import-wikidata-dump --dump dump.json.bz2 --orgs --no-people
1386
+ corp-extractor db import-wikidata-dump --dump dump.json.bz2 --resume # Resume interrupted import
1387
+ corp-extractor db import-wikidata-dump --dump dump.json.bz2 --skip-updates # Skip existing Q codes
1388
+ """
1389
+ _configure_logging(verbose)
1390
+
1391
+ from .database.store import get_person_database, get_database, DEFAULT_DB_PATH
1392
+ from .database.embeddings import CompanyEmbedder
1393
+ from .database.importers.wikidata_dump import WikidataDumpImporter, DumpProgress
1394
+
1395
+ if not dump_path and not download:
1396
+ raise click.UsageError("Either --dump path or --download is required")
1397
+
1398
+ if not people and not orgs:
1399
+ raise click.UsageError("Must import at least one of --people or --orgs")
1400
+
1401
+ # Default database path
1402
+ if db_path is None:
1403
+ db_path_obj = DEFAULT_DB_PATH
1404
+ else:
1405
+ db_path_obj = Path(db_path)
1406
+
1407
+ click.echo(f"Importing Wikidata dump to {db_path_obj}...", err=True)
1408
+
1409
+ # Initialize importer
1410
+ importer = WikidataDumpImporter(dump_path=dump_path)
1411
+
1412
+ # Download if requested
1413
+ if download:
1414
+ import shutil
1415
+ dump_target = importer.get_dump_path()
1416
+ click.echo(f"Downloading Wikidata dump (~100GB) to:", err=True)
1417
+ click.echo(f" {dump_target}", err=True)
1418
+
1419
+ # Check for aria2c
1420
+ has_aria2 = shutil.which("aria2c") is not None
1421
+ use_aria2 = has_aria2 and not no_aria2
1422
+
1423
+ if use_aria2:
1424
+ click.echo(" Using aria2c for fast parallel download (16 connections)", err=True)
1425
+ dump_file = importer.download_dump(force=force, use_aria2=True)
1426
+ click.echo(f"\nUsing dump: {dump_file}", err=True)
1427
+ else:
1428
+ if not has_aria2:
1429
+ click.echo("", err=True)
1430
+ click.echo(" TIP: Install aria2c for 10-20x faster downloads:", err=True)
1431
+ click.echo(" brew install aria2 (macOS)", err=True)
1432
+ click.echo(" apt install aria2 (Ubuntu/Debian)", err=True)
1433
+ click.echo("", err=True)
1434
+
1435
+ # Use urllib to get content length first
1436
+ import urllib.request
1437
+ req = urllib.request.Request(
1438
+ "https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2",
1439
+ headers={"User-Agent": "corp-extractor/1.0"},
1440
+ method="HEAD"
1441
+ )
1442
+ with urllib.request.urlopen(req) as response:
1443
+ total_size = int(response.headers.get("content-length", 0))
1444
+
1445
+ if total_size:
1446
+ total_gb = total_size / (1024 ** 3)
1447
+ click.echo(f" Size: {total_gb:.1f} GB", err=True)
1448
+
1449
+ # Download with progress bar
1450
+ progress_bar = None
1451
+
1452
+ def update_progress(downloaded: int, total: int) -> None:
1453
+ nonlocal progress_bar
1454
+ if progress_bar is None and total > 0:
1455
+ progress_bar = click.progressbar(
1456
+ length=total,
1457
+ label="Downloading",
1458
+ show_percent=True,
1459
+ show_pos=True,
1460
+ item_show_func=lambda x: f"{(x or 0) / (1024**3):.1f} GB" if x else "",
1461
+ )
1462
+ progress_bar.__enter__()
1463
+ if progress_bar:
1464
+ # Update to absolute position
1465
+ progress_bar.update(downloaded - progress_bar.pos)
1466
+
1467
+ try:
1468
+ dump_file = importer.download_dump(force=force, use_aria2=False, progress_callback=update_progress)
1469
+ finally:
1470
+ if progress_bar:
1471
+ progress_bar.__exit__(None, None, None)
1472
+
1473
+ click.echo(f"\nUsing dump: {dump_file}", err=True)
1474
+ elif dump_path:
1475
+ click.echo(f"Using dump: {dump_path}", err=True)
1476
+
1477
+ # Initialize embedder (loads model, may take time on first run)
1478
+ click.echo("Loading embedding model...", err=True)
1479
+ sys.stderr.flush()
1480
+ embedder = CompanyEmbedder()
1481
+ click.echo("Embedding model loaded.", err=True)
1482
+ sys.stderr.flush()
1483
+
1484
+ # Load existing QID labels from database and seed the importer's cache
1485
+ database = get_person_database(db_path=db_path_obj)
1486
+ existing_labels = database.get_all_qid_labels()
1487
+ if existing_labels:
1488
+ click.echo(f"Loaded {len(existing_labels):,} existing QID labels from DB", err=True)
1489
+ importer.set_label_cache(existing_labels)
1490
+ known_qids_at_start = set(existing_labels.keys())
1491
+
1492
+ # Load existing source_ids for skip_updates mode
1493
+ existing_people_ids: set[str] = set()
1494
+ existing_org_ids: set[str] = set()
1495
+ if skip_updates:
1496
+ click.echo("Loading existing records for --skip-updates...", err=True)
1497
+ if people:
1498
+ existing_people_ids = database.get_all_source_ids(source="wikidata")
1499
+ click.echo(f" Found {len(existing_people_ids):,} existing people Q codes", err=True)
1500
+ if orgs:
1501
+ org_database = get_database(db_path=db_path_obj)
1502
+ existing_org_ids = org_database.get_all_source_ids(source="wikipedia")
1503
+ click.echo(f" Found {len(existing_org_ids):,} existing org Q codes", err=True)
1504
+
1505
+ # Load progress for resume mode (position-based resume)
1506
+ progress: Optional[DumpProgress] = None
1507
+ start_index = 0
1508
+ if resume:
1509
+ progress = DumpProgress.load()
1510
+ if progress:
1511
+ # Verify the progress is for the same dump file
1512
+ actual_dump_path = importer._dump_path or Path(dump_path) if dump_path else importer.get_dump_path()
1513
+ if progress.matches_dump(actual_dump_path):
1514
+ start_index = progress.entity_index
1515
+ click.echo(f"Resuming from entity index {start_index:,}", err=True)
1516
+ click.echo(f" Last entity: {progress.last_entity_id}", err=True)
1517
+ click.echo(f" Last updated: {progress.last_updated}", err=True)
1518
+ else:
1519
+ click.echo("Warning: Progress file is for a different dump, starting from beginning", err=True)
1520
+ progress = None
1521
+ else:
1522
+ click.echo("No progress file found, starting from beginning", err=True)
1523
+
1524
+ # Initialize progress tracking
1525
+ if progress is None:
1526
+ actual_dump_path = importer._dump_path or Path(dump_path) if dump_path else importer.get_dump_path()
1527
+ progress = DumpProgress(
1528
+ dump_path=str(actual_dump_path),
1529
+ dump_size=actual_dump_path.stat().st_size if actual_dump_path.exists() else 0,
1530
+ )
1005
1531
 
1006
- click.echo(f"\nImported {count} people successfully.", err=True)
1532
+ # Helper to persist new labels after each batch
1533
+ def persist_new_labels() -> int:
1534
+ new_labels = importer.get_new_labels_since(known_qids_at_start)
1535
+ if new_labels:
1536
+ database.insert_qid_labels(new_labels)
1537
+ known_qids_at_start.update(new_labels.keys())
1538
+ return len(new_labels)
1539
+ return 0
1540
+
1541
+ # Combined import - single pass through the dump for both people and orgs
1542
+ click.echo("\n=== Combined Import (single dump pass) ===", err=True)
1543
+ sys.stderr.flush() # Ensure output is visible immediately
1544
+ if people:
1545
+ click.echo(f" People: {'up to ' + str(limit) + ' records' if limit else 'unlimited'}", err=True)
1546
+ if skip_updates and existing_people_ids:
1547
+ click.echo(f" Skip updates: {len(existing_people_ids):,} existing Q codes", err=True)
1548
+ if orgs:
1549
+ click.echo(f" Orgs: {'up to ' + str(limit) + ' records' if limit else 'unlimited'}", err=True)
1550
+ if require_enwiki:
1551
+ click.echo(" Filter: only orgs with English Wikipedia articles", err=True)
1552
+ if skip_updates and existing_org_ids:
1553
+ click.echo(f" Skip updates: {len(existing_org_ids):,} existing Q codes", err=True)
1554
+ if start_index > 0:
1555
+ click.echo(f" Resuming from entity index {start_index:,}", err=True)
1556
+
1557
+ # Initialize databases
1558
+ person_database = get_person_database(db_path=db_path_obj)
1559
+ org_database = get_database(db_path=db_path_obj) if orgs else None
1560
+
1561
+ # Batches for each type
1562
+ people_records: list = []
1563
+ org_records: list = []
1564
+ people_count = 0
1565
+ orgs_count = 0
1566
+ last_entity_index = start_index
1567
+ last_entity_id = ""
1568
+
1569
+ def combined_progress_callback(entity_index: int, entity_id: str, ppl_count: int, org_count: int) -> None:
1570
+ nonlocal last_entity_index, last_entity_id
1571
+ last_entity_index = entity_index
1572
+ last_entity_id = entity_id
1573
+
1574
+ def save_progress() -> None:
1575
+ if progress:
1576
+ progress.entity_index = last_entity_index
1577
+ progress.last_entity_id = last_entity_id
1578
+ progress.people_yielded = people_count
1579
+ progress.orgs_yielded = orgs_count
1580
+ progress.save()
1581
+
1582
+ def flush_people_batch() -> None:
1583
+ nonlocal people_records, people_count
1584
+ if people_records:
1585
+ embedding_texts = [r.get_embedding_text() for r in people_records]
1586
+ embeddings = embedder.embed_batch(embedding_texts)
1587
+ person_database.insert_batch(people_records, embeddings)
1588
+ people_count += len(people_records)
1589
+ people_records = []
1590
+
1591
+ def flush_org_batch() -> None:
1592
+ nonlocal org_records, orgs_count
1593
+ if org_records and org_database:
1594
+ names = [r.name for r in org_records]
1595
+ embeddings = embedder.embed_batch(names)
1596
+ org_database.insert_batch(org_records, embeddings)
1597
+ orgs_count += len(org_records)
1598
+ org_records = []
1599
+
1600
+ # Calculate total for progress bar (if limits set for both)
1601
+ total_limit = None
1602
+ if limit and people and orgs:
1603
+ total_limit = limit * 2 # Rough estimate
1604
+ elif limit:
1605
+ total_limit = limit
1606
+
1607
+ click.echo("Starting dump iteration...", err=True)
1608
+ sys.stderr.flush()
1609
+
1610
+ records_seen = 0
1611
+ try:
1612
+ if total_limit:
1613
+ # Use progress bar when we have limits
1614
+ with click.progressbar(
1615
+ length=total_limit,
1616
+ label="Processing dump",
1617
+ show_percent=True,
1618
+ show_pos=True,
1619
+ ) as pbar:
1620
+ for record_type, record in importer.import_all(
1621
+ people_limit=limit if people else 0,
1622
+ orgs_limit=limit if orgs else 0,
1623
+ import_people=people,
1624
+ import_orgs=orgs,
1625
+ require_enwiki=require_enwiki,
1626
+ skip_people_ids=existing_people_ids if skip_updates else None,
1627
+ skip_org_ids=existing_org_ids if skip_updates else None,
1628
+ start_index=start_index,
1629
+ progress_callback=combined_progress_callback,
1630
+ ):
1631
+ records_seen += 1
1632
+ pbar.update(1)
1633
+
1634
+ if record_type == "person":
1635
+ people_records.append(record)
1636
+ if len(people_records) >= batch_size:
1637
+ flush_people_batch()
1638
+ persist_new_labels()
1639
+ save_progress()
1640
+ else: # org
1641
+ org_records.append(record)
1642
+ if len(org_records) >= batch_size:
1643
+ flush_org_batch()
1644
+ persist_new_labels()
1645
+ save_progress()
1646
+ else:
1647
+ # No limit - show counter updates
1648
+ for record_type, record in importer.import_all(
1649
+ people_limit=None,
1650
+ orgs_limit=None,
1651
+ import_people=people,
1652
+ import_orgs=orgs,
1653
+ require_enwiki=require_enwiki,
1654
+ skip_people_ids=existing_people_ids if skip_updates else None,
1655
+ skip_org_ids=existing_org_ids if skip_updates else None,
1656
+ start_index=start_index,
1657
+ progress_callback=combined_progress_callback,
1658
+ ):
1659
+ records_seen += 1
1660
+ # Show first record immediately as proof of life
1661
+ if records_seen == 1:
1662
+ click.echo(f" First record found: {record.name}", err=True)
1663
+ sys.stderr.flush()
1664
+
1665
+ if record_type == "person":
1666
+ people_records.append(record)
1667
+ if len(people_records) >= batch_size:
1668
+ flush_people_batch()
1669
+ persist_new_labels()
1670
+ save_progress()
1671
+ click.echo(f"\r Progress: {people_count:,} people, {orgs_count:,} orgs...", nl=False, err=True)
1672
+ sys.stderr.flush()
1673
+ else: # org
1674
+ org_records.append(record)
1675
+ if len(org_records) >= batch_size:
1676
+ flush_org_batch()
1677
+ persist_new_labels()
1678
+ save_progress()
1679
+ click.echo(f"\r Progress: {people_count:,} people, {orgs_count:,} orgs...", nl=False, err=True)
1680
+ sys.stderr.flush()
1681
+
1682
+ click.echo("", err=True) # Newline after counter
1683
+
1684
+ # Final batches
1685
+ flush_people_batch()
1686
+ flush_org_batch()
1687
+ persist_new_labels()
1688
+ save_progress()
1689
+
1690
+ finally:
1691
+ # Ensure we save progress even on interrupt
1692
+ save_progress()
1693
+
1694
+ click.echo(f"Import complete: {people_count:,} people, {orgs_count:,} orgs", err=True)
1695
+
1696
+ # Keep references for final label resolution
1697
+ database = person_database
1698
+ if org_database:
1699
+ org_database.close()
1700
+
1701
+ # Final label resolution pass for any remaining unresolved QIDs
1702
+ click.echo("\n=== Final QID Label Resolution ===", err=True)
1703
+
1704
+ # Get the full label cache (includes labels from DB + new ones from import)
1705
+ all_labels = importer.get_label_cache()
1706
+ click.echo(f" Total labels in cache: {len(all_labels):,}", err=True)
1707
+
1708
+ # Check for any remaining unresolved QIDs in the database
1709
+ people_unresolved = database.get_unresolved_qids()
1710
+ click.echo(f" Unresolved QIDs in people: {len(people_unresolved):,}", err=True)
1711
+
1712
+ org_unresolved: set[str] = set()
1713
+ if orgs:
1714
+ org_database = get_database(db_path=db_path_obj)
1715
+ org_unresolved = org_database.get_unresolved_qids()
1716
+ click.echo(f" Unresolved QIDs in orgs: {len(org_unresolved):,}", err=True)
1717
+
1718
+ all_unresolved = people_unresolved | org_unresolved
1719
+ need_sparql = all_unresolved - set(all_labels.keys())
1720
+
1721
+ if need_sparql:
1722
+ click.echo(f" Resolving {len(need_sparql):,} remaining QIDs via SPARQL...", err=True)
1723
+ sparql_resolved = importer.resolve_qids_via_sparql(need_sparql)
1724
+ all_labels.update(sparql_resolved)
1725
+ # Persist newly resolved labels
1726
+ if sparql_resolved:
1727
+ database.insert_qid_labels(sparql_resolved)
1728
+ click.echo(f" SPARQL resolved and stored: {len(sparql_resolved):,}", err=True)
1729
+
1730
+ # Update records with any newly resolved labels
1731
+ if all_labels:
1732
+ updates, deletes = database.resolve_qid_labels(all_labels)
1733
+ if updates or deletes:
1734
+ click.echo(f" People: {updates:,} updated, {deletes:,} duplicates deleted", err=True)
1735
+
1736
+ if orgs:
1737
+ org_database = get_database(db_path=db_path_obj)
1738
+ org_updates = org_database.resolve_qid_labels(all_labels)
1739
+ if org_updates:
1740
+ click.echo(f" Updated orgs: {org_updates:,} regions", err=True)
1741
+ org_database.close()
1742
+
1743
+ # Final stats
1744
+ final_label_count = database.get_qid_labels_count()
1745
+ click.echo(f" Total labels in DB: {final_label_count:,}", err=True)
1007
1746
  database.close()
1008
1747
 
1748
+ click.echo("\nWikidata dump import complete!", err=True)
1749
+
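The import-wikidata-dump command above streams the ~100 GB .json.bz2 dump line by line and keeps people and organizations with English Wikipedia articles. A self-contained sketch of that streaming/filtering approach, shown only to illustrate the dump format; the entity filter here (P31 = Q5 plus an enwiki sitelink) is an assumption about the importer's criteria, not its actual code:

import bz2
import json

def iter_notable_humans(dump_path: str):
    with bz2.open(dump_path, "rt", encoding="utf-8") as fh:
        for line in fh:
            line = line.strip().rstrip(",")
            if not line or line in ("[", "]"):
                continue  # skip the array brackets wrapping the dump
            entity = json.loads(line)
            claims = entity.get("claims", {})
            is_human = any(
                c.get("mainsnak", {}).get("datavalue", {}).get("value", {}).get("id") == "Q5"
                for c in claims.get("P31", [])
            )
            if is_human and "enwiki" in entity.get("sitelinks", {}):
                yield entity["id"], entity.get("labels", {}).get("en", {}).get("value", "")

# for qid, name in iter_notable_humans("latest-all.json.bz2"):
#     ...
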
1009
1750
 
1010
1751
  @db_cmd.command("search-people")
1011
1752
  @click.argument("query")
@@ -1185,12 +1926,93 @@ def db_status(db_path: Optional[str]):
1185
1926
  for source, count in stats.by_source.items():
1186
1927
  click.echo(f" {source}: {count:,}")
1187
1928
 
1929
+ # Show canonicalization stats
1930
+ canon_stats = database.get_canon_stats()
1931
+ if canon_stats["canonicalized_records"] > 0:
1932
+ click.echo("\nCanonicalization:")
1933
+ click.echo(f" Canonicalized: {canon_stats['canonicalized_records']:,} / {canon_stats['total_records']:,}")
1934
+ click.echo(f" Canonical groups: {canon_stats['canonical_groups']:,}")
1935
+ click.echo(f" Multi-record groups: {canon_stats['multi_record_groups']:,}")
1936
+ click.echo(f" Records in multi-groups: {canon_stats['records_in_multi_groups']:,}")
1937
+ else:
1938
+ click.echo("\nCanonicalization: Not run yet")
1939
+ click.echo(" Run 'corp-extractor db canonicalize' to link equivalent records")
1940
+
1188
1941
  database.close()
1189
1942
 
1190
1943
  except Exception as e:
1191
1944
  raise click.ClickException(f"Failed to read database: {e}")
1192
1945
 
1193
1946
 
1947
+ @db_cmd.command("canonicalize")
1948
+ @click.option("--db", "db_path", type=click.Path(), help="Database path")
1949
+ @click.option("--batch-size", type=int, default=10000, help="Batch size for updates (default: 10000)")
1950
+ @click.option("-v", "--verbose", is_flag=True, help="Verbose output")
1951
+ def db_canonicalize(db_path: Optional[str], batch_size: int, verbose: bool):
1952
+ """
1953
+ Canonicalize organizations by linking equivalent records across sources.
1954
+
1955
+ Records are considered equivalent if they share:
1956
+ - Same LEI (globally unique legal entity identifier)
1957
+ - Same ticker symbol
1958
+ - Same CIK (SEC identifier)
1959
+ - Same normalized name (after lowercasing, removing dots)
1960
+ - Same name with suffix expansion (Ltd -> Limited, etc.)
1961
+
1962
+ For each group, the highest-priority source becomes canonical:
1963
+ gleif > sec_edgar > companies_house > wikipedia
1964
+
1965
+ Canonicalization enables better search re-ranking by boosting results
1966
+ that have records from multiple authoritative sources.
1967
+
1968
+ \b
1969
+ Examples:
1970
+ corp-extractor db canonicalize
1971
+ corp-extractor db canonicalize -v
1972
+ corp-extractor db canonicalize --db /path/to/entities.db
1973
+ """
1974
+ _configure_logging(verbose)
1975
+
1976
+ from .database import OrganizationDatabase
1977
+ from .database.store import get_person_database
1978
+
1979
+ try:
1980
+ # Canonicalize organizations
1981
+ database = OrganizationDatabase(db_path=db_path)
1982
+ click.echo("Running organization canonicalization...", err=True)
1983
+
1984
+ result = database.canonicalize(batch_size=batch_size)
1985
+
1986
+ click.echo("\nOrganization Canonicalization Results")
1987
+ click.echo("=" * 40)
1988
+ click.echo(f"Total records processed: {result['total_records']:,}")
1989
+ click.echo(f"Equivalence groups found: {result['groups_found']:,}")
1990
+ click.echo(f"Multi-record groups: {result['multi_record_groups']:,}")
1991
+ click.echo(f"Records updated: {result['records_updated']:,}")
1992
+
1993
+ database.close()
1994
+
1995
+ # Canonicalize people
1996
+ db_path_obj = Path(db_path) if db_path else None
1997
+ person_db = get_person_database(db_path=db_path_obj)
1998
+ click.echo("\nRunning people canonicalization...", err=True)
1999
+
2000
+ people_result = person_db.canonicalize(batch_size=batch_size)
2001
+
2002
+ click.echo("\nPeople Canonicalization Results")
2003
+ click.echo("=" * 40)
2004
+ click.echo(f"Total records processed: {people_result['total_records']:,}")
2005
+ click.echo(f"Matched by organization: {people_result['matched_by_org']:,}")
2006
+ click.echo(f"Matched by date overlap: {people_result['matched_by_date']:,}")
2007
+ click.echo(f"Canonical groups: {people_result['canonical_groups']:,}")
2008
+ click.echo(f"Records in multi-record groups: {people_result['records_in_groups']:,}")
2009
+
2010
+ person_db.close()
2011
+
2012
+ except Exception as e:
2013
+ raise click.ClickException(f"Canonicalization failed: {e}")
2014
+
2015
+
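The canonicalize docstring lists "same normalized name" and "same name with suffix expansion (Ltd -> Limited)" among the equivalence rules. A hypothetical illustration of such a normalization; the package's actual rules may cover more suffixes and punctuation:

SUFFIXES = {"ltd": "limited", "inc": "incorporated", "corp": "corporation", "co": "company"}

def normalize_org_name(name: str) -> str:
    # Lowercase, drop dots, then expand common legal-form abbreviations.
    tokens = name.lower().replace(".", "").split()
    tokens = [SUFFIXES.get(t, t) for t in tokens]
    return " ".join(tokens)

assert normalize_org_name("Acme Ltd.") == normalize_org_name("ACME Limited")
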
1194
2016
  @db_cmd.command("search")
1195
2017
  @click.argument("query")
1196
2018
  @click.option("--db", "db_path", type=click.Path(), help="Database path")
@@ -1247,10 +2069,9 @@ def db_search(query: str, db_path: Optional[str], top_k: int, source: Optional[s
1247
2069
  @click.option("--repo", type=str, default="Corp-o-Rate-Community/entity-references", help="HuggingFace repo ID")
1248
2070
  @click.option("--db", "db_path", type=click.Path(), help="Output path for database")
1249
2071
  @click.option("--full", is_flag=True, help="Download full version (larger, includes record metadata)")
1250
- @click.option("--no-compress", is_flag=True, help="Download uncompressed version (slower)")
1251
2072
  @click.option("--force", is_flag=True, help="Force re-download")
1252
2073
  @click.option("-v", "--verbose", is_flag=True, help="Verbose output")
1253
- def db_download(repo: str, db_path: Optional[str], full: bool, no_compress: bool, force: bool, verbose: bool):
2074
+ def db_download(repo: str, db_path: Optional[str], full: bool, force: bool, verbose: bool):
1254
2075
  """
1255
2076
  Download entity database from HuggingFace Hub.
1256
2077
 
@@ -1274,7 +2095,6 @@ def db_download(repo: str, db_path: Optional[str], full: bool, no_compress: bool
1274
2095
  repo_id=repo,
1275
2096
  filename=filename,
1276
2097
  force_download=force,
1277
- prefer_compressed=not no_compress,
1278
2098
  )
1279
2099
  click.echo(f"Database downloaded to: {path}")
1280
2100
  except Exception as e:
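The download command delegates to a hub helper whose body is not shown in this diff. For orientation, fetching the same artifacts with huggingface_hub directly might look like the following (an assumption about the underlying mechanism, not the package's code); the lite/full filenames match those named in the upload docstring below:

from huggingface_hub import hf_hub_download

path = hf_hub_download(
    repo_id="Corp-o-Rate-Community/entity-references",
    filename="entities-lite.db",  # "entities.db" when --full is requested
    force_download=False,
)
print(path)
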
@@ -1286,27 +2106,23 @@ def db_download(repo: str, db_path: Optional[str], full: bool, no_compress: bool
1286
2106
  @click.option("--repo", type=str, default="Corp-o-Rate-Community/entity-references", help="HuggingFace repo ID")
1287
2107
  @click.option("--message", type=str, default="Update entity database", help="Commit message")
1288
2108
  @click.option("--no-lite", is_flag=True, help="Skip creating lite version (without record data)")
1289
- @click.option("--no-compress", is_flag=True, help="Skip creating compressed versions")
1290
2109
  @click.option("-v", "--verbose", is_flag=True, help="Verbose output")
1291
- def db_upload(db_path: Optional[str], repo: str, message: str, no_lite: bool, no_compress: bool, verbose: bool):
2110
+ def db_upload(db_path: Optional[str], repo: str, message: str, no_lite: bool, verbose: bool):
1292
2111
  """
1293
- Upload entity database to HuggingFace Hub with variants.
2112
+ Upload entity database to HuggingFace Hub.
1294
2113
 
1295
- If no path is provided, uploads from the default cache location.
1296
-
1297
- By default uploads:
2114
+ First VACUUMs the database, then creates and uploads:
1298
2115
  - entities.db (full database)
1299
2116
  - entities-lite.db (without record data, smaller)
1300
- - entities.db.gz (compressed full)
1301
- - entities-lite.db.gz (compressed lite)
1302
2117
 
2118
+ If no path is provided, uploads from the default cache location.
1303
2119
  Requires HF_TOKEN environment variable to be set.
1304
2120
 
1305
2121
  \b
1306
2122
  Examples:
1307
2123
  corp-extractor db upload
1308
2124
  corp-extractor db upload /path/to/entities.db
1309
- corp-extractor db upload --no-lite --no-compress
2125
+ corp-extractor db upload --no-lite
1310
2126
  corp-extractor db upload --repo my-org/my-entity-db
1311
2127
  """
1312
2128
  _configure_logging(verbose)
@@ -1322,10 +2138,9 @@ def db_upload(db_path: Optional[str], repo: str, message: str, no_lite: bool, no
1322
2138
  )
1323
2139
 
1324
2140
  click.echo(f"Uploading {db_path} to {repo}...", err=True)
2141
+ click.echo(" - Running VACUUM to optimize database", err=True)
1325
2142
  if not no_lite:
1326
2143
  click.echo(" - Creating lite version (without record data)", err=True)
1327
- if not no_compress:
1328
- click.echo(" - Creating compressed versions", err=True)
1329
2144
 
1330
2145
  try:
1331
2146
  results = upload_database_with_variants(
@@ -1333,7 +2148,6 @@ def db_upload(db_path: Optional[str], repo: str, message: str, no_lite: bool, no
1333
2148
  repo_id=repo,
1334
2149
  commit_message=message,
1335
2150
  include_lite=not no_lite,
1336
- include_compressed=not no_compress,
1337
2151
  )
1338
2152
  click.echo(f"\nUploaded {len(results)} file(s) successfully:")
1339
2153
  for filename, url in results.items():
@@ -1371,31 +2185,6 @@ def db_create_lite(db_path: str, output: Optional[str], verbose: bool):
1371
2185
  raise click.ClickException(f"Failed to create lite database: {e}")
1372
2186
 
1373
2187
 
1374
- @db_cmd.command("compress")
1375
- @click.argument("db_path", type=click.Path(exists=True))
1376
- @click.option("-o", "--output", type=click.Path(), help="Output path (default: adds .gz suffix)")
1377
- @click.option("-v", "--verbose", is_flag=True, help="Verbose output")
1378
- def db_compress(db_path: str, output: Optional[str], verbose: bool):
1379
- """
1380
- Compress a database file using gzip.
1381
-
1382
- \b
1383
- Examples:
1384
- corp-extractor db compress entities.db
1385
- corp-extractor db compress entities.db -o entities.db.gz
1386
- """
1387
- _configure_logging(verbose)
1388
- from .database.hub import compress_database
1389
-
1390
- click.echo(f"Compressing {db_path}...", err=True)
1391
-
1392
- try:
1393
- compressed_path = compress_database(db_path, output)
1394
- click.echo(f"Compressed database created: {compressed_path}")
1395
- except Exception as e:
1396
- raise click.ClickException(f"Compression failed: {e}")
1397
-
1398
-
1399
2188
  @db_cmd.command("repair-embeddings")
1400
2189
  @click.option("--db", "db_path", type=click.Path(), help="Database path")
1401
2190
  @click.option("--batch-size", type=int, default=1000, help="Batch size for embedding generation (default: 1000)")