corp-extractor 0.9.0-py3-none-any.whl → 0.9.4-py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Files changed (34)
  1. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/METADATA +72 -11
  2. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/RECORD +34 -27
  3. statement_extractor/cli.py +1317 -101
  4. statement_extractor/database/embeddings.py +45 -0
  5. statement_extractor/database/hub.py +86 -136
  6. statement_extractor/database/importers/__init__.py +10 -2
  7. statement_extractor/database/importers/companies_house.py +16 -2
  8. statement_extractor/database/importers/companies_house_officers.py +431 -0
  9. statement_extractor/database/importers/gleif.py +23 -0
  10. statement_extractor/database/importers/import_utils.py +264 -0
  11. statement_extractor/database/importers/sec_edgar.py +17 -0
  12. statement_extractor/database/importers/sec_form4.py +512 -0
  13. statement_extractor/database/importers/wikidata.py +151 -43
  14. statement_extractor/database/importers/wikidata_dump.py +2282 -0
  15. statement_extractor/database/importers/wikidata_people.py +867 -325
  16. statement_extractor/database/migrate_v2.py +852 -0
  17. statement_extractor/database/models.py +155 -7
  18. statement_extractor/database/schema_v2.py +409 -0
  19. statement_extractor/database/seed_data.py +359 -0
  20. statement_extractor/database/store.py +3449 -233
  21. statement_extractor/document/deduplicator.py +10 -12
  22. statement_extractor/extractor.py +1 -1
  23. statement_extractor/models/__init__.py +3 -2
  24. statement_extractor/models/statement.py +15 -17
  25. statement_extractor/models.py +1 -1
  26. statement_extractor/pipeline/context.py +5 -5
  27. statement_extractor/pipeline/orchestrator.py +12 -12
  28. statement_extractor/plugins/base.py +17 -17
  29. statement_extractor/plugins/extractors/gliner2.py +28 -28
  30. statement_extractor/plugins/qualifiers/embedding_company.py +7 -5
  31. statement_extractor/plugins/qualifiers/person.py +120 -53
  32. statement_extractor/plugins/splitters/t5_gemma.py +35 -39
  33. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/WHEEL +0 -0
  34. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/entry_points.txt +0 -0
@@ -439,7 +439,7 @@ def _print_pipeline_json(ctx):
439
439
  """Print pipeline results as JSON."""
440
440
  output = {
441
441
  "statement_count": ctx.statement_count,
442
- "raw_triples": [t.model_dump() for t in ctx.raw_triples],
442
+ "split_sentences": [s.model_dump() for s in ctx.split_sentences],
443
443
  "statements": [s.model_dump() for s in ctx.statements],
444
444
  "labeled_statements": [stmt.as_dict() for stmt in ctx.labeled_statements],
445
445
  "timings": ctx.stage_timings,
@@ -472,9 +472,10 @@ def _print_pipeline_triples(ctx):
472
472
  elif ctx.statements:
473
473
  for stmt in ctx.statements:
474
474
  click.echo(f"{stmt.subject.text}\t{stmt.predicate}\t{stmt.object.text}")
475
- elif ctx.raw_triples:
476
- for triple in ctx.raw_triples:
477
- click.echo(f"{triple.subject_text}\t{triple.predicate_text}\t{triple.object_text}")
475
+ elif ctx.split_sentences:
476
+ # Stage 1 only output - just show the split sentences (no triples yet)
477
+ for sentence in ctx.split_sentences:
478
+ click.echo(sentence.text)
478
479
 
479
480
 
480
481
  def _print_pipeline_table(ctx, verbose: bool):
@@ -528,20 +529,16 @@ def _print_pipeline_table(ctx, verbose: bool):
528
529
 
529
530
  click.echo("-" * 80)
530
531
 
531
- elif ctx.raw_triples:
532
- click.echo(f"\nExtracted {len(ctx.raw_triples)} raw triple(s):\n")
532
+ elif ctx.split_sentences:
533
+ click.echo(f"\nSplit into {len(ctx.split_sentences)} atomic sentence(s):\n")
533
534
  click.echo("-" * 80)
534
535
 
535
- for i, triple in enumerate(ctx.raw_triples, 1):
536
- click.echo(f"{i}. {triple.subject_text}")
537
- click.echo(f" --[{triple.predicate_text}]-->")
538
- click.echo(f" {triple.object_text}")
536
+ for i, sentence in enumerate(ctx.split_sentences, 1):
537
+ text_preview = sentence.text[:100] + "..." if len(sentence.text) > 100 else sentence.text
538
+ click.echo(f"{i}. {text_preview}")
539
539
 
540
540
  if verbose:
541
- click.echo(f" Confidence: {triple.confidence:.2f}")
542
- if triple.source_sentence:
543
- source = triple.source_sentence[:60] + "..." if len(triple.source_sentence) > 60 else triple.source_sentence
544
- click.echo(f" Source: \"{source}\"")
541
+ click.echo(f" Confidence: {sentence.confidence:.2f}")
545
542
 
546
543
  click.echo("-" * 80)
547
544
 
@@ -666,22 +663,27 @@ def db_cmd():
666
663
  Commands:
667
664
  import-gleif Import GLEIF LEI data (~3M records)
668
665
  import-sec Import SEC Edgar bulk data (~100K+ filers)
666
+ import-sec-officers Import SEC Form 4 officers/directors
667
+ import-ch-officers Import UK Companies House officers (Prod195)
669
668
  import-companies-house Import UK Companies House (~5M records)
670
- import-wikidata Import Wikidata organizations
671
- import-people Import Wikidata notable people
669
+ import-wikidata Import Wikidata organizations (SPARQL, may timeout)
670
+ import-people Import Wikidata notable people (SPARQL, may timeout)
671
+ import-wikidata-dump Import from Wikidata JSON dump (recommended)
672
+ canonicalize Link equivalent records across sources
672
673
  status Show database status
673
674
  search Search for an organization
674
675
  search-people Search for a person
675
676
  download Download database from HuggingFace
676
- upload Upload database with lite/compressed variants
677
+ upload Upload database with lite variant
677
678
  create-lite Create lite version (no record data)
678
- compress Compress database with gzip
679
679
 
680
680
  \b
681
681
  Examples:
682
682
  corp-extractor db import-sec --download
683
+ corp-extractor db import-sec-officers --start-year 2023 --limit 10000
683
684
  corp-extractor db import-gleif --download --limit 100000
684
- corp-extractor db import-people --all --limit 10000
685
+ corp-extractor db import-wikidata-dump --download --limit 50000
686
+ corp-extractor db canonicalize
685
687
  corp-extractor db status
686
688
  corp-extractor db search "Apple Inc"
687
689
  corp-extractor db search-people "Tim Cook"
@@ -781,10 +783,10 @@ def db_import_gleif(file_path: Optional[str], download: bool, force: bool, db_pa
781
783
  records.append(record)
782
784
 
783
785
  if len(records) >= batch_size:
784
- # Embed and insert batch
786
+ # Embed and insert batch (both float32 and int8)
785
787
  names = [r.name for r in records]
786
- embeddings = embedder.embed_batch(names)
787
- database.insert_batch(records, embeddings)
788
+ embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(names)
789
+ database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
788
790
  count += len(records)
789
791
  click.echo(f"Imported {count} records...", err=True)
790
792
  records = []
@@ -792,8 +794,8 @@ def db_import_gleif(file_path: Optional[str], download: bool, force: bool, db_pa
792
794
  # Final batch
793
795
  if records:
794
796
  names = [r.name for r in records]
795
- embeddings = embedder.embed_batch(names)
796
- database.insert_batch(records, embeddings)
797
+ embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(names)
798
+ database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
797
799
  count += len(records)
798
800
 
799
801
  click.echo(f"\nImported {count} GLEIF records successfully.", err=True)
@@ -851,8 +853,8 @@ def db_import_sec(download: bool, file_path: Optional[str], db_path: Optional[st
851
853
 
852
854
  if len(records) >= batch_size:
853
855
  names = [r.name for r in records]
854
- embeddings = embedder.embed_batch(names)
855
- database.insert_batch(records, embeddings)
856
+ embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(names)
857
+ database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
856
858
  count += len(records)
857
859
  click.echo(f"Imported {count} records...", err=True)
858
860
  records = []
@@ -860,14 +862,221 @@ def db_import_sec(download: bool, file_path: Optional[str], db_path: Optional[st
860
862
  # Final batch
861
863
  if records:
862
864
  names = [r.name for r in records]
863
- embeddings = embedder.embed_batch(names)
864
- database.insert_batch(records, embeddings)
865
+ embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(names)
866
+ database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
865
867
  count += len(records)
866
868
 
867
869
  click.echo(f"\nImported {count} SEC Edgar records successfully.", err=True)
868
870
  database.close()
869
871
 
870
872
 
873
+ @db_cmd.command("import-sec-officers")
874
+ @click.option("--db", "db_path", type=click.Path(), help="Database path")
875
+ @click.option("--start-year", type=int, default=2020, help="Start year (default: 2020)")
876
+ @click.option("--end-year", type=int, help="End year (default: current year)")
877
+ @click.option("--limit", type=int, help="Limit number of records")
878
+ @click.option("--batch-size", type=int, default=1000, help="Batch size for commits (default: 1000)")
879
+ @click.option("--resume", is_flag=True, help="Resume from saved progress")
880
+ @click.option("--skip-existing", is_flag=True, help="Skip records that already exist")
881
+ @click.option("-v", "--verbose", is_flag=True, help="Verbose output")
882
+ def db_import_sec_officers(db_path: Optional[str], start_year: int, end_year: Optional[int], limit: Optional[int], batch_size: int, resume: bool, skip_existing: bool, verbose: bool):
883
+ """
884
+ Import SEC Form 4 insider data into the people database.
885
+
886
+ Downloads Form 4 filings from SEC EDGAR and extracts officers, directors,
887
+ and significant investors (10%+ owners) from each company.
888
+
889
+ Form 4 filings are submitted when insiders buy or sell company stock.
890
+ They contain the person's name, role (officer/director), and company.
891
+
892
+ Rate limited to 5 requests/second to comply with SEC guidelines.
893
+
894
+ \b
895
+ Examples:
896
+ corp-extractor db import-sec-officers --limit 1000
897
+ corp-extractor db import-sec-officers --start-year 2023
898
+ corp-extractor db import-sec-officers --resume
899
+ corp-extractor db import-sec-officers --skip-existing -v
900
+ """
901
+ _configure_logging(verbose)
902
+
903
+ from .database.store import get_person_database, get_database, DEFAULT_DB_PATH
904
+ from .database.embeddings import CompanyEmbedder
905
+ from .database.importers.sec_form4 import SecForm4Importer
906
+
907
+ # Default database path
908
+ if db_path is None:
909
+ db_path_obj = DEFAULT_DB_PATH
910
+ else:
911
+ db_path_obj = Path(db_path)
912
+
913
+ click.echo(f"Importing SEC Form 4 officers/directors to {db_path_obj}...", err=True)
914
+ click.echo(f"Year range: {start_year} - {end_year or 'current'}", err=True)
915
+ if resume:
916
+ click.echo("Resuming from saved progress...", err=True)
917
+
918
+ # Initialize components
919
+ database = get_person_database(db_path=db_path_obj)
920
+ org_database = get_database(db_path=db_path_obj)
921
+ embedder = CompanyEmbedder()
922
+ importer = SecForm4Importer()
923
+
924
+ # Import records in batches
925
+ records = []
926
+ count = 0
927
+ skipped_existing = 0
928
+
929
+ def progress_callback(year: int, quarter: int, filing_idx: int, accession: str, total: int) -> None:
930
+ if verbose and filing_idx % 100 == 0:
931
+ click.echo(f" {year} Q{quarter}: {filing_idx} filings, {total} records", err=True)
932
+
933
+ for record in importer.import_range(
934
+ start_year=start_year,
935
+ end_year=end_year,
936
+ limit=limit,
937
+ resume=resume,
938
+ progress_callback=progress_callback,
939
+ ):
940
+ # Skip existing records if flag is set
941
+ if skip_existing:
942
+ existing = database.get_by_source_id(record.source, record.source_id)
943
+ if existing is not None:
944
+ skipped_existing += 1
945
+ continue
946
+
947
+ # Look up org ID by CIK if available
948
+ issuer_cik = record.record.get("issuer_cik", "")
949
+ if issuer_cik:
950
+ org_id = org_database.get_id_by_source_id("sec_edgar", issuer_cik.zfill(10))
951
+ if org_id is not None:
952
+ record.known_for_org_id = org_id
953
+
954
+ records.append(record)
955
+
956
+ if len(records) >= batch_size:
957
+ embedding_texts = [r.get_embedding_text() for r in records]
958
+ embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(embedding_texts)
959
+ database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
960
+ count += len(records)
961
+ click.echo(f"Imported {count} records...", err=True)
962
+ records = []
963
+
964
+ # Final batch
965
+ if records:
966
+ embedding_texts = [r.get_embedding_text() for r in records]
967
+ embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(embedding_texts)
968
+ database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
969
+ count += len(records)
970
+
971
+ if skip_existing and skipped_existing > 0:
972
+ click.echo(f"\nImported {count} SEC officers/directors (skipped {skipped_existing} existing).", err=True)
973
+ else:
974
+ click.echo(f"\nImported {count} SEC officers/directors successfully.", err=True)
975
+
976
+ org_database.close()
977
+ database.close()
978
+
979
+
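Editor's note: the import-sec-officers help above states that requests are limited to 5 per second to stay within SEC EDGAR's fair-access guidelines. The throttle is implemented inside SecForm4Importer and is not shown in this diff; a minimal sketch of such a limiter (names hypothetical) might look like this:

import time

class RequestThrottle:
    # Enforce a minimum interval between requests: 5 req/s means 0.2 s apart.
    def __init__(self, per_second: float = 5.0) -> None:
        self.interval = 1.0 / per_second
        self._last = 0.0

    def wait(self) -> None:
        # Sleep just long enough that consecutive calls are `interval` apart.
        now = time.monotonic()
        remaining = self._last + self.interval - now
        if remaining > 0:
            time.sleep(remaining)
        self._last = time.monotonic()

Calling wait() before each EDGAR fetch keeps the effective rate at or below five requests per second regardless of how quickly filings are parsed.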
980
+ @db_cmd.command("import-ch-officers")
981
+ @click.option("--file", "file_path", type=click.Path(exists=True), required=True, help="Path to CH officers zip file (Prod195)")
982
+ @click.option("--db", "db_path", type=click.Path(), help="Database path")
983
+ @click.option("--limit", type=int, help="Limit number of records")
984
+ @click.option("--batch-size", type=int, default=1000, help="Batch size for commits (default: 1000)")
985
+ @click.option("--resume", is_flag=True, help="Resume from saved progress")
986
+ @click.option("--include-resigned", is_flag=True, help="Include resigned officers (default: current only)")
987
+ @click.option("--skip-existing", is_flag=True, help="Skip records that already exist")
988
+ @click.option("-v", "--verbose", is_flag=True, help="Verbose output")
989
+ def db_import_ch_officers(file_path: str, db_path: Optional[str], limit: Optional[int], batch_size: int, resume: bool, include_resigned: bool, skip_existing: bool, verbose: bool):
990
+ """
991
+ Import Companies House officers data into the people database.
992
+
993
+ Requires the Prod195 bulk officers zip file from Companies House.
994
+ Request access via BulkProducts@companieshouse.gov.uk.
995
+
996
+ \b
997
+ Examples:
998
+ corp-extractor db import-ch-officers --file officers.zip --limit 10000
999
+ corp-extractor db import-ch-officers --file officers.zip --resume
1000
+ corp-extractor db import-ch-officers --file officers.zip --include-resigned
1001
+ """
1002
+ _configure_logging(verbose)
1003
+
1004
+ from .database.store import get_person_database, get_database, DEFAULT_DB_PATH
1005
+ from .database.embeddings import CompanyEmbedder
1006
+ from .database.importers.companies_house_officers import CompaniesHouseOfficersImporter
1007
+
1008
+ # Default database path
1009
+ if db_path is None:
1010
+ db_path_obj = DEFAULT_DB_PATH
1011
+ else:
1012
+ db_path_obj = Path(db_path)
1013
+
1014
+ click.echo(f"Importing Companies House officers to {db_path_obj}...", err=True)
1015
+ if resume:
1016
+ click.echo("Resuming from saved progress...", err=True)
1017
+
1018
+ # Initialize components
1019
+ database = get_person_database(db_path=db_path_obj)
1020
+ org_database = get_database(db_path=db_path_obj)
1021
+ embedder = CompanyEmbedder()
1022
+ importer = CompaniesHouseOfficersImporter()
1023
+
1024
+ # Import records in batches
1025
+ records = []
1026
+ count = 0
1027
+ skipped_existing = 0
1028
+
1029
+ def progress_callback(file_idx: int, line_num: int, total: int) -> None:
1030
+ if verbose:
1031
+ click.echo(f" File {file_idx}: line {line_num}, {total} records", err=True)
1032
+
1033
+ for record in importer.import_from_zip(
1034
+ file_path,
1035
+ limit=limit,
1036
+ resume=resume,
1037
+ current_only=not include_resigned,
1038
+ progress_callback=progress_callback,
1039
+ ):
1040
+ # Skip existing records if flag is set
1041
+ if skip_existing:
1042
+ existing = database.get_by_source_id(record.source, record.source_id)
1043
+ if existing is not None:
1044
+ skipped_existing += 1
1045
+ continue
1046
+
1047
+ # Look up org ID by company number if available
1048
+ company_number = record.record.get("company_number", "")
1049
+ if company_number:
1050
+ org_id = org_database.get_id_by_source_id("companies_house", company_number)
1051
+ if org_id is not None:
1052
+ record.known_for_org_id = org_id
1053
+
1054
+ records.append(record)
1055
+
1056
+ if len(records) >= batch_size:
1057
+ embedding_texts = [r.get_embedding_text() for r in records]
1058
+ embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(embedding_texts)
1059
+ database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
1060
+ count += len(records)
1061
+ click.echo(f"Imported {count} records...", err=True)
1062
+ records = []
1063
+
1064
+ # Final batch
1065
+ if records:
1066
+ embedding_texts = [r.get_embedding_text() for r in records]
1067
+ embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(embedding_texts)
1068
+ database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
1069
+ count += len(records)
1070
+
1071
+ if skip_existing and skipped_existing > 0:
1072
+ click.echo(f"\nImported {count} CH officers (skipped {skipped_existing} existing).", err=True)
1073
+ else:
1074
+ click.echo(f"\nImported {count} CH officers successfully.", err=True)
1075
+
1076
+ org_database.close()
1077
+ database.close()
1078
+
1079
+
871
1080
  @db_cmd.command("import-wikidata")
872
1081
  @click.option("--db", "db_path", type=click.Path(), help="Database path")
873
1082
  @click.option("--limit", type=int, help="Limit number of records")
@@ -921,8 +1130,8 @@ def db_import_wikidata(db_path: Optional[str], limit: Optional[int], batch_size:
921
1130
 
922
1131
  if len(records) >= batch_size:
923
1132
  names = [r.name for r in records]
924
- embeddings = embedder.embed_batch(names)
925
- database.insert_batch(records, embeddings)
1133
+ embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(names)
1134
+ database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
926
1135
  count += len(records)
927
1136
  click.echo(f"Imported {count} records...", err=True)
928
1137
  records = []
@@ -930,8 +1139,8 @@ def db_import_wikidata(db_path: Optional[str], limit: Optional[int], batch_size:
930
1139
  # Final batch
931
1140
  if records:
932
1141
  names = [r.name for r in records]
933
- embeddings = embedder.embed_batch(names)
934
- database.insert_batch(records, embeddings)
1142
+ embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(names)
1143
+ database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
935
1144
  count += len(records)
936
1145
 
937
1146
  click.echo(f"\nImported {count} Wikidata records successfully.", err=True)
@@ -947,23 +1156,32 @@ def db_import_wikidata(db_path: Optional[str], limit: Optional[int], batch_size:
947
1156
  "academic", "scientist", "journalist", "entrepreneur", "activist"
948
1157
  ]), default="executive", help="Person type to import")
949
1158
  @click.option("--all", "import_all", is_flag=True, help="Run all person type queries sequentially")
1159
+ @click.option("--enrich", is_flag=True, help="Query individual people to get role/org data (slower, resumable)")
1160
+ @click.option("--enrich-only", is_flag=True, help="Only enrich existing people (skip bulk import)")
1161
+ @click.option("--enrich-dates", is_flag=True, help="Query individual people to get start/end dates (slower)")
1162
+ @click.option("--skip-existing", is_flag=True, help="Skip records that already exist (default: update them)")
950
1163
  @click.option("-v", "--verbose", is_flag=True, help="Verbose output")
951
- def db_import_people(db_path: Optional[str], limit: Optional[int], batch_size: int, query_type: str, import_all: bool, verbose: bool):
1164
+ def db_import_people(db_path: Optional[str], limit: Optional[int], batch_size: int, query_type: str, import_all: bool, enrich: bool, enrich_only: bool, enrich_dates: bool, skip_existing: bool, verbose: bool):
952
1165
  """
953
1166
  Import notable people data from Wikidata via SPARQL.
954
1167
 
1168
+ Uses a two-phase approach for reliability:
1169
+ 1. Bulk import: Fast fetch of QID, name, country (no timeouts)
1170
+ 2. Enrich (optional): Per-person queries for role/org/dates
1171
+
955
1172
  Imports people with English Wikipedia articles (ensures notability).
956
- Includes executives, politicians, athletes, artists, academics, and more.
957
1173
 
958
1174
  \b
959
1175
  Examples:
960
1176
  corp-extractor db import-people --type executive --limit 5000
961
1177
  corp-extractor db import-people --all --limit 10000
1178
+ corp-extractor db import-people --type executive --enrich
1179
+ corp-extractor db import-people --enrich-only --limit 100
962
1180
  corp-extractor db import-people --type politician -v
963
1181
  """
964
1182
  _configure_logging(verbose)
965
1183
 
966
- from .database.store import get_person_database, DEFAULT_DB_PATH
1184
+ from .database.store import get_person_database, get_database, DEFAULT_DB_PATH
967
1185
  from .database.embeddings import CompanyEmbedder
968
1186
  from .database.importers.wikidata_people import WikidataPeopleImporter
969
1187
 
@@ -977,35 +1195,677 @@ def db_import_people(db_path: Optional[str], limit: Optional[int], batch_size: i
977
1195
 
978
1196
  # Initialize components
979
1197
  database = get_person_database(db_path=db_path_obj)
1198
+ org_database = get_database(db_path=db_path_obj)
980
1199
  embedder = CompanyEmbedder()
981
1200
  importer = WikidataPeopleImporter(batch_size=batch_size)
982
1201
 
983
- # Batch processing
984
- records = []
985
1202
  count = 0
986
1203
 
987
- for record in importer.import_from_sparql(limit=limit, query_type=query_type, import_all=import_all):
988
- records.append(record)
1204
+ # Phase 1: Bulk import (fast, minimal data) - skip if --enrich-only
1205
+ if not enrich_only:
1206
+ records = []
1207
+ skipped_existing = 0
989
1208
 
990
- if len(records) >= batch_size:
991
- # Generate embeddings using the combined name|role|org format
1209
+ click.echo("Phase 1: Bulk import (QID, name, country)...", err=True)
1210
+
1211
+ for record in importer.import_from_sparql(limit=limit, query_type=query_type, import_all=import_all):
1212
+ # Skip existing records if flag is set
1213
+ if skip_existing:
1214
+ existing = database.get_by_source_id(record.source, record.source_id)
1215
+ if existing is not None:
1216
+ skipped_existing += 1
1217
+ continue
1218
+
1219
+ records.append(record)
1220
+
1221
+ if len(records) >= batch_size:
1222
+ # Generate embeddings (both float32 and int8)
1223
+ embedding_texts = [r.get_embedding_text() for r in records]
1224
+ embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(embedding_texts)
1225
+ database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
1226
+ count += len(records)
1227
+
1228
+ click.echo(f" Imported {count} people...", err=True)
1229
+ records = []
1230
+
1231
+ # Final batch
1232
+ if records:
992
1233
  embedding_texts = [r.get_embedding_text() for r in records]
993
- embeddings = embedder.embed_batch(embedding_texts)
994
- database.insert_batch(records, embeddings)
1234
+ embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(embedding_texts)
1235
+ database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
995
1236
  count += len(records)
996
- click.echo(f" Imported {count} people...", err=True)
997
- records = []
998
1237
 
999
- # Final batch
1000
- if records:
1001
- embedding_texts = [r.get_embedding_text() for r in records]
1002
- embeddings = embedder.embed_batch(embedding_texts)
1003
- database.insert_batch(records, embeddings)
1004
- count += len(records)
1238
+ if skip_existing and skipped_existing > 0:
1239
+ click.echo(f"\nPhase 1 complete: {count} people imported (skipped {skipped_existing} existing).", err=True)
1240
+ else:
1241
+ click.echo(f"\nPhase 1 complete: {count} people imported.", err=True)
1242
+ else:
1243
+ click.echo("Skipping Phase 1 (bulk import) - using existing database records.", err=True)
1244
+ # Enable enrich if enrich_only is set
1245
+ enrich = True
1246
+
1247
+ # Phase 2: Enrich with role/org/dates (optional, slower but resumable)
1248
+ if enrich:
1249
+ click.echo("\nPhase 2: Enriching with role/org/dates (parallel queries)...", err=True)
1250
+ # Get all people without role/org
1251
+ people_to_enrich = []
1252
+ enriched_count = 0
1253
+ for record in database.iter_records():
1254
+ if not record.known_for_role and not record.known_for_org:
1255
+ people_to_enrich.append(record)
1256
+ enriched_count += 1
1257
+ # Apply limit if --enrich-only
1258
+ if enrich_only and limit and enriched_count >= limit:
1259
+ break
1260
+
1261
+ if people_to_enrich:
1262
+ click.echo(f"Found {len(people_to_enrich)} people to enrich...", err=True)
1263
+ importer.enrich_people_role_org_batch(people_to_enrich, delay_seconds=0.1, max_workers=5)
1264
+
1265
+ # Persist the enriched data and re-generate embeddings
1266
+ updated = 0
1267
+ org_count = 0
1268
+ date_count = 0
1269
+ for person in people_to_enrich:
1270
+ if person.known_for_role or person.known_for_org:
1271
+ # Look up org ID if we have org_qid
1272
+ org_qid = person.record.get("org_qid", "")
1273
+ if org_qid:
1274
+ org_id = org_database.get_id_by_source_id("wikipedia", org_qid)
1275
+ if org_id is not None:
1276
+ person.known_for_org_id = org_id
1277
+
1278
+ # Update the record with new role/org/dates and re-embed
1279
+ new_embedding_text = person.get_embedding_text()
1280
+ new_embedding = embedder.embed(new_embedding_text)
1281
+ if database.update_role_org(
1282
+ person.source, person.source_id,
1283
+ person.known_for_role, person.known_for_org,
1284
+ person.known_for_org_id, new_embedding,
1285
+ person.from_date, person.to_date,
1286
+ ):
1287
+ updated += 1
1288
+ if person.known_for_org:
1289
+ org_count += 1
1290
+ if person.from_date or person.to_date:
1291
+ date_count += 1
1292
+ if verbose:
1293
+ date_str = ""
1294
+ if person.from_date or person.to_date:
1295
+ date_str = f" ({person.from_date or '?'} - {person.to_date or '?'})"
1296
+ click.echo(f" {person.name}: {person.known_for_role} at {person.known_for_org}{date_str}", err=True)
1297
+
1298
+ click.echo(f"Updated {updated} people ({org_count} with orgs, {date_count} with dates).", err=True)
1299
+
1300
+ # Phase 3: Enrich with dates (optional, even slower)
1301
+ if enrich_dates:
1302
+ click.echo("\nPhase 3: Enriching with dates...", err=True)
1303
+ # Get all people without dates but with role (dates are associated with positions)
1304
+ people_to_enrich = []
1305
+ for record in database.iter_records():
1306
+ if not record.from_date and not record.to_date and record.known_for_role:
1307
+ people_to_enrich.append(record)
1308
+
1309
+ if people_to_enrich:
1310
+ click.echo(f"Found {len(people_to_enrich)} people to enrich with dates...", err=True)
1311
+ enriched = importer.enrich_people_batch(people_to_enrich, delay_seconds=0.3)
1312
+
1313
+ # Persist the enriched dates
1314
+ updated = 0
1315
+ for person in people_to_enrich:
1316
+ if person.from_date or person.to_date:
1317
+ if database.update_dates(person.source, person.source_id, person.from_date, person.to_date):
1318
+ updated += 1
1319
+ if verbose:
1320
+ click.echo(f" {person.name}: {person.from_date or '?'} - {person.to_date or '?'}", err=True)
1321
+
1322
+ click.echo(f"Updated {updated} people with dates.", err=True)
1323
+
1324
+ org_database.close()
1325
+ database.close()
1326
+
1327
+
1328
+ @db_cmd.command("import-wikidata-dump")
1329
+ @click.option("--dump", "dump_path", type=click.Path(exists=True), help="Path to Wikidata JSON dump file (.bz2 or .gz)")
1330
+ @click.option("--download", is_flag=True, help="Download latest dump first (~100GB)")
1331
+ @click.option("--force", is_flag=True, help="Force re-download even if cached")
1332
+ @click.option("--no-aria2", is_flag=True, help="Don't use aria2c even if available (slower)")
1333
+ @click.option("--db", "db_path", type=click.Path(), help="Database path")
1334
+ @click.option("--people/--no-people", default=True, help="Import people (default: yes)")
1335
+ @click.option("--orgs/--no-orgs", default=True, help="Import organizations (default: yes)")
1336
+ @click.option("--locations/--no-locations", default=False, help="Import locations (default: no)")
1337
+ @click.option("--require-enwiki", is_flag=True, help="Only import orgs with English Wikipedia articles")
1338
+ @click.option("--resume", is_flag=True, help="Resume from last position in dump file (tracks entity index)")
1339
+ @click.option("--skip-updates", is_flag=True, help="Skip Q codes already in database (no updates)")
1340
+ @click.option("--limit", type=int, help="Max records per type (people and/or orgs)")
1341
+ @click.option("--batch-size", type=int, default=10000, help="Batch size for commits (default: 10000)")
1342
+ @click.option("-v", "--verbose", is_flag=True, help="Verbose output")
1343
+ def db_import_wikidata_dump(
1344
+ dump_path: Optional[str],
1345
+ download: bool,
1346
+ force: bool,
1347
+ no_aria2: bool,
1348
+ db_path: Optional[str],
1349
+ people: bool,
1350
+ orgs: bool,
1351
+ locations: bool,
1352
+ require_enwiki: bool,
1353
+ resume: bool,
1354
+ skip_updates: bool,
1355
+ limit: Optional[int],
1356
+ batch_size: int,
1357
+ verbose: bool,
1358
+ ):
1359
+ """
1360
+ Import people, organizations, and locations from Wikidata JSON dump.
1361
+
1362
+ This uses the full Wikidata JSON dump (~100GB compressed) to import
1363
+ all humans and organizations with English Wikipedia articles. This
1364
+ avoids SPARQL query timeouts that occur with large result sets.
1365
+
1366
+ The dump is streamed line-by-line to minimize memory usage.
1367
+
1368
+ \b
1369
+ Features:
1370
+ - No timeouts (processes locally)
1371
+ - Complete coverage (all notable people/orgs)
1372
+ - Resumable with --resume (tracks position in dump file)
1373
+ - Skip existing with --skip-updates (loads existing Q codes)
1374
+ - People like Andy Burnham are captured via occupation (P106)
1375
+ - Locations (countries, cities, regions) with parent hierarchy
1376
+
1377
+ \b
1378
+ Resume options:
1379
+ - --resume: Resume from where the dump processing left off (tracks entity index).
1380
+ Progress is saved after each batch. Use this if import was interrupted.
1381
+ - --skip-updates: Skip Q codes already in database (no updates to existing records).
1382
+ Use this to add new records without re-processing existing ones.
1383
+
1384
+ \b
1385
+ Examples:
1386
+ corp-extractor db import-wikidata-dump --dump /path/to/dump.json.bz2 --limit 10000
1387
+ corp-extractor db import-wikidata-dump --download --people --no-orgs --limit 50000
1388
+ corp-extractor db import-wikidata-dump --dump dump.json.bz2 --orgs --no-people
1389
+ corp-extractor db import-wikidata-dump --dump dump.json.bz2 --locations --no-people --no-orgs # Locations only
1390
+ corp-extractor db import-wikidata-dump --dump dump.json.bz2 --resume # Resume interrupted import
1391
+ corp-extractor db import-wikidata-dump --dump dump.json.bz2 --skip-updates # Skip existing Q codes
1392
+ """
1393
+ _configure_logging(verbose)
1394
+
1395
+ from .database.store import get_person_database, get_database, DEFAULT_DB_PATH
1396
+ from .database.embeddings import CompanyEmbedder
1397
+ from .database.importers.wikidata_dump import WikidataDumpImporter, DumpProgress
1398
+
1399
+ if not dump_path and not download:
1400
+ raise click.UsageError("Either --dump path or --download is required")
1401
+
1402
+ if not people and not orgs and not locations:
1403
+ raise click.UsageError("Must import at least one of --people, --orgs, or --locations")
1404
+
1405
+ # Default database path
1406
+ if db_path is None:
1407
+ db_path_obj = DEFAULT_DB_PATH
1408
+ else:
1409
+ db_path_obj = Path(db_path)
1410
+
1411
+ click.echo(f"Importing Wikidata dump to {db_path_obj}...", err=True)
1412
+
1413
+ # Initialize importer
1414
+ importer = WikidataDumpImporter(dump_path=dump_path)
1415
+
1416
+ # Download if requested
1417
+ if download:
1418
+ import shutil
1419
+ dump_target = importer.get_dump_path()
1420
+ click.echo(f"Downloading Wikidata dump (~100GB) to:", err=True)
1421
+ click.echo(f" {dump_target}", err=True)
1422
+
1423
+ # Check for aria2c
1424
+ has_aria2 = shutil.which("aria2c") is not None
1425
+ use_aria2 = has_aria2 and not no_aria2
1426
+
1427
+ if use_aria2:
1428
+ click.echo(" Using aria2c for fast parallel download (16 connections)", err=True)
1429
+ dump_file = importer.download_dump(force=force, use_aria2=True)
1430
+ click.echo(f"\nUsing dump: {dump_file}", err=True)
1431
+ else:
1432
+ if not has_aria2:
1433
+ click.echo("", err=True)
1434
+ click.echo(" TIP: Install aria2c for 10-20x faster downloads:", err=True)
1435
+ click.echo(" brew install aria2 (macOS)", err=True)
1436
+ click.echo(" apt install aria2 (Ubuntu/Debian)", err=True)
1437
+ click.echo("", err=True)
1438
+
1439
+ # Use urllib to get content length first
1440
+ import urllib.request
1441
+ req = urllib.request.Request(
1442
+ "https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2",
1443
+ headers={"User-Agent": "corp-extractor/1.0"},
1444
+ method="HEAD"
1445
+ )
1446
+ with urllib.request.urlopen(req) as response:
1447
+ total_size = int(response.headers.get("content-length", 0))
1448
+
1449
+ if total_size:
1450
+ total_gb = total_size / (1024 ** 3)
1451
+ click.echo(f" Size: {total_gb:.1f} GB", err=True)
1452
+
1453
+ # Download with progress bar
1454
+ progress_bar = None
1455
+
1456
+ def update_progress(downloaded: int, total: int) -> None:
1457
+ nonlocal progress_bar
1458
+ if progress_bar is None and total > 0:
1459
+ progress_bar = click.progressbar(
1460
+ length=total,
1461
+ label="Downloading",
1462
+ show_percent=True,
1463
+ show_pos=True,
1464
+ item_show_func=lambda x: f"{(x or 0) / (1024**3):.1f} GB" if x else "",
1465
+ )
1466
+ progress_bar.__enter__()
1467
+ if progress_bar:
1468
+ # Update to absolute position
1469
+ progress_bar.update(downloaded - progress_bar.pos)
1470
+
1471
+ try:
1472
+ dump_file = importer.download_dump(force=force, use_aria2=False, progress_callback=update_progress)
1473
+ finally:
1474
+ if progress_bar:
1475
+ progress_bar.__exit__(None, None, None)
1476
+
1477
+ click.echo(f"\nUsing dump: {dump_file}", err=True)
1478
+ elif dump_path:
1479
+ click.echo(f"Using dump: {dump_path}", err=True)
1480
+
1481
+ # Initialize embedder (loads model, may take time on first run)
1482
+ click.echo("Loading embedding model...", err=True)
1483
+ sys.stderr.flush()
1484
+ embedder = CompanyEmbedder()
1485
+ click.echo("Embedding model loaded.", err=True)
1486
+ sys.stderr.flush()
1487
+
1488
+ # Load existing QID labels from database and seed the importer's cache
1489
+ database = get_person_database(db_path=db_path_obj)
1490
+ existing_labels = database.get_all_qid_labels()
1491
+ if existing_labels:
1492
+ click.echo(f"Loaded {len(existing_labels):,} existing QID labels from DB", err=True)
1493
+ importer.set_label_cache(existing_labels)
1494
+ known_qids_at_start = set(existing_labels.keys())
1495
+
1496
+ # Load existing source_ids for skip_updates mode
1497
+ existing_people_ids: set[str] = set()
1498
+ existing_org_ids: set[str] = set()
1499
+ if skip_updates:
1500
+ click.echo("Loading existing records for --skip-updates...", err=True)
1501
+ if people:
1502
+ existing_people_ids = database.get_all_source_ids(source="wikidata")
1503
+ click.echo(f" Found {len(existing_people_ids):,} existing people Q codes", err=True)
1504
+ if orgs:
1505
+ org_database = get_database(db_path=db_path_obj)
1506
+ existing_org_ids = org_database.get_all_source_ids(source="wikipedia")
1507
+ click.echo(f" Found {len(existing_org_ids):,} existing org Q codes", err=True)
1508
+
1509
+ # Load progress for resume mode (position-based resume)
1510
+ progress: Optional[DumpProgress] = None
1511
+ start_index = 0
1512
+ if resume:
1513
+ progress = DumpProgress.load()
1514
+ if progress:
1515
+ # Verify the progress is for the same dump file
1516
+ actual_dump_path = importer._dump_path or Path(dump_path) if dump_path else importer.get_dump_path()
1517
+ if progress.matches_dump(actual_dump_path):
1518
+ start_index = progress.entity_index
1519
+ click.echo(f"Resuming from entity index {start_index:,}", err=True)
1520
+ click.echo(f" Last entity: {progress.last_entity_id}", err=True)
1521
+ click.echo(f" Last updated: {progress.last_updated}", err=True)
1522
+ else:
1523
+ click.echo("Warning: Progress file is for a different dump, starting from beginning", err=True)
1524
+ progress = None
1525
+ else:
1526
+ click.echo("No progress file found, starting from beginning", err=True)
1527
+
1528
+ # Initialize progress tracking
1529
+ if progress is None:
1530
+ actual_dump_path = importer._dump_path or Path(dump_path) if dump_path else importer.get_dump_path()
1531
+ progress = DumpProgress(
1532
+ dump_path=str(actual_dump_path),
1533
+ dump_size=actual_dump_path.stat().st_size if actual_dump_path.exists() else 0,
1534
+ )
1535
+
1536
+ # Helper to persist new labels after each batch
1537
+ def persist_new_labels() -> int:
1538
+ new_labels = importer.get_new_labels_since(known_qids_at_start)
1539
+ if new_labels:
1540
+ database.insert_qid_labels(new_labels)
1541
+ known_qids_at_start.update(new_labels.keys())
1542
+ return len(new_labels)
1543
+ return 0
1544
+
1545
+ # ========================================
1546
+ # Location-only import (separate pass)
1547
+ # ========================================
1548
+ if locations and not people and not orgs:
1549
+ from .database.store import get_locations_database
1550
+
1551
+ click.echo("\n=== Location Import ===", err=True)
1552
+ click.echo(f" Locations: {'up to ' + str(limit) + ' records' if limit else 'unlimited'}", err=True)
1553
+ if require_enwiki:
1554
+ click.echo(" Filter: only locations with English Wikipedia articles", err=True)
1555
+
1556
+ # Initialize locations database
1557
+ locations_database = get_locations_database(db_path=db_path_obj)
1558
+
1559
+ # Load existing location Q codes for skip_updates mode
1560
+ existing_location_ids: set[str] = set()
1561
+ if skip_updates:
1562
+ existing_location_ids = locations_database.get_all_source_ids(source="wikidata")
1563
+ click.echo(f" Skip updates: {len(existing_location_ids):,} existing Q codes", err=True)
1564
+
1565
+ if start_index > 0:
1566
+ click.echo(f" Resuming from entity index {start_index:,}", err=True)
1567
+
1568
+ location_records: list = []
1569
+ locations_count = 0
1570
+ last_entity_index = start_index
1571
+ last_entity_id = ""
1572
+
1573
+ def location_progress_callback(entity_index: int, entity_id: str, loc_count: int) -> None:
1574
+ nonlocal last_entity_index, last_entity_id
1575
+ last_entity_index = entity_index
1576
+ last_entity_id = entity_id
1577
+
1578
+ def save_location_progress() -> None:
1579
+ if progress:
1580
+ progress.entity_index = last_entity_index
1581
+ progress.last_entity_id = last_entity_id
1582
+ progress.save()
1583
+
1584
+ def flush_location_batch() -> None:
1585
+ nonlocal location_records, locations_count
1586
+ if location_records:
1587
+ inserted = locations_database.insert_batch(location_records)
1588
+ locations_count += inserted
1589
+ location_records = []
1590
+
1591
+ click.echo("Starting dump iteration...", err=True)
1592
+ sys.stderr.flush()
1593
+
1594
+ try:
1595
+ if limit:
1596
+ # Use progress bar when we have limits
1597
+ with click.progressbar(
1598
+ length=limit,
1599
+ label="Processing dump",
1600
+ show_percent=True,
1601
+ show_pos=True,
1602
+ ) as pbar:
1603
+ for record in importer.import_locations(
1604
+ limit=limit,
1605
+ require_enwiki=require_enwiki,
1606
+ skip_ids=existing_location_ids if skip_updates else None,
1607
+ start_index=start_index,
1608
+ progress_callback=location_progress_callback,
1609
+ ):
1610
+ pbar.update(1)
1611
+ location_records.append(record)
1612
+ if len(location_records) >= batch_size:
1613
+ flush_location_batch()
1614
+ persist_new_labels()
1615
+ save_location_progress()
1616
+ else:
1617
+ # No limit - show counter updates
1618
+ for record in importer.import_locations(
1619
+ limit=None,
1620
+ require_enwiki=require_enwiki,
1621
+ skip_ids=existing_location_ids if skip_updates else None,
1622
+ start_index=start_index,
1623
+ progress_callback=location_progress_callback,
1624
+ ):
1625
+ location_records.append(record)
1626
+ if len(location_records) >= batch_size:
1627
+ flush_location_batch()
1628
+ persist_new_labels()
1629
+ save_location_progress()
1630
+ click.echo(f"\r Progress: {locations_count:,} locations...", nl=False, err=True)
1631
+ sys.stderr.flush()
1632
+
1633
+ click.echo("", err=True) # Newline after counter
1634
+
1635
+ # Final batches
1636
+ flush_location_batch()
1637
+ persist_new_labels()
1638
+ save_location_progress()
1639
+
1640
+ finally:
1641
+ # Ensure we save progress even on interrupt
1642
+ save_location_progress()
1643
+
1644
+ click.echo(f"\nLocation import complete: {locations_count:,} locations", err=True)
1645
+
1646
+ # Final label resolution
1647
+ click.echo("\n=== Final QID Label Resolution ===", err=True)
1648
+ all_labels = importer.get_label_cache()
1649
+ click.echo(f" Total labels in cache: {len(all_labels):,}", err=True)
1650
+
1651
+ # Final stats
1652
+ final_label_count = database.get_qid_labels_count()
1653
+ click.echo(f" Total labels in DB: {final_label_count:,}", err=True)
1654
+
1655
+ locations_database.close()
1656
+ database.close()
1657
+ click.echo("\nWikidata dump import complete!", err=True)
1658
+ return
1005
1659
 
1006
- click.echo(f"\nImported {count} people successfully.", err=True)
1660
+ # Combined import - single pass through the dump for both people and orgs
1661
+ click.echo("\n=== Combined Import (single dump pass) ===", err=True)
1662
+ sys.stderr.flush() # Ensure output is visible immediately
1663
+ if people:
1664
+ click.echo(f" People: {'up to ' + str(limit) + ' records' if limit else 'unlimited'}", err=True)
1665
+ if skip_updates and existing_people_ids:
1666
+ click.echo(f" Skip updates: {len(existing_people_ids):,} existing Q codes", err=True)
1667
+ if orgs:
1668
+ click.echo(f" Orgs: {'up to ' + str(limit) + ' records' if limit else 'unlimited'}", err=True)
1669
+ if require_enwiki:
1670
+ click.echo(" Filter: only orgs with English Wikipedia articles", err=True)
1671
+ if skip_updates and existing_org_ids:
1672
+ click.echo(f" Skip updates: {len(existing_org_ids):,} existing Q codes", err=True)
1673
+ if start_index > 0:
1674
+ click.echo(f" Resuming from entity index {start_index:,}", err=True)
1675
+
1676
+ # Initialize databases
1677
+ person_database = get_person_database(db_path=db_path_obj)
1678
+ org_database = get_database(db_path=db_path_obj) if orgs else None
1679
+
1680
+ # Batches for each type
1681
+ people_records: list = []
1682
+ org_records: list = []
1683
+ people_count = 0
1684
+ orgs_count = 0
1685
+ last_entity_index = start_index
1686
+ last_entity_id = ""
1687
+
1688
+ def combined_progress_callback(entity_index: int, entity_id: str, ppl_count: int, org_count: int) -> None:
1689
+ nonlocal last_entity_index, last_entity_id
1690
+ last_entity_index = entity_index
1691
+ last_entity_id = entity_id
1692
+
1693
+ def save_progress() -> None:
1694
+ if progress:
1695
+ progress.entity_index = last_entity_index
1696
+ progress.last_entity_id = last_entity_id
1697
+ progress.people_yielded = people_count
1698
+ progress.orgs_yielded = orgs_count
1699
+ progress.save()
1700
+
1701
+ def flush_people_batch() -> None:
1702
+ nonlocal people_records, people_count
1703
+ if people_records:
1704
+ embedding_texts = [r.get_embedding_text() for r in people_records]
1705
+ embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(embedding_texts)
1706
+ person_database.insert_batch(people_records, embeddings, scalar_embeddings=scalar_embeddings)
1707
+ people_count += len(people_records)
1708
+ people_records = []
1709
+
1710
+ def flush_org_batch() -> None:
1711
+ nonlocal org_records, orgs_count
1712
+ if org_records and org_database:
1713
+ names = [r.name for r in org_records]
1714
+ embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(names)
1715
+ org_database.insert_batch(org_records, embeddings, scalar_embeddings=scalar_embeddings)
1716
+ orgs_count += len(org_records)
1717
+ org_records = []
1718
+
1719
+ # Calculate total for progress bar (if limits set for both)
1720
+ total_limit = None
1721
+ if limit and people and orgs:
1722
+ total_limit = limit * 2 # Rough estimate
1723
+ elif limit:
1724
+ total_limit = limit
1725
+
1726
+ click.echo("Starting dump iteration...", err=True)
1727
+ sys.stderr.flush()
1728
+
1729
+ records_seen = 0
1730
+ try:
1731
+ if total_limit:
1732
+ # Use progress bar when we have limits
1733
+ with click.progressbar(
1734
+ length=total_limit,
1735
+ label="Processing dump",
1736
+ show_percent=True,
1737
+ show_pos=True,
1738
+ ) as pbar:
1739
+ for record_type, record in importer.import_all(
1740
+ people_limit=limit if people else 0,
1741
+ orgs_limit=limit if orgs else 0,
1742
+ import_people=people,
1743
+ import_orgs=orgs,
1744
+ require_enwiki=require_enwiki,
1745
+ skip_people_ids=existing_people_ids if skip_updates else None,
1746
+ skip_org_ids=existing_org_ids if skip_updates else None,
1747
+ start_index=start_index,
1748
+ progress_callback=combined_progress_callback,
1749
+ ):
1750
+ records_seen += 1
1751
+ pbar.update(1)
1752
+
1753
+ if record_type == "person":
1754
+ people_records.append(record)
1755
+ if len(people_records) >= batch_size:
1756
+ flush_people_batch()
1757
+ persist_new_labels()
1758
+ save_progress()
1759
+ else: # org
1760
+ org_records.append(record)
1761
+ if len(org_records) >= batch_size:
1762
+ flush_org_batch()
1763
+ persist_new_labels()
1764
+ save_progress()
1765
+ else:
1766
+ # No limit - show counter updates
1767
+ for record_type, record in importer.import_all(
1768
+ people_limit=None,
1769
+ orgs_limit=None,
1770
+ import_people=people,
1771
+ import_orgs=orgs,
1772
+ require_enwiki=require_enwiki,
1773
+ skip_people_ids=existing_people_ids if skip_updates else None,
1774
+ skip_org_ids=existing_org_ids if skip_updates else None,
1775
+ start_index=start_index,
1776
+ progress_callback=combined_progress_callback,
1777
+ ):
1778
+ records_seen += 1
1779
+ # Show first record immediately as proof of life
1780
+ if records_seen == 1:
1781
+ click.echo(f" First record found: {record.name}", err=True)
1782
+ sys.stderr.flush()
1783
+
1784
+ if record_type == "person":
1785
+ people_records.append(record)
1786
+ if len(people_records) >= batch_size:
1787
+ flush_people_batch()
1788
+ persist_new_labels()
1789
+ save_progress()
1790
+ click.echo(f"\r Progress: {people_count:,} people, {orgs_count:,} orgs...", nl=False, err=True)
1791
+ sys.stderr.flush()
1792
+ else: # org
1793
+ org_records.append(record)
1794
+ if len(org_records) >= batch_size:
1795
+ flush_org_batch()
1796
+ persist_new_labels()
1797
+ save_progress()
1798
+ click.echo(f"\r Progress: {people_count:,} people, {orgs_count:,} orgs...", nl=False, err=True)
1799
+ sys.stderr.flush()
1800
+
1801
+ click.echo("", err=True) # Newline after counter
1802
+
1803
+ # Final batches
1804
+ flush_people_batch()
1805
+ flush_org_batch()
1806
+ persist_new_labels()
1807
+ save_progress()
1808
+
1809
+ finally:
1810
+ # Ensure we save progress even on interrupt
1811
+ save_progress()
1812
+
1813
+ click.echo(f"Import complete: {people_count:,} people, {orgs_count:,} orgs", err=True)
1814
+
1815
+ # Keep references for final label resolution
1816
+ database = person_database
1817
+ if org_database:
1818
+ org_database.close()
1819
+
1820
+ # Final label resolution pass for any remaining unresolved QIDs
1821
+ click.echo("\n=== Final QID Label Resolution ===", err=True)
1822
+
1823
+ # Get the full label cache (includes labels from DB + new ones from import)
1824
+ all_labels = importer.get_label_cache()
1825
+ click.echo(f" Total labels in cache: {len(all_labels):,}", err=True)
1826
+
1827
+ # Check for any remaining unresolved QIDs in the database
1828
+ people_unresolved = database.get_unresolved_qids()
1829
+ click.echo(f" Unresolved QIDs in people: {len(people_unresolved):,}", err=True)
1830
+
1831
+ org_unresolved: set[str] = set()
1832
+ if orgs:
1833
+ org_database = get_database(db_path=db_path_obj)
1834
+ org_unresolved = org_database.get_unresolved_qids()
1835
+ click.echo(f" Unresolved QIDs in orgs: {len(org_unresolved):,}", err=True)
1836
+
1837
+ all_unresolved = people_unresolved | org_unresolved
1838
+ need_sparql = all_unresolved - set(all_labels.keys())
1839
+
1840
+ if need_sparql:
1841
+ click.echo(f" Resolving {len(need_sparql):,} remaining QIDs via SPARQL...", err=True)
1842
+ sparql_resolved = importer.resolve_qids_via_sparql(need_sparql)
1843
+ all_labels.update(sparql_resolved)
1844
+ # Persist newly resolved labels
1845
+ if sparql_resolved:
1846
+ database.insert_qid_labels(sparql_resolved)
1847
+ click.echo(f" SPARQL resolved and stored: {len(sparql_resolved):,}", err=True)
1848
+
1849
+ # Update records with any newly resolved labels
1850
+ if all_labels:
1851
+ updates, deletes = database.resolve_qid_labels(all_labels)
1852
+ if updates or deletes:
1853
+ click.echo(f" People: {updates:,} updated, {deletes:,} duplicates deleted", err=True)
1854
+
1855
+ if orgs:
1856
+ org_database = get_database(db_path=db_path_obj)
1857
+ org_updates, org_deletes = org_database.resolve_qid_labels(all_labels)
1858
+ if org_updates or org_deletes:
1859
+ click.echo(f" Orgs: {org_updates:,} updated, {org_deletes:,} duplicates deleted", err=True)
1860
+ org_database.close()
1861
+
1862
+ # Final stats
1863
+ final_label_count = database.get_qid_labels_count()
1864
+ click.echo(f" Total labels in DB: {final_label_count:,}", err=True)
1007
1865
  database.close()
1008
1866
 
1867
+ click.echo("\nWikidata dump import complete!", err=True)
1868
+
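Editor's note: the command description says the ~100GB dump is streamed line by line to keep memory flat. The actual reader is in wikidata_dump.py and is not part of this hunk; as a rough sketch, the standard pattern for walking a latest-all.json.bz2 dump one entity at a time looks like this (function name illustrative):

import bz2
import json
from pathlib import Path
from typing import Iterator

def iter_dump_entities(dump_path: Path) -> Iterator[dict]:
    # The dump is a single JSON array with one entity object per line,
    # so it can be decoded line by line without loading the whole file.
    with bz2.open(dump_path, mode="rt", encoding="utf-8") as fh:
        for line in fh:
            line = line.strip().rstrip(",")
            if line in ("", "[", "]"):
                continue
            yield json.loads(line)

Counting entities as they are yielded is what makes the --resume entity-index bookkeeping described above cheap to maintain.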
1009
1869
 
1010
1870
  @db_cmd.command("search-people")
1011
1871
  @click.argument("query")
@@ -1134,8 +1994,8 @@ def db_import_companies_house(
1134
1994
 
1135
1995
  if len(records) >= batch_size:
1136
1996
  names = [r.name for r in records]
1137
- embeddings = embedder.embed_batch(names)
1138
- database.insert_batch(records, embeddings)
1997
+ embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(names)
1998
+ database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
1139
1999
  count += len(records)
1140
2000
  click.echo(f"Imported {count} records...", err=True)
1141
2001
  records = []
@@ -1143,8 +2003,8 @@ def db_import_companies_house(
1143
2003
  # Final batch
1144
2004
  if records:
1145
2005
  names = [r.name for r in records]
1146
- embeddings = embedder.embed_batch(names)
1147
- database.insert_batch(records, embeddings)
2006
+ embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(names)
2007
+ database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
1148
2008
  count += len(records)
1149
2009
 
1150
2010
  click.echo(f"\nImported {count} Companies House records successfully.", err=True)
@@ -1163,6 +2023,7 @@ def db_status(db_path: Optional[str]):
1163
2023
  corp-extractor db status --db /path/to/entities.db
1164
2024
  """
1165
2025
  from .database import OrganizationDatabase
2026
+ from .database.store import get_person_database
1166
2027
 
1167
2028
  try:
1168
2029
  database = OrganizationDatabase(db_path=db_path)
@@ -1180,17 +2041,119 @@ def db_status(db_path: Optional[str]):
1180
2041
  click.echo(f"\n⚠️ Missing embeddings: {missing_embeddings:,}")
1181
2042
  click.echo(" Run 'corp-extractor db repair-embeddings' to fix")
1182
2043
 
2044
+ # Show embedding counts (float32 and scalar)
2045
+ org_fp32 = database.get_float32_embedding_count()
2046
+ org_int8 = database.get_scalar_embedding_count()
2047
+ click.echo(f"\nOrganization embeddings:")
2048
+ click.echo(f" float32: {org_fp32:,}")
2049
+ click.echo(f" int8 (scalar): {org_int8:,}")
2050
+ if org_fp32 > 0 and org_int8 < org_fp32:
2051
+ click.echo(f" ⚠️ {org_fp32 - org_int8:,} missing scalar embeddings")
2052
+ click.echo(" Run 'corp-extractor db backfill-scalar' to generate")
2053
+
2054
+ # Person embeddings
2055
+ person_db = get_person_database(db_path=db_path)
2056
+ person_fp32 = person_db.get_float32_embedding_count()
2057
+ person_int8 = person_db.get_scalar_embedding_count()
2058
+ if person_fp32 > 0:
2059
+ click.echo(f"\nPerson embeddings:")
2060
+ click.echo(f" float32: {person_fp32:,}")
2061
+ click.echo(f" int8 (scalar): {person_int8:,}")
2062
+ if person_int8 < person_fp32:
2063
+ click.echo(f" ⚠️ {person_fp32 - person_int8:,} missing scalar embeddings")
2064
+
1183
2065
  if stats.by_source:
1184
2066
  click.echo("\nRecords by source:")
1185
2067
  for source, count in stats.by_source.items():
1186
2068
  click.echo(f" {source}: {count:,}")
1187
2069
 
2070
+ # Show canonicalization stats
2071
+ canon_stats = database.get_canon_stats()
2072
+ if canon_stats["canonicalized_records"] > 0:
2073
+ click.echo("\nCanonicalization:")
2074
+ click.echo(f" Canonicalized: {canon_stats['canonicalized_records']:,} / {canon_stats['total_records']:,}")
2075
+ click.echo(f" Canonical groups: {canon_stats['canonical_groups']:,}")
2076
+ click.echo(f" Multi-record groups: {canon_stats['multi_record_groups']:,}")
2077
+ click.echo(f" Records in multi-groups: {canon_stats['records_in_multi_groups']:,}")
2078
+ else:
2079
+ click.echo("\nCanonicalization: Not run yet")
2080
+ click.echo(" Run 'corp-extractor db canonicalize' to link equivalent records")
2081
+
1188
2082
  database.close()
1189
2083
 
1190
2084
  except Exception as e:
1191
2085
  raise click.ClickException(f"Failed to read database: {e}")
1192
2086
 
1193
2087
 
2088
+ @db_cmd.command("canonicalize")
2089
+ @click.option("--db", "db_path", type=click.Path(), help="Database path")
2090
+ @click.option("--batch-size", type=int, default=10000, help="Batch size for updates (default: 10000)")
2091
+ @click.option("-v", "--verbose", is_flag=True, help="Verbose output")
2092
+ def db_canonicalize(db_path: Optional[str], batch_size: int, verbose: bool):
2093
+ """
2094
+ Canonicalize organizations by linking equivalent records across sources.
2095
+
2096
+ Records are considered equivalent if they share:
2097
+ - Same LEI (globally unique legal entity identifier)
2098
+ - Same ticker symbol
2099
+ - Same CIK (SEC identifier)
2100
+ - Same normalized name (after lowercasing, removing dots)
2101
+ - Same name with suffix expansion (Ltd -> Limited, etc.)
2102
+
2103
+ For each group, the highest-priority source becomes canonical:
2104
+ gleif > sec_edgar > companies_house > wikipedia
2105
+
2106
+ Canonicalization enables better search re-ranking by boosting results
2107
+ that have records from multiple authoritative sources.
2108
+
2109
+ \b
2110
+ Examples:
2111
+ corp-extractor db canonicalize
2112
+ corp-extractor db canonicalize -v
2113
+ corp-extractor db canonicalize --db /path/to/entities.db
2114
+ """
2115
+ _configure_logging(verbose)
2116
+
2117
+ from .database import OrganizationDatabase
2118
+ from .database.store import get_person_database
2119
+
2120
+ try:
2121
+ # Canonicalize organizations
2122
+ database = OrganizationDatabase(db_path=db_path)
2123
+ click.echo("Running organization canonicalization...", err=True)
2124
+
2125
+ result = database.canonicalize(batch_size=batch_size)
2126
+
2127
+ click.echo("\nOrganization Canonicalization Results")
2128
+ click.echo("=" * 40)
2129
+ click.echo(f"Total records processed: {result['total_records']:,}")
2130
+ click.echo(f"Equivalence groups found: {result['groups_found']:,}")
2131
+ click.echo(f"Multi-record groups: {result['multi_record_groups']:,}")
2132
+ click.echo(f"Records updated: {result['records_updated']:,}")
2133
+
2134
+ database.close()
2135
+
2136
+ # Canonicalize people
2137
+ db_path_obj = Path(db_path) if db_path else None
2138
+ person_db = get_person_database(db_path=db_path_obj)
2139
+ click.echo("\nRunning people canonicalization...", err=True)
2140
+
2141
+ people_result = person_db.canonicalize(batch_size=batch_size)
2142
+
2143
+ click.echo("\nPeople Canonicalization Results")
2144
+ click.echo("=" * 40)
2145
+ click.echo(f"Total records processed: {people_result['total_records']:,}")
2146
+ click.echo(f"Matched by organization: {people_result['matched_by_org']:,}")
2147
+ click.echo(f"Matched by date overlap: {people_result['matched_by_date']:,}")
2148
+ click.echo(f"Canonical groups: {people_result['canonical_groups']:,}")
2149
+ click.echo(f"Records in multi-record groups: {people_result['records_in_groups']:,}")
2150
+
2151
+ person_db.close()
2152
+
2153
+ except Exception as e:
2154
+ raise click.ClickException(f"Canonicalization failed: {e}")
2155
+
2156
+
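The equivalence rules listed in this docstring boil down to a union-find over shared identifier keys, with the surviving record picked by source priority. A minimal sketch of that idea; the field names (lei, ticker, cik, name, source) are assumptions for illustration, not the store's actual schema:

# Illustrative sketch only: group records that share an identifier, then order
# each group so the highest-priority source comes first (it would become canonical).
from collections import defaultdict

SOURCE_PRIORITY = {"gleif": 0, "sec_edgar": 1, "companies_house": 2, "wikipedia": 3}

def group_equivalent(records: list[dict]) -> list[list[dict]]:
    parent = list(range(len(records)))

    def find(i: int) -> int:
        while parent[i] != i:
            parent[i] = parent[parent[i]]  # path halving
            i = parent[i]
        return i

    def union(i: int, j: int) -> None:
        parent[find(i)] = find(j)

    seen: dict[tuple, int] = {}
    for idx, rec in enumerate(records):
        keys = [(f, rec[f]) for f in ("lei", "ticker", "cik") if rec.get(f)]
        if rec.get("name"):
            keys.append(("name", rec["name"].lower().replace(".", "")))
        for key in keys:
            if key in seen:
                union(idx, seen[key])
            else:
                seen[key] = idx

    groups: dict[int, list[dict]] = defaultdict(list)
    for idx, rec in enumerate(records):
        groups[find(idx)].append(rec)
    return [
        sorted(g, key=lambda r: SOURCE_PRIORITY.get(r.get("source", ""), 99))
        for g in groups.values()
    ]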
1194
2157
  @db_cmd.command("search")
1195
2158
  @click.argument("query")
1196
2159
  @click.option("--db", "db_path", type=click.Path(), help="Database path")
@@ -1247,10 +2210,9 @@ def db_search(query: str, db_path: Optional[str], top_k: int, source: Optional[s
1247
2210
  @click.option("--repo", type=str, default="Corp-o-Rate-Community/entity-references", help="HuggingFace repo ID")
1248
2211
  @click.option("--db", "db_path", type=click.Path(), help="Output path for database")
1249
2212
  @click.option("--full", is_flag=True, help="Download full version (larger, includes record metadata)")
1250
- @click.option("--no-compress", is_flag=True, help="Download uncompressed version (slower)")
1251
2213
  @click.option("--force", is_flag=True, help="Force re-download")
1252
2214
  @click.option("-v", "--verbose", is_flag=True, help="Verbose output")
1253
- def db_download(repo: str, db_path: Optional[str], full: bool, no_compress: bool, force: bool, verbose: bool):
2215
+ def db_download(repo: str, db_path: Optional[str], full: bool, force: bool, verbose: bool):
1254
2216
  """
1255
2217
  Download entity database from HuggingFace Hub.
1256
2218
 
@@ -1274,7 +2236,6 @@ def db_download(repo: str, db_path: Optional[str], full: bool, no_compress: bool
1274
2236
  repo_id=repo,
1275
2237
  filename=filename,
1276
2238
  force_download=force,
1277
- prefer_compressed=not no_compress,
1278
2239
  )
1279
2240
  click.echo(f"Database downloaded to: {path}")
1280
2241
  except Exception as e:
@@ -1286,27 +2247,23 @@ def db_download(repo: str, db_path: Optional[str], full: bool, no_compress: bool
1286
2247
  @click.option("--repo", type=str, default="Corp-o-Rate-Community/entity-references", help="HuggingFace repo ID")
1287
2248
  @click.option("--message", type=str, default="Update entity database", help="Commit message")
1288
2249
  @click.option("--no-lite", is_flag=True, help="Skip creating lite version (without record data)")
1289
- @click.option("--no-compress", is_flag=True, help="Skip creating compressed versions")
1290
2250
  @click.option("-v", "--verbose", is_flag=True, help="Verbose output")
1291
- def db_upload(db_path: Optional[str], repo: str, message: str, no_lite: bool, no_compress: bool, verbose: bool):
2251
+ def db_upload(db_path: Optional[str], repo: str, message: str, no_lite: bool, verbose: bool):
1292
2252
  """
1293
- Upload entity database to HuggingFace Hub with variants.
2253
+ Upload entity database to HuggingFace Hub.
1294
2254
 
1295
- If no path is provided, uploads from the default cache location.
1296
-
1297
- By default uploads:
2255
+ First VACUUMs the database, then creates and uploads:
1298
2256
  - entities.db (full database)
1299
2257
  - entities-lite.db (without record data, smaller)
1300
- - entities.db.gz (compressed full)
1301
- - entities-lite.db.gz (compressed lite)
1302
2258
 
2259
+ If no path is provided, uploads from the default cache location.
1303
2260
  Requires HF_TOKEN environment variable to be set.
1304
2261
 
1305
2262
  \b
1306
2263
  Examples:
1307
2264
  corp-extractor db upload
1308
2265
  corp-extractor db upload /path/to/entities.db
1309
- corp-extractor db upload --no-lite --no-compress
2266
+ corp-extractor db upload --no-lite
1310
2267
  corp-extractor db upload --repo my-org/my-entity-db
1311
2268
  """
1312
2269
  _configure_logging(verbose)
@@ -1322,10 +2279,9 @@ def db_upload(db_path: Optional[str], repo: str, message: str, no_lite: bool, no
1322
2279
  )
1323
2280
 
1324
2281
  click.echo(f"Uploading {db_path} to {repo}...", err=True)
2282
+ click.echo(" - Running VACUUM to optimize database", err=True)
1325
2283
  if not no_lite:
1326
2284
  click.echo(" - Creating lite version (without record data)", err=True)
1327
- if not no_compress:
1328
- click.echo(" - Creating compressed versions", err=True)
1329
2285
 
1330
2286
  try:
1331
2287
  results = upload_database_with_variants(
@@ -1333,7 +2289,6 @@ def db_upload(db_path: Optional[str], repo: str, message: str, no_lite: bool, no
1333
2289
  repo_id=repo,
1334
2290
  commit_message=message,
1335
2291
  include_lite=not no_lite,
1336
- include_compressed=not no_compress,
1337
2292
  )
1338
2293
  click.echo(f"\nUploaded {len(results)} file(s) successfully:")
1339
2294
  for filename, url in results.items():
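The VACUUM step reported during upload is standard SQLite housekeeping before shipping a file; a stand-alone equivalent (illustrative only, not the hub module's code) is just:

# Illustrative: compact a SQLite file before uploading it.
import sqlite3

def vacuum(db_path: str) -> None:
    con = sqlite3.connect(db_path)
    try:
        con.execute("VACUUM")  # rewrites the file and reclaims free pages
    finally:
        con.close()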
@@ -1371,31 +2326,6 @@ def db_create_lite(db_path: str, output: Optional[str], verbose: bool):
1371
2326
  raise click.ClickException(f"Failed to create lite database: {e}")
1372
2327
 
1373
2328
 
1374
- @db_cmd.command("compress")
1375
- @click.argument("db_path", type=click.Path(exists=True))
1376
- @click.option("-o", "--output", type=click.Path(), help="Output path (default: adds .gz suffix)")
1377
- @click.option("-v", "--verbose", is_flag=True, help="Verbose output")
1378
- def db_compress(db_path: str, output: Optional[str], verbose: bool):
1379
- """
1380
- Compress a database file using gzip.
1381
-
1382
- \b
1383
- Examples:
1384
- corp-extractor db compress entities.db
1385
- corp-extractor db compress entities.db -o entities.db.gz
1386
- """
1387
- _configure_logging(verbose)
1388
- from .database.hub import compress_database
1389
-
1390
- click.echo(f"Compressing {db_path}...", err=True)
1391
-
1392
- try:
1393
- compressed_path = compress_database(db_path, output)
1394
- click.echo(f"Compressed database created: {compressed_path}")
1395
- except Exception as e:
1396
- raise click.ClickException(f"Compression failed: {e}")
1397
-
1398
-
1399
2329
  @db_cmd.command("repair-embeddings")
1400
2330
  @click.option("--db", "db_path", type=click.Path(), help="Database path")
1401
2331
  @click.option("--batch-size", type=int, default=1000, help="Batch size for embedding generation (default: 1000)")
@@ -1441,9 +2371,9 @@ def db_repair_embeddings(db_path: Optional[str], batch_size: int, source: Option
1441
2371
  names.append(name)
1442
2372
 
1443
2373
  if len(names) >= batch_size:
1444
- # Generate embeddings
1445
- embeddings = embedder.embed_batch(names)
1446
- database.insert_embeddings_batch(org_ids, embeddings)
2374
+ # Generate both float32 and int8 embeddings
2375
+ embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(names)
2376
+ database.insert_both_embeddings_batch(org_ids, embeddings, scalar_embeddings)
1447
2377
  count += len(names)
1448
2378
  click.echo(f"Repaired {count:,} / {missing_count:,} embeddings...", err=True)
1449
2379
  org_ids = []
@@ -1451,14 +2381,161 @@ def db_repair_embeddings(db_path: Optional[str], batch_size: int, source: Option
1451
2381
 
1452
2382
  # Final batch
1453
2383
  if names:
1454
- embeddings = embedder.embed_batch(names)
1455
- database.insert_embeddings_batch(org_ids, embeddings)
2384
+ embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(names)
2385
+ database.insert_both_embeddings_batch(org_ids, embeddings, scalar_embeddings)
1456
2386
  count += len(names)
1457
2387
 
1458
2388
  click.echo(f"\nRepaired {count:,} embeddings successfully.", err=True)
1459
2389
  database.close()
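embed_batch_and_quantize is internal to CompanyEmbedder and not shown in this diff; as a hedged sketch, producing the float32/int8 pair in one pass could look like the following, reusing the same *127 clip-and-round scheme the backfill command below applies (the encode callable is a placeholder, not a real API):

# Illustrative sketch only: paired float32 / int8 embeddings from one encoding pass.
import numpy as np

def embed_and_quantize(encode, names: list[str]) -> tuple[np.ndarray, np.ndarray]:
    fp32 = np.asarray(encode(names), dtype=np.float32)  # (n, dim), assumed unit-normalized
    int8 = np.clip(np.round(fp32 * 127), -127, 127).astype(np.int8)
    return fp32, int8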
1460
2390
 
1461
2391
 
2392
+ @db_cmd.command("backfill-scalar")
2393
+ @click.option("--db", "db_path", type=click.Path(), help="Database path")
2394
+ @click.option("--batch-size", type=int, default=10000, help="Batch size for processing (default: 10000)")
2395
+ @click.option("--embed-batch-size", type=int, default=64, help="Batch size for embedding generation (default: 64)")
2396
+ @click.option("--skip-generate", is_flag=True, help="Skip generating missing float32 embeddings (only quantize existing)")
2397
+ @click.option("-v", "--verbose", is_flag=True, help="Verbose output")
2398
+ def db_backfill_scalar(db_path: Optional[str], batch_size: int, embed_batch_size: int, skip_generate: bool, verbose: bool):
2399
+ """
2400
+ Backfill scalar (int8) embeddings for the entity database.
2401
+
2402
+ This command handles two cases:
2403
+ 1. Records with float32 but missing scalar → quantize existing
2404
+ 2. Records missing both embeddings → generate both from scratch
2405
+
2406
+ Scalar (int8) embeddings are 4x smaller than float32 (75% storage reduction) with ~92% recall at top-100.
2407
+
2408
+ \b
2409
+ Examples:
2410
+ corp-extractor db backfill-scalar
2411
+ corp-extractor db backfill-scalar --batch-size 5000 -v
2412
+ corp-extractor db backfill-scalar --skip-generate # Only quantize existing
2413
+ """
2414
+ _configure_logging(verbose)
2415
+ import numpy as np
2416
+
2417
+ from .database import OrganizationDatabase, CompanyEmbedder
2418
+ from .database.store import get_person_database
2419
+
2420
+ embedder = None # Lazy load only if needed
2421
+
2422
+ # Process organizations
2423
+ org_db = OrganizationDatabase(db_path=db_path)
2424
+
2425
+ # Phase 1: Quantize existing float32 embeddings to scalar
2426
+ org_quantized = 0
2427
+ click.echo("Phase 1: Quantizing existing float32 embeddings to scalar...", err=True)
2428
+ for batch_ids in org_db.get_missing_scalar_embedding_ids(batch_size=batch_size):
2429
+ fp32_map = org_db.get_embeddings_by_ids(batch_ids)
2430
+ if not fp32_map:
2431
+ continue
2432
+
2433
+ ids = list(fp32_map.keys())
2434
+ int8_embeddings = np.array([
2435
+ np.clip(np.round(fp32_map[i] * 127), -127, 127).astype(np.int8)
2436
+ for i in ids
2437
+ ])
2438
+
2439
+ org_db.insert_scalar_embeddings_batch(ids, int8_embeddings)
2440
+ org_quantized += len(ids)
2441
+ click.echo(f" Quantized {org_quantized:,} organization embeddings...", err=True)
2442
+
2443
+ click.echo(f"Quantized {org_quantized:,} organization embeddings.", err=True)
2444
+
2445
+ # Phase 2: Generate embeddings for records missing both
2446
+ org_generated = 0
2447
+ if not skip_generate:
2448
+ click.echo("\nPhase 2: Generating embeddings for organizations missing both...", err=True)
2449
+
2450
+ for batch in org_db.get_missing_all_embedding_ids(batch_size=batch_size):
2451
+ if not batch:
2452
+ continue
2453
+
2454
+ # Lazy load embedder
2455
+ if embedder is None:
2456
+ click.echo(" Loading embedding model...", err=True)
2457
+ embedder = CompanyEmbedder()
2458
+
2459
+ # Process in smaller batches for embedding generation
2460
+ for i in range(0, len(batch), embed_batch_size):
2461
+ sub_batch = batch[i:i + embed_batch_size]
2462
+ ids = [item[0] for item in sub_batch]
2463
+ names = [item[1] for item in sub_batch]
2464
+
2465
+ # Generate both float32 and int8 embeddings
2466
+ fp32_batch, int8_batch = embedder.embed_batch_and_quantize(names, batch_size=embed_batch_size)
2467
+
2468
+ # Insert both
2469
+ org_db.insert_both_embeddings_batch(ids, fp32_batch, int8_batch)
2470
+ org_generated += len(ids)
2471
+
2472
+ if org_generated % 10000 < len(ids):  # report progress roughly every 10,000
2473
+ click.echo(f" Generated {org_generated:,} organization embeddings...", err=True)
2474
+
2475
+ click.echo(f"Generated {org_generated:,} organization embeddings.", err=True)
2476
+
2477
+ # Process people
2478
+ person_db = get_person_database(db_path=db_path)
2479
+
2480
+ # Phase 1: Quantize existing float32 embeddings to scalar
2481
+ person_quantized = 0
2482
+ click.echo("\nPhase 1: Quantizing existing float32 person embeddings to scalar...", err=True)
2483
+ for batch_ids in person_db.get_missing_scalar_embedding_ids(batch_size=batch_size):
2484
+ fp32_map = person_db.get_embeddings_by_ids(batch_ids)
2485
+ if not fp32_map:
2486
+ continue
2487
+
2488
+ ids = list(fp32_map.keys())
2489
+ int8_embeddings = np.array([
2490
+ np.clip(np.round(fp32_map[i] * 127), -127, 127).astype(np.int8)
2491
+ for i in ids
2492
+ ])
2493
+
2494
+ person_db.insert_scalar_embeddings_batch(ids, int8_embeddings)
2495
+ person_quantized += len(ids)
2496
+ click.echo(f" Quantized {person_quantized:,} person embeddings...", err=True)
2497
+
2498
+ click.echo(f"Quantized {person_quantized:,} person embeddings.", err=True)
2499
+
2500
+ # Phase 2: Generate embeddings for records missing both
2501
+ person_generated = 0
2502
+ if not skip_generate:
2503
+ click.echo("\nPhase 2: Generating embeddings for people missing both...", err=True)
2504
+
2505
+ for batch in person_db.get_missing_all_embedding_ids(batch_size=batch_size):
2506
+ if not batch:
2507
+ continue
2508
+
2509
+ # Lazy load embedder
2510
+ if embedder is None:
2511
+ click.echo(" Loading embedding model...", err=True)
2512
+ embedder = CompanyEmbedder()
2513
+
2514
+ # Process in smaller batches for embedding generation
2515
+ for i in range(0, len(batch), embed_batch_size):
2516
+ sub_batch = batch[i:i + embed_batch_size]
2517
+ ids = [item[0] for item in sub_batch]
2518
+ names = [item[1] for item in sub_batch]
2519
+
2520
+ # Generate both float32 and int8 embeddings
2521
+ fp32_batch, int8_batch = embedder.embed_batch_and_quantize(names, batch_size=embed_batch_size)
2522
+
2523
+ # Insert both
2524
+ person_db.insert_both_embeddings_batch(ids, fp32_batch, int8_batch)
2525
+ person_generated += len(ids)
2526
+
2527
+ if person_generated % 10000 < len(ids):  # report progress roughly every 10,000
2528
+ click.echo(f" Generated {person_generated:,} person embeddings...", err=True)
2529
+
2530
+ click.echo(f"Generated {person_generated:,} person embeddings.", err=True)
2531
+
2532
+ # Summary
2533
+ click.echo(f"\nSummary:", err=True)
2534
+ click.echo(f" Organizations: {org_quantized:,} quantized, {org_generated:,} generated", err=True)
2535
+ click.echo(f" People: {person_quantized:,} quantized, {person_generated:,} generated", err=True)
2536
+ click.echo(f" Total: {org_quantized + org_generated + person_quantized + person_generated:,} embeddings processed", err=True)
2537
+
2538
+
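The ~92% recall quoted for scalar embeddings refers to searching with the int8 vectors in place of float32; one hedged way to score them is to dequantize back to approximate floats and take cosine similarity, as sketched here (illustrative only):

# Illustrative: approximate cosine scoring against scalar-quantized (int8) vectors.
import numpy as np

def cosine_int8(query_fp32: np.ndarray, candidates_int8: np.ndarray) -> np.ndarray:
    approx = candidates_int8.astype(np.float32) / 127.0  # undo the *127 quantization
    approx /= np.linalg.norm(approx, axis=1, keepdims=True) + 1e-12
    q = query_fp32 / (np.linalg.norm(query_fp32) + 1e-12)
    return approx @ q  # one similarity score per candidate row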
1462
2539
  @db_cmd.command("migrate")
1463
2540
  @click.argument("db_path", type=click.Path(exists=True))
1464
2541
  @click.option("--rename-file", is_flag=True, help="Also rename companies.db to entities.db")
@@ -1520,6 +2597,145 @@ def db_migrate(db_path: str, rename_file: bool, yes: bool, verbose: bool):
1520
2597
  raise click.ClickException(f"Migration failed: {e}")
1521
2598
 
1522
2599
 
2600
+ @db_cmd.command("migrate-v2")
2601
+ @click.argument("source_db", type=click.Path(exists=True))
2602
+ @click.argument("target_db", type=click.Path())
2603
+ @click.option("-v", "--verbose", is_flag=True, help="Verbose output")
2604
+ @click.option("--resume", is_flag=True, help="Resume from last completed step")
2605
+ def db_migrate_v2(source_db: str, target_db: str, verbose: bool, resume: bool):
2606
+ """
2607
+ Migrate database from v1 schema to v2 normalized schema.
2608
+
2609
+ Creates a NEW database file with the v2 normalized schema.
2610
+ The original database is preserved unchanged.
2611
+
2612
+ Use --resume to continue a migration that was interrupted.
2613
+
2614
+ \b
2615
+ V2 changes:
2616
+ - TEXT enum fields replaced with INTEGER foreign keys
2617
+ - New enum lookup tables (source_types, people_types, etc.)
2618
+ - New roles and locations tables
2619
+ - QIDs stored as integers (Q prefix stripped)
2620
+ - Human-readable views for queries
2621
+
2622
+ \b
2623
+ Examples:
2624
+ corp-extractor db migrate-v2 entities.db entities-v2.db
2625
+ corp-extractor db migrate-v2 entities.db entities-v2.db --resume
2626
+ corp-extractor db migrate-v2 ~/.cache/corp-extractor/entities.db ./entities-v2.db -v
2627
+ """
2628
+ _configure_logging(verbose)
2629
+
2630
+ from pathlib import Path
2631
+ from .database.migrate_v2 import migrate_database
2632
+
2633
+ source_path = Path(source_db)
2634
+ target_path = Path(target_db)
2635
+
2636
+ if target_path.exists() and not resume:
2637
+ raise click.ClickException(
2638
+ f"Target database already exists: {target_path}\n"
2639
+ "Use --resume to continue an interrupted migration."
2640
+ )
2641
+
2642
+ if resume:
2643
+ click.echo(f"Resuming migration from {source_path} to {target_path}...")
2644
+ else:
2645
+ click.echo(f"Migrating {source_path} to {target_path}...")
2646
+
2647
+ try:
2648
+ stats = migrate_database(source_path, target_path, resume=resume)
2649
+
2650
+ click.echo("\nMigration complete:")
2651
+ for key, value in stats.items():
2652
+ click.echo(f" {key}: {value:,}")
2653
+
2654
+ except Exception as e:
2655
+ raise click.ClickException(f"Migration failed: {e}")
2656
+
2657
+
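The v2 bullet points above (enum lookup tables, integer foreign keys, QIDs stored without the Q prefix) translate roughly into DDL of this shape; the table and column names below are invented for illustration and are not the actual schema_v2 definitions:

# Illustrative sketch only: one enum lookup table plus integer QID storage.
import sqlite3

SKETCH_DDL = """
CREATE TABLE IF NOT EXISTS source_types (
    id   INTEGER PRIMARY KEY,
    name TEXT NOT NULL UNIQUE             -- e.g. 'gleif', 'sec_edgar'
);
CREATE TABLE IF NOT EXISTS organizations_v2 (
    id             INTEGER PRIMARY KEY,
    name           TEXT NOT NULL,
    source_type_id INTEGER REFERENCES source_types(id),
    qid            INTEGER                -- 'Q95' stored as 95
);
"""

def qid_to_int(qid: str) -> int:
    return int(qid.lstrip("Qq"))

con = sqlite3.connect(":memory:")
con.executescript(SKETCH_DDL)
print(qid_to_int("Q95"))  # -> 95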
2658
+ @db_cmd.command("search-roles")
2659
+ @click.argument("query")
2660
+ @click.option("--db", "db_path", type=click.Path(), help="Database path")
2661
+ @click.option("--limit", default=10, help="Maximum results to return")
2662
+ def db_search_roles(query: str, db_path: Optional[str], limit: int):
2663
+ """
2664
+ Search for roles by name.
2665
+
2666
+ \b
2667
+ Examples:
2668
+ corp-extractor db search-roles "CEO"
2669
+ corp-extractor db search-roles "Chief Executive" --limit 5
2670
+ """
2671
+ from .database.store import get_roles_database
2672
+
2673
+ roles_db = get_roles_database(db_path)
2674
+ results = roles_db.search(query, top_k=limit)
2675
+
2676
+ if not results:
2677
+ click.echo(f"No roles found matching '{query}'")
2678
+ return
2679
+
2680
+ click.echo(f"Found {len(results)} role(s) matching '{query}':")
2681
+ for role_id, name, score in results:
2682
+ click.echo(f" [{role_id}] {name} (score: {score:.2f})")
2683
+
2684
+
2685
+ @db_cmd.command("search-locations")
2686
+ @click.argument("query")
2687
+ @click.option("--db", "db_path", type=click.Path(), help="Database path")
2688
+ @click.option("--type", "location_type", type=str, help="Filter by simplified type (country, city, etc.)")
2689
+ @click.option("--limit", default=10, help="Maximum results to return")
2690
+ def db_search_locations(query: str, db_path: Optional[str], location_type: Optional[str], limit: int):
2691
+ """
2692
+ Search for locations by name.
2693
+
2694
+ \b
2695
+ Examples:
2696
+ corp-extractor db search-locations "California"
2697
+ corp-extractor db search-locations "Paris" --type city
2698
+ corp-extractor db search-locations "Germany" --type country
2699
+ """
2700
+ from .database.store import get_locations_database
2701
+
2702
+ locations_db = get_locations_database(db_path)
2703
+ results = locations_db.search(query, top_k=limit, simplified_type=location_type)
2704
+
2705
+ if not results:
2706
+ click.echo(f"No locations found matching '{query}'")
2707
+ return
2708
+
2709
+ click.echo(f"Found {len(results)} location(s) matching '{query}':")
2710
+ for loc_id, name, score in results:
2711
+ click.echo(f" [{loc_id}] {name} (score: {score:.2f})")
2712
+
2713
+
2714
+ @db_cmd.command("import-locations")
2715
+ @click.option("--from-pycountry", is_flag=True, help="Import countries from pycountry")
2716
+ @click.option("--db", "db_path", type=click.Path(), help="Database path")
2717
+ @click.option("-v", "--verbose", is_flag=True, help="Verbose output")
2718
+ def db_import_locations(from_pycountry: bool, db_path: Optional[str], verbose: bool):
2719
+ """
2720
+ Import locations into the database.
2721
+
2722
+ \b
2723
+ Examples:
2724
+ corp-extractor db import-locations --from-pycountry
2725
+ """
2726
+ _configure_logging(verbose)
2727
+
2728
+ if not from_pycountry:
2729
+ raise click.UsageError("Must specify --from-pycountry")
2730
+
2731
+ from .database.store import get_locations_database
2732
+
2733
+ locations_db = get_locations_database(db_path)
2734
+ count = locations_db.import_from_pycountry()
2735
+
2736
+ click.echo(f"Imported {count:,} locations from pycountry")
2737
+
2738
+
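import_from_pycountry is not shown in this diff; assuming the pycountry package is installed, the core of such an import is an iteration like the sketch below (insert_location is a hypothetical callback standing in for the locations database API):

# Illustrative sketch only: importing country records from pycountry.
import pycountry

def import_countries(insert_location) -> int:
    count = 0
    for country in pycountry.countries:
        insert_location(name=country.name, alpha_2=country.alpha_2, kind="country")
        count += 1
    return count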
1523
2739
  # =============================================================================
1524
2740
  # Document commands
1525
2741
  # =============================================================================