corp-extractor 0.9.3-py3-none-any.whl → 0.9.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -783,10 +783,10 @@ def db_import_gleif(file_path: Optional[str], download: bool, force: bool, db_pa
         records.append(record)

         if len(records) >= batch_size:
-            # Embed and insert batch
+            # Embed and insert batch (both float32 and int8)
             names = [r.name for r in records]
-            embeddings = embedder.embed_batch(names)
-            database.insert_batch(records, embeddings)
+            embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(names)
+            database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
             count += len(records)
             click.echo(f"Imported {count} records...", err=True)
             records = []
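Every import path in this release swaps `embedder.embed_batch(...)` for `embedder.embed_batch_and_quantize(...)` and threads the extra int8 vectors through `database.insert_batch(..., scalar_embeddings=...)`. The embedder's internals aren't shown in this diff; below is a minimal sketch of what the new call shape implies, assuming the encoder returns float32 vectors and that quantization follows the same scale-by-127 scheme used in the `backfill-scalar` command further down (the `encode` callable here is a placeholder, not the package's model):

    import numpy as np

    def embed_batch_and_quantize_sketch(texts, encode):
        """Illustrative only: embed a batch, then scalar-quantize to int8."""
        # `encode` stands in for the package's embedder and is assumed to
        # return an (n, dim) float32 array of roughly unit-norm vectors.
        fp32 = np.asarray(encode(texts), dtype=np.float32)
        # Same scheme as the backfill-scalar hunk below: scale to [-127, 127], round, cast.
        int8 = np.clip(np.round(fp32 * 127), -127, 127).astype(np.int8)
        return fp32, int8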
@@ -794,8 +794,8 @@ def db_import_gleif(file_path: Optional[str], download: bool, force: bool, db_pa
     # Final batch
     if records:
         names = [r.name for r in records]
-        embeddings = embedder.embed_batch(names)
-        database.insert_batch(records, embeddings)
+        embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(names)
+        database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
         count += len(records)

     click.echo(f"\nImported {count} GLEIF records successfully.", err=True)
@@ -853,8 +853,8 @@ def db_import_sec(download: bool, file_path: Optional[str], db_path: Optional[st

         if len(records) >= batch_size:
             names = [r.name for r in records]
-            embeddings = embedder.embed_batch(names)
-            database.insert_batch(records, embeddings)
+            embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(names)
+            database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
             count += len(records)
             click.echo(f"Imported {count} records...", err=True)
             records = []
@@ -862,8 +862,8 @@ def db_import_sec(download: bool, file_path: Optional[str], db_path: Optional[st
     # Final batch
     if records:
         names = [r.name for r in records]
-        embeddings = embedder.embed_batch(names)
-        database.insert_batch(records, embeddings)
+        embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(names)
+        database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
         count += len(records)

     click.echo(f"\nImported {count} SEC Edgar records successfully.", err=True)
@@ -955,8 +955,8 @@ def db_import_sec_officers(db_path: Optional[str], start_year: int, end_year: Op

         if len(records) >= batch_size:
             embedding_texts = [r.get_embedding_text() for r in records]
-            embeddings = embedder.embed_batch(embedding_texts)
-            database.insert_batch(records, embeddings)
+            embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(embedding_texts)
+            database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
             count += len(records)
             click.echo(f"Imported {count} records...", err=True)
             records = []
@@ -964,8 +964,8 @@ def db_import_sec_officers(db_path: Optional[str], start_year: int, end_year: Op
     # Final batch
     if records:
         embedding_texts = [r.get_embedding_text() for r in records]
-        embeddings = embedder.embed_batch(embedding_texts)
-        database.insert_batch(records, embeddings)
+        embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(embedding_texts)
+        database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
         count += len(records)

     if skip_existing and skipped_existing > 0:
@@ -1055,8 +1055,8 @@ def db_import_ch_officers(file_path: str, db_path: Optional[str], limit: Optiona

         if len(records) >= batch_size:
             embedding_texts = [r.get_embedding_text() for r in records]
-            embeddings = embedder.embed_batch(embedding_texts)
-            database.insert_batch(records, embeddings)
+            embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(embedding_texts)
+            database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
             count += len(records)
             click.echo(f"Imported {count} records...", err=True)
             records = []
@@ -1064,8 +1064,8 @@ def db_import_ch_officers(file_path: str, db_path: Optional[str], limit: Optiona
     # Final batch
     if records:
         embedding_texts = [r.get_embedding_text() for r in records]
-        embeddings = embedder.embed_batch(embedding_texts)
-        database.insert_batch(records, embeddings)
+        embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(embedding_texts)
+        database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
         count += len(records)

     if skip_existing and skipped_existing > 0:
@@ -1130,8 +1130,8 @@ def db_import_wikidata(db_path: Optional[str], limit: Optional[int], batch_size:

         if len(records) >= batch_size:
             names = [r.name for r in records]
-            embeddings = embedder.embed_batch(names)
-            database.insert_batch(records, embeddings)
+            embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(names)
+            database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
             count += len(records)
             click.echo(f"Imported {count} records...", err=True)
             records = []
@@ -1139,8 +1139,8 @@ def db_import_wikidata(db_path: Optional[str], limit: Optional[int], batch_size:
     # Final batch
     if records:
         names = [r.name for r in records]
-        embeddings = embedder.embed_batch(names)
-        database.insert_batch(records, embeddings)
+        embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(names)
+        database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
         count += len(records)

     click.echo(f"\nImported {count} Wikidata records successfully.", err=True)
@@ -1219,10 +1219,10 @@ def db_import_people(db_path: Optional[str], limit: Optional[int], batch_size: i
         records.append(record)

         if len(records) >= batch_size:
-            # Generate embeddings (just name for now, will re-embed after enrichment)
+            # Generate embeddings (both float32 and int8)
             embedding_texts = [r.get_embedding_text() for r in records]
-            embeddings = embedder.embed_batch(embedding_texts)
-            database.insert_batch(records, embeddings)
+            embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(embedding_texts)
+            database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
             count += len(records)

             click.echo(f" Imported {count} people...", err=True)
@@ -1231,8 +1231,8 @@ def db_import_people(db_path: Optional[str], limit: Optional[int], batch_size: i
     # Final batch
     if records:
         embedding_texts = [r.get_embedding_text() for r in records]
-        embeddings = embedder.embed_batch(embedding_texts)
-        database.insert_batch(records, embeddings)
+        embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(embedding_texts)
+        database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
         count += len(records)

     if skip_existing and skipped_existing > 0:
@@ -1333,6 +1333,7 @@ def db_import_people(db_path: Optional[str], limit: Optional[int], batch_size: i
 @click.option("--db", "db_path", type=click.Path(), help="Database path")
 @click.option("--people/--no-people", default=True, help="Import people (default: yes)")
 @click.option("--orgs/--no-orgs", default=True, help="Import organizations (default: yes)")
+@click.option("--locations/--no-locations", default=False, help="Import locations (default: no)")
 @click.option("--require-enwiki", is_flag=True, help="Only import orgs with English Wikipedia articles")
 @click.option("--resume", is_flag=True, help="Resume from last position in dump file (tracks entity index)")
 @click.option("--skip-updates", is_flag=True, help="Skip Q codes already in database (no updates)")
@@ -1347,6 +1348,7 @@ def db_import_wikidata_dump(
     db_path: Optional[str],
     people: bool,
     orgs: bool,
+    locations: bool,
     require_enwiki: bool,
     resume: bool,
     skip_updates: bool,
@@ -1355,7 +1357,7 @@ def db_import_wikidata_dump(
     verbose: bool,
 ):
     """
-    Import people and organizations from Wikidata JSON dump.
+    Import people, organizations, and locations from Wikidata JSON dump.

     This uses the full Wikidata JSON dump (~100GB compressed) to import
     all humans and organizations with English Wikipedia articles. This
@@ -1370,6 +1372,7 @@ def db_import_wikidata_dump(
     - Resumable with --resume (tracks position in dump file)
     - Skip existing with --skip-updates (loads existing Q codes)
     - People like Andy Burnham are captured via occupation (P106)
+    - Locations (countries, cities, regions) with parent hierarchy

     \b
     Resume options:
@@ -1383,6 +1386,7 @@ def db_import_wikidata_dump(
        corp-extractor db import-wikidata-dump --dump /path/to/dump.json.bz2 --limit 10000
        corp-extractor db import-wikidata-dump --download --people --no-orgs --limit 50000
        corp-extractor db import-wikidata-dump --dump dump.json.bz2 --orgs --no-people
+       corp-extractor db import-wikidata-dump --dump dump.json.bz2 --locations --no-people --no-orgs # Locations only
        corp-extractor db import-wikidata-dump --dump dump.json.bz2 --resume # Resume interrupted import
        corp-extractor db import-wikidata-dump --dump dump.json.bz2 --skip-updates # Skip existing Q codes
     """
@@ -1395,8 +1399,8 @@ def db_import_wikidata_dump(
     if not dump_path and not download:
         raise click.UsageError("Either --dump path or --download is required")

-    if not people and not orgs:
-        raise click.UsageError("Must import at least one of --people or --orgs")
+    if not people and not orgs and not locations:
+        raise click.UsageError("Must import at least one of --people, --orgs, or --locations")

     # Default database path
     if db_path is None:
@@ -1538,6 +1542,121 @@ def db_import_wikidata_dump(
             return len(new_labels)
         return 0

+    # ========================================
+    # Location-only import (separate pass)
+    # ========================================
+    if locations and not people and not orgs:
+        from .database.store import get_locations_database
+
+        click.echo("\n=== Location Import ===", err=True)
+        click.echo(f" Locations: {'up to ' + str(limit) + ' records' if limit else 'unlimited'}", err=True)
+        if require_enwiki:
+            click.echo(" Filter: only locations with English Wikipedia articles", err=True)
+
+        # Initialize locations database
+        locations_database = get_locations_database(db_path=db_path_obj)
+
+        # Load existing location Q codes for skip_updates mode
+        existing_location_ids: set[str] = set()
+        if skip_updates:
+            existing_location_ids = locations_database.get_all_source_ids(source="wikidata")
+            click.echo(f" Skip updates: {len(existing_location_ids):,} existing Q codes", err=True)
+
+        if start_index > 0:
+            click.echo(f" Resuming from entity index {start_index:,}", err=True)
+
+        location_records: list = []
+        locations_count = 0
+        last_entity_index = start_index
+        last_entity_id = ""
+
+        def location_progress_callback(entity_index: int, entity_id: str, loc_count: int) -> None:
+            nonlocal last_entity_index, last_entity_id
+            last_entity_index = entity_index
+            last_entity_id = entity_id
+
+        def save_location_progress() -> None:
+            if progress:
+                progress.entity_index = last_entity_index
+                progress.last_entity_id = last_entity_id
+                progress.save()
+
+        def flush_location_batch() -> None:
+            nonlocal location_records, locations_count
+            if location_records:
+                inserted = locations_database.insert_batch(location_records)
+                locations_count += inserted
+                location_records = []
+
+        click.echo("Starting dump iteration...", err=True)
+        sys.stderr.flush()
+
+        try:
+            if limit:
+                # Use progress bar when we have limits
+                with click.progressbar(
+                    length=limit,
+                    label="Processing dump",
+                    show_percent=True,
+                    show_pos=True,
+                ) as pbar:
+                    for record in importer.import_locations(
+                        limit=limit,
+                        require_enwiki=require_enwiki,
+                        skip_ids=existing_location_ids if skip_updates else None,
+                        start_index=start_index,
+                        progress_callback=location_progress_callback,
+                    ):
+                        pbar.update(1)
+                        location_records.append(record)
+                        if len(location_records) >= batch_size:
+                            flush_location_batch()
+                            persist_new_labels()
+                            save_location_progress()
+            else:
+                # No limit - show counter updates
+                for record in importer.import_locations(
+                    limit=None,
+                    require_enwiki=require_enwiki,
+                    skip_ids=existing_location_ids if skip_updates else None,
+                    start_index=start_index,
+                    progress_callback=location_progress_callback,
+                ):
+                    location_records.append(record)
+                    if len(location_records) >= batch_size:
+                        flush_location_batch()
+                        persist_new_labels()
+                        save_location_progress()
+                        click.echo(f"\r Progress: {locations_count:,} locations...", nl=False, err=True)
+                        sys.stderr.flush()

+            click.echo("", err=True) # Newline after counter
+
+            # Final batches
+            flush_location_batch()
+            persist_new_labels()
+            save_location_progress()
+
+        finally:
+            # Ensure we save progress even on interrupt
+            save_location_progress()
+
+        click.echo(f"\nLocation import complete: {locations_count:,} locations", err=True)
+
+        # Final label resolution
+        click.echo("\n=== Final QID Label Resolution ===", err=True)
+        all_labels = importer.get_label_cache()
+        click.echo(f" Total labels in cache: {len(all_labels):,}", err=True)
+
+        # Final stats
+        final_label_count = database.get_qid_labels_count()
+        click.echo(f" Total labels in DB: {final_label_count:,}", err=True)
+
+        locations_database.close()
+        database.close()
+        click.echo("\nWikidata dump import complete!", err=True)
+        return
+
     # Combined import - single pass through the dump for both people and orgs
     click.echo("\n=== Combined Import (single dump pass) ===", err=True)
     sys.stderr.flush() # Ensure output is visible immediately
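The location-only pass above reuses the resumability pattern of the people/org passes: accumulate records, flush every `batch_size`, and checkpoint the last seen entity index so `--resume` can skip ahead after an interrupt. Stripped to its essentials, the loop has roughly this shape (a sketch only; `stream`, `flush`, and `save_cursor` stand in for `import_locations(...)`, the database insert, and `progress.save()`):

    def run_batched_import(stream, flush, save_cursor, batch_size=1000):
        """Generic shape of the import loop: flush in batches, checkpoint after each flush."""
        pending = []
        last_index = 0
        try:
            for index, record in enumerate(stream):
                pending.append(record)
                last_index = index
                if len(pending) >= batch_size:
                    flush(list(pending))      # e.g. locations_database.insert_batch(...)
                    pending.clear()
                    save_cursor(last_index)   # e.g. progress.entity_index = ...; progress.save()
        finally:
            if pending:
                flush(list(pending))          # final partial batch
            save_cursor(last_index)           # checkpoint even on interrupt
        return last_index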
@@ -1583,8 +1702,8 @@ def db_import_wikidata_dump(
         nonlocal people_records, people_count
         if people_records:
             embedding_texts = [r.get_embedding_text() for r in people_records]
-            embeddings = embedder.embed_batch(embedding_texts)
-            person_database.insert_batch(people_records, embeddings)
+            embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(embedding_texts)
+            person_database.insert_batch(people_records, embeddings, scalar_embeddings=scalar_embeddings)
             people_count += len(people_records)
             people_records = []

@@ -1592,8 +1711,8 @@ def db_import_wikidata_dump(
         nonlocal org_records, orgs_count
         if org_records and org_database:
             names = [r.name for r in org_records]
-            embeddings = embedder.embed_batch(names)
-            org_database.insert_batch(org_records, embeddings)
+            embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(names)
+            org_database.insert_batch(org_records, embeddings, scalar_embeddings=scalar_embeddings)
             orgs_count += len(org_records)
             org_records = []

@@ -1735,9 +1854,9 @@ def db_import_wikidata_dump(

     if orgs:
         org_database = get_database(db_path=db_path_obj)
-        org_updates = org_database.resolve_qid_labels(all_labels)
-        if org_updates:
-            click.echo(f" Updated orgs: {org_updates:,} regions", err=True)
+        org_updates, org_deletes = org_database.resolve_qid_labels(all_labels)
+        if org_updates or org_deletes:
+            click.echo(f" Orgs: {org_updates:,} updated, {org_deletes:,} duplicates deleted", err=True)
         org_database.close()

     # Final stats
@@ -1875,8 +1994,8 @@ def db_import_companies_house(

         if len(records) >= batch_size:
             names = [r.name for r in records]
-            embeddings = embedder.embed_batch(names)
-            database.insert_batch(records, embeddings)
+            embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(names)
+            database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
             count += len(records)
             click.echo(f"Imported {count} records...", err=True)
             records = []
@@ -1884,8 +2003,8 @@ def db_import_companies_house(
     # Final batch
     if records:
         names = [r.name for r in records]
-        embeddings = embedder.embed_batch(names)
-        database.insert_batch(records, embeddings)
+        embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(names)
+        database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
         count += len(records)

     click.echo(f"\nImported {count} Companies House records successfully.", err=True)
@@ -1904,6 +2023,7 @@ def db_status(db_path: Optional[str]):
        corp-extractor db status --db /path/to/entities.db
     """
     from .database import OrganizationDatabase
+    from .database.store import get_person_database

     try:
         database = OrganizationDatabase(db_path=db_path)
@@ -1921,6 +2041,27 @@ def db_status(db_path: Optional[str]):
             click.echo(f"\n⚠️ Missing embeddings: {missing_embeddings:,}")
             click.echo(" Run 'corp-extractor db repair-embeddings' to fix")

+        # Show embedding counts (float32 and scalar)
+        org_fp32 = database.get_float32_embedding_count()
+        org_int8 = database.get_scalar_embedding_count()
+        click.echo(f"\nOrganization embeddings:")
+        click.echo(f" float32: {org_fp32:,}")
+        click.echo(f" int8 (scalar): {org_int8:,}")
+        if org_fp32 > 0 and org_int8 < org_fp32:
+            click.echo(f" ⚠️ {org_fp32 - org_int8:,} missing scalar embeddings")
+            click.echo(" Run 'corp-extractor db backfill-scalar' to generate")
+
+        # Person embeddings
+        person_db = get_person_database(db_path=db_path)
+        person_fp32 = person_db.get_float32_embedding_count()
+        person_int8 = person_db.get_scalar_embedding_count()
+        if person_fp32 > 0:
+            click.echo(f"\nPerson embeddings:")
+            click.echo(f" float32: {person_fp32:,}")
+            click.echo(f" int8 (scalar): {person_int8:,}")
+            if person_int8 < person_fp32:
+                click.echo(f" ⚠️ {person_fp32 - person_int8:,} missing scalar embeddings")
+
         if stats.by_source:
             click.echo("\nRecords by source:")
             for source, count in stats.by_source.items():
@@ -2230,9 +2371,9 @@ def db_repair_embeddings(db_path: Optional[str], batch_size: int, source: Option
         names.append(name)

         if len(names) >= batch_size:
-            # Generate embeddings
-            embeddings = embedder.embed_batch(names)
-            database.insert_embeddings_batch(org_ids, embeddings)
+            # Generate both float32 and int8 embeddings
+            embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(names)
+            database.insert_both_embeddings_batch(org_ids, embeddings, scalar_embeddings)
             count += len(names)
             click.echo(f"Repaired {count:,} / {missing_count:,} embeddings...", err=True)
             org_ids = []
@@ -2240,14 +2381,161 @@ def db_repair_embeddings(db_path: Optional[str], batch_size: int, source: Option

     # Final batch
     if names:
-        embeddings = embedder.embed_batch(names)
-        database.insert_embeddings_batch(org_ids, embeddings)
+        embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(names)
+        database.insert_both_embeddings_batch(org_ids, embeddings, scalar_embeddings)
         count += len(names)

     click.echo(f"\nRepaired {count:,} embeddings successfully.", err=True)
     database.close()


+@db_cmd.command("backfill-scalar")
+@click.option("--db", "db_path", type=click.Path(), help="Database path")
+@click.option("--batch-size", type=int, default=10000, help="Batch size for processing (default: 10000)")
+@click.option("--embed-batch-size", type=int, default=64, help="Batch size for embedding generation (default: 64)")
+@click.option("--skip-generate", is_flag=True, help="Skip generating missing float32 embeddings (only quantize existing)")
+@click.option("-v", "--verbose", is_flag=True, help="Verbose output")
+def db_backfill_scalar(db_path: Optional[str], batch_size: int, embed_batch_size: int, skip_generate: bool, verbose: bool):
+    """
+    Backfill scalar (int8) embeddings for the entity database.
+
+    This command handles two cases:
+    1. Records with float32 but missing scalar → quantize existing
+    2. Records missing both embeddings → generate both from scratch
+
+    Scalar embeddings provide 75% storage reduction with ~92% recall at top-100.
+
+    \b
+    Examples:
+        corp-extractor db backfill-scalar
+        corp-extractor db backfill-scalar --batch-size 5000 -v
+        corp-extractor db backfill-scalar --skip-generate # Only quantize existing
+    """
+    _configure_logging(verbose)
+    import numpy as np
+
+    from .database import OrganizationDatabase, CompanyEmbedder
+    from .database.store import get_person_database
+
+    embedder = None # Lazy load only if needed
+
+    # Process organizations
+    org_db = OrganizationDatabase(db_path=db_path)
+
+    # Phase 1: Quantize existing float32 embeddings to scalar
+    org_quantized = 0
+    click.echo("Phase 1: Quantizing existing float32 embeddings to scalar...", err=True)
+    for batch_ids in org_db.get_missing_scalar_embedding_ids(batch_size=batch_size):
+        fp32_map = org_db.get_embeddings_by_ids(batch_ids)
+        if not fp32_map:
+            continue
+
+        ids = list(fp32_map.keys())
+        int8_embeddings = np.array([
+            np.clip(np.round(fp32_map[i] * 127), -127, 127).astype(np.int8)
+            for i in ids
+        ])
+
+        org_db.insert_scalar_embeddings_batch(ids, int8_embeddings)
+        org_quantized += len(ids)
+        click.echo(f" Quantized {org_quantized:,} organization embeddings...", err=True)
+
+    click.echo(f"Quantized {org_quantized:,} organization embeddings.", err=True)
+
+    # Phase 2: Generate embeddings for records missing both
+    org_generated = 0
+    if not skip_generate:
+        click.echo("\nPhase 2: Generating embeddings for organizations missing both...", err=True)
+
+        for batch in org_db.get_missing_all_embedding_ids(batch_size=batch_size):
+            if not batch:
+                continue
+
+            # Lazy load embedder
+            if embedder is None:
+                click.echo(" Loading embedding model...", err=True)
+                embedder = CompanyEmbedder()
+
+            # Process in smaller batches for embedding generation
+            for i in range(0, len(batch), embed_batch_size):
+                sub_batch = batch[i:i + embed_batch_size]
+                ids = [item[0] for item in sub_batch]
+                names = [item[1] for item in sub_batch]
+
+                # Generate both float32 and int8 embeddings
+                fp32_batch, int8_batch = embedder.embed_batch_and_quantize(names, batch_size=embed_batch_size)
+
+                # Insert both
+                org_db.insert_both_embeddings_batch(ids, fp32_batch, int8_batch)
+                org_generated += len(ids)
+
+                if org_generated % 10000 == 0:
+                    click.echo(f" Generated {org_generated:,} organization embeddings...", err=True)
+
+        click.echo(f"Generated {org_generated:,} organization embeddings.", err=True)
+
+    # Process people
+    person_db = get_person_database(db_path=db_path)
+
+    # Phase 1: Quantize existing float32 embeddings to scalar
+    person_quantized = 0
+    click.echo("\nPhase 1: Quantizing existing float32 person embeddings to scalar...", err=True)
+    for batch_ids in person_db.get_missing_scalar_embedding_ids(batch_size=batch_size):
+        fp32_map = person_db.get_embeddings_by_ids(batch_ids)
+        if not fp32_map:
+            continue
+
+        ids = list(fp32_map.keys())
+        int8_embeddings = np.array([
+            np.clip(np.round(fp32_map[i] * 127), -127, 127).astype(np.int8)
+            for i in ids
+        ])
+
+        person_db.insert_scalar_embeddings_batch(ids, int8_embeddings)
+        person_quantized += len(ids)
+        click.echo(f" Quantized {person_quantized:,} person embeddings...", err=True)
+
+    click.echo(f"Quantized {person_quantized:,} person embeddings.", err=True)
+
+    # Phase 2: Generate embeddings for records missing both
+    person_generated = 0
+    if not skip_generate:
+        click.echo("\nPhase 2: Generating embeddings for people missing both...", err=True)
+
+        for batch in person_db.get_missing_all_embedding_ids(batch_size=batch_size):
+            if not batch:
+                continue
+
+            # Lazy load embedder
+            if embedder is None:
+                click.echo(" Loading embedding model...", err=True)
+                embedder = CompanyEmbedder()
+
+            # Process in smaller batches for embedding generation
+            for i in range(0, len(batch), embed_batch_size):
+                sub_batch = batch[i:i + embed_batch_size]
+                ids = [item[0] for item in sub_batch]
+                names = [item[1] for item in sub_batch]
+
+                # Generate both float32 and int8 embeddings
+                fp32_batch, int8_batch = embedder.embed_batch_and_quantize(names, batch_size=embed_batch_size)
+
+                # Insert both
+                person_db.insert_both_embeddings_batch(ids, fp32_batch, int8_batch)
+                person_generated += len(ids)
+
+                if person_generated % 10000 == 0:
+                    click.echo(f" Generated {person_generated:,} person embeddings...", err=True)
+
+        click.echo(f"Generated {person_generated:,} person embeddings.", err=True)
+
+    # Summary
+    click.echo(f"\nSummary:", err=True)
+    click.echo(f" Organizations: {org_quantized:,} quantized, {org_generated:,} generated", err=True)
+    click.echo(f" People: {person_quantized:,} quantized, {person_generated:,} generated", err=True)
+    click.echo(f" Total: {org_quantized + org_generated + person_quantized + person_generated:,} embeddings processed", err=True)
+
+
 @db_cmd.command("migrate")
 @click.argument("db_path", type=click.Path(exists=True))
 @click.option("--rename-file", is_flag=True, help="Also rename companies.db to entities.db")
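The backfill command's docstring cites a 75% storage reduction, which follows directly from the element width: float32 stores 4 bytes per dimension, int8 stores 1. The ~92% top-100 recall figure is the package's own claim; the round-trip below only illustrates why quantized vectors still rank similarly (dimension 384 is a hypothetical width, not necessarily the model's):

    import numpy as np

    rng = np.random.default_rng(0)
    dim = 384  # hypothetical embedding width

    # A unit-norm float32 vector and its int8 quantization, per the scheme above.
    v = rng.normal(size=dim).astype(np.float32)
    v /= np.linalg.norm(v)
    q = np.clip(np.round(v * 127), -127, 127).astype(np.int8)

    print(v.nbytes, q.nbytes)   # 1536 384 -> 4 bytes/dim vs 1 byte/dim (75% smaller)

    # Dequantize and compare: cosine similarity stays very close to 1.0,
    # which is why top-k retrieval order is mostly preserved.
    v_hat = q.astype(np.float32) / 127
    v_hat /= np.linalg.norm(v_hat)
    print(float(v @ v_hat))     # roughly 0.999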
@@ -2309,6 +2597,145 @@ def db_migrate(db_path: str, rename_file: bool, yes: bool, verbose: bool):
         raise click.ClickException(f"Migration failed: {e}")


+@db_cmd.command("migrate-v2")
+@click.argument("source_db", type=click.Path(exists=True))
+@click.argument("target_db", type=click.Path())
+@click.option("-v", "--verbose", is_flag=True, help="Verbose output")
+@click.option("--resume", is_flag=True, help="Resume from last completed step")
+def db_migrate_v2(source_db: str, target_db: str, verbose: bool, resume: bool):
+    """
+    Migrate database from v1 schema to v2 normalized schema.
+
+    Creates a NEW database file with the v2 normalized schema.
+    The original database is preserved unchanged.
+
+    Use --resume to continue a migration that was interrupted.
+
+    \b
+    V2 changes:
+    - TEXT enum fields replaced with INTEGER foreign keys
+    - New enum lookup tables (source_types, people_types, etc.)
+    - New roles and locations tables
+    - QIDs stored as integers (Q prefix stripped)
+    - Human-readable views for queries
+
+    \b
+    Examples:
+        corp-extractor db migrate-v2 entities.db entities-v2.db
+        corp-extractor db migrate-v2 entities.db entities-v2.db --resume
+        corp-extractor db migrate-v2 ~/.cache/corp-extractor/entities.db ./entities-v2.db -v
+    """
+    _configure_logging(verbose)
+
+    from pathlib import Path
+    from .database.migrate_v2 import migrate_database
+
+    source_path = Path(source_db)
+    target_path = Path(target_db)
+
+    if target_path.exists() and not resume:
+        raise click.ClickException(
+            f"Target database already exists: {target_path}\n"
+            "Use --resume to continue an interrupted migration."
+        )
+
+    if resume:
+        click.echo(f"Resuming migration from {source_path} to {target_path}...")
+    else:
+        click.echo(f"Migrating {source_path} to {target_path}...")
+
+    try:
+        stats = migrate_database(source_path, target_path, resume=resume)
+
+        click.echo("\nMigration complete:")
+        for key, value in stats.items():
+            click.echo(f" {key}: {value:,}")
+
+    except Exception as e:
+        raise click.ClickException(f"Migration failed: {e}")
+
+
+@db_cmd.command("search-roles")
+@click.argument("query")
+@click.option("--db", "db_path", type=click.Path(), help="Database path")
+@click.option("--limit", default=10, help="Maximum results to return")
+def db_search_roles(query: str, db_path: Optional[str], limit: int):
+    """
+    Search for roles by name.
+
+    \b
+    Examples:
+        corp-extractor db search-roles "CEO"
+        corp-extractor db search-roles "Chief Executive" --limit 5
+    """
+    from .database.store import get_roles_database
+
+    roles_db = get_roles_database(db_path)
+    results = roles_db.search(query, top_k=limit)
+
+    if not results:
+        click.echo(f"No roles found matching '{query}'")
+        return
+
+    click.echo(f"Found {len(results)} role(s) matching '{query}':")
+    for role_id, name, score in results:
+        click.echo(f" [{role_id}] {name} (score: {score:.2f})")
+
+
+@db_cmd.command("search-locations")
+@click.argument("query")
+@click.option("--db", "db_path", type=click.Path(), help="Database path")
+@click.option("--type", "location_type", type=str, help="Filter by simplified type (country, city, etc.)")
+@click.option("--limit", default=10, help="Maximum results to return")
+def db_search_locations(query: str, db_path: Optional[str], location_type: Optional[str], limit: int):
+    """
+    Search for locations by name.
+
+    \b
+    Examples:
+        corp-extractor db search-locations "California"
+        corp-extractor db search-locations "Paris" --type city
+        corp-extractor db search-locations "Germany" --type country
+    """
+    from .database.store import get_locations_database
+
+    locations_db = get_locations_database(db_path)
+    results = locations_db.search(query, top_k=limit, simplified_type=location_type)
+
+    if not results:
+        click.echo(f"No locations found matching '{query}'")
+        return
+
+    click.echo(f"Found {len(results)} location(s) matching '{query}':")
+    for loc_id, name, score in results:
+        click.echo(f" [{loc_id}] {name} (score: {score:.2f})")
+
+
+@db_cmd.command("import-locations")
+@click.option("--from-pycountry", is_flag=True, help="Import countries from pycountry")
+@click.option("--db", "db_path", type=click.Path(), help="Database path")
+@click.option("-v", "--verbose", is_flag=True, help="Verbose output")
+def db_import_locations(from_pycountry: bool, db_path: Optional[str], verbose: bool):
+    """
+    Import locations into the database.
+
+    \b
+    Examples:
+        corp-extractor db import-locations --from-pycountry
+    """
+    _configure_logging(verbose)
+
+    if not from_pycountry:
+        raise click.UsageError("Must specify --from-pycountry")
+
+    from .database.store import get_locations_database
+
+    locations_db = get_locations_database(db_path)
+    count = locations_db.import_from_pycountry()
+
+    click.echo(f"Imported {count:,} locations from pycountry")
+
+
 # =============================================================================
 # Document commands
 # =============================================================================
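The new `search-roles` and `search-locations` commands are thin wrappers over the store helpers: per the command bodies above, `get_roles_database(...).search(query, top_k=...)` and `get_locations_database(...).search(query, top_k=..., simplified_type=...)` each yield `(id, name, score)` tuples. A hedged sketch of calling them directly from Python (the top-level package name `corp_extractor` is assumed from the CLI's relative imports, and passing `None` for the path is assumed to select the default database, as the CLI does when `--db` is omitted):

    # Assumed import path; the CLI itself uses "from .database.store import ..."
    from corp_extractor.database.store import get_locations_database, get_roles_database

    roles_db = get_roles_database(None)
    for role_id, name, score in roles_db.search("chief executive", top_k=5):
        print(role_id, name, round(score, 2))

    locations_db = get_locations_database(None)
    for loc_id, name, score in locations_db.search("Paris", top_k=5, simplified_type="city"):
        print(loc_id, name, round(score, 2))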