corp-extractor 0.9.3__py3-none-any.whl → 0.9.4__py3-none-any.whl
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- {corp_extractor-0.9.3.dist-info → corp_extractor-0.9.4.dist-info}/METADATA +33 -3
- {corp_extractor-0.9.3.dist-info → corp_extractor-0.9.4.dist-info}/RECORD +16 -12
- statement_extractor/cli.py +472 -45
- statement_extractor/database/embeddings.py +45 -0
- statement_extractor/database/hub.py +51 -9
- statement_extractor/database/importers/import_utils.py +264 -0
- statement_extractor/database/importers/wikidata_dump.py +334 -3
- statement_extractor/database/importers/wikidata_people.py +44 -0
- statement_extractor/database/migrate_v2.py +852 -0
- statement_extractor/database/models.py +125 -1
- statement_extractor/database/schema_v2.py +409 -0
- statement_extractor/database/seed_data.py +359 -0
- statement_extractor/database/store.py +2113 -322
- statement_extractor/plugins/qualifiers/person.py +109 -52
- {corp_extractor-0.9.3.dist-info → corp_extractor-0.9.4.dist-info}/WHEEL +0 -0
- {corp_extractor-0.9.3.dist-info → corp_extractor-0.9.4.dist-info}/entry_points.txt +0 -0
statement_extractor/cli.py
CHANGED
@@ -783,10 +783,10 @@ def db_import_gleif(file_path: Optional[str], download: bool, force: bool, db_pa
         records.append(record)

         if len(records) >= batch_size:
-            # Embed and insert batch
+            # Embed and insert batch (both float32 and int8)
             names = [r.name for r in records]
-            embeddings = embedder.
-            database.insert_batch(records, embeddings)
+            embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(names)
+            database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
             count += len(records)
             click.echo(f"Imported {count} records...", err=True)
             records = []
@@ -794,8 +794,8 @@ def db_import_gleif(file_path: Optional[str], download: bool, force: bool, db_pa
     # Final batch
     if records:
         names = [r.name for r in records]
-        embeddings = embedder.
-        database.insert_batch(records, embeddings)
+        embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(names)
+        database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
         count += len(records)

     click.echo(f"\nImported {count} GLEIF records successfully.", err=True)
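The change above repeats across every importer in this release: the old single-vector embed step (truncated in this report as `embeddings = embedder.`) becomes `embed_batch_and_quantize`, which returns a float32 matrix plus an int8 scalar-quantized copy, and `insert_batch` gains a `scalar_embeddings` keyword. The embedder internals are not part of this diff; the sketch below is a hypothetical reconstruction, assuming a sentence-transformers-style encoder with unit-normalized output and the symmetric `* 127` quantization scheme visible later in the `backfill-scalar` command.

```python
# Hypothetical sketch of an embed-and-quantize helper; the package's real
# CompanyEmbedder internals are not shown in this diff. Assumes unit-normalized
# float32 vectors and the symmetric int8 scheme used by `db backfill-scalar`.
import numpy as np
from sentence_transformers import SentenceTransformer

class CompanyEmbedderSketch:
    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):  # model name is an assumption
        self.model = SentenceTransformer(model_name)

    def embed_batch_and_quantize(self, texts: list[str], batch_size: int = 64):
        """Return (float32 embeddings, int8 scalar-quantized embeddings)."""
        fp32 = self.model.encode(
            texts, batch_size=batch_size, normalize_embeddings=True
        ).astype(np.float32)
        # Symmetric scalar quantization: map [-1.0, 1.0] onto [-127, 127].
        int8 = np.clip(np.round(fp32 * 127), -127, 127).astype(np.int8)
        return fp32, int8
```

Returning both representations from one encode pass means each record is embedded exactly once, which matters for the multi-million-row GLEIF and Wikidata imports below.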
@@ -853,8 +853,8 @@ def db_import_sec(download: bool, file_path: Optional[str], db_path: Optional[st

         if len(records) >= batch_size:
             names = [r.name for r in records]
-            embeddings = embedder.
-            database.insert_batch(records, embeddings)
+            embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(names)
+            database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
             count += len(records)
             click.echo(f"Imported {count} records...", err=True)
             records = []
@@ -862,8 +862,8 @@ def db_import_sec(download: bool, file_path: Optional[str], db_path: Optional[st
     # Final batch
     if records:
         names = [r.name for r in records]
-        embeddings = embedder.
-        database.insert_batch(records, embeddings)
+        embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(names)
+        database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
         count += len(records)

     click.echo(f"\nImported {count} SEC Edgar records successfully.", err=True)
@@ -955,8 +955,8 @@ def db_import_sec_officers(db_path: Optional[str], start_year: int, end_year: Op

         if len(records) >= batch_size:
             embedding_texts = [r.get_embedding_text() for r in records]
-            embeddings = embedder.
-            database.insert_batch(records, embeddings)
+            embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(embedding_texts)
+            database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
             count += len(records)
             click.echo(f"Imported {count} records...", err=True)
             records = []
@@ -964,8 +964,8 @@ def db_import_sec_officers(db_path: Optional[str], start_year: int, end_year: Op
     # Final batch
     if records:
         embedding_texts = [r.get_embedding_text() for r in records]
-        embeddings = embedder.
-        database.insert_batch(records, embeddings)
+        embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(embedding_texts)
+        database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
         count += len(records)

     if skip_existing and skipped_existing > 0:
@@ -1055,8 +1055,8 @@ def db_import_ch_officers(file_path: str, db_path: Optional[str], limit: Optiona

         if len(records) >= batch_size:
             embedding_texts = [r.get_embedding_text() for r in records]
-            embeddings = embedder.
-            database.insert_batch(records, embeddings)
+            embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(embedding_texts)
+            database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
             count += len(records)
             click.echo(f"Imported {count} records...", err=True)
             records = []
@@ -1064,8 +1064,8 @@ def db_import_ch_officers(file_path: str, db_path: Optional[str], limit: Optiona
     # Final batch
     if records:
         embedding_texts = [r.get_embedding_text() for r in records]
-        embeddings = embedder.
-        database.insert_batch(records, embeddings)
+        embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(embedding_texts)
+        database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
         count += len(records)

     if skip_existing and skipped_existing > 0:
@@ -1130,8 +1130,8 @@ def db_import_wikidata(db_path: Optional[str], limit: Optional[int], batch_size:

         if len(records) >= batch_size:
             names = [r.name for r in records]
-            embeddings = embedder.
-            database.insert_batch(records, embeddings)
+            embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(names)
+            database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
             count += len(records)
             click.echo(f"Imported {count} records...", err=True)
             records = []
@@ -1139,8 +1139,8 @@ def db_import_wikidata(db_path: Optional[str], limit: Optional[int], batch_size:
     # Final batch
     if records:
         names = [r.name for r in records]
-        embeddings = embedder.
-        database.insert_batch(records, embeddings)
+        embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(names)
+        database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
         count += len(records)

     click.echo(f"\nImported {count} Wikidata records successfully.", err=True)
@@ -1219,10 +1219,10 @@ def db_import_people(db_path: Optional[str], limit: Optional[int], batch_size: i
         records.append(record)

         if len(records) >= batch_size:
-            # Generate embeddings (
+            # Generate embeddings (both float32 and int8)
             embedding_texts = [r.get_embedding_text() for r in records]
-            embeddings = embedder.
-            database.insert_batch(records, embeddings)
+            embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(embedding_texts)
+            database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
             count += len(records)

             click.echo(f" Imported {count} people...", err=True)
@@ -1231,8 +1231,8 @@ def db_import_people(db_path: Optional[str], limit: Optional[int], batch_size: i
     # Final batch
     if records:
         embedding_texts = [r.get_embedding_text() for r in records]
-        embeddings = embedder.
-        database.insert_batch(records, embeddings)
+        embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(embedding_texts)
+        database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
        count += len(records)

     if skip_existing and skipped_existing > 0:
@@ -1333,6 +1333,7 @@ def db_import_people(db_path: Optional[str], limit: Optional[int], batch_size: i
 @click.option("--db", "db_path", type=click.Path(), help="Database path")
 @click.option("--people/--no-people", default=True, help="Import people (default: yes)")
 @click.option("--orgs/--no-orgs", default=True, help="Import organizations (default: yes)")
+@click.option("--locations/--no-locations", default=False, help="Import locations (default: no)")
 @click.option("--require-enwiki", is_flag=True, help="Only import orgs with English Wikipedia articles")
 @click.option("--resume", is_flag=True, help="Resume from last position in dump file (tracks entity index)")
 @click.option("--skip-updates", is_flag=True, help="Skip Q codes already in database (no updates)")
@@ -1347,6 +1348,7 @@ def db_import_wikidata_dump(
     db_path: Optional[str],
     people: bool,
     orgs: bool,
+    locations: bool,
     require_enwiki: bool,
     resume: bool,
     skip_updates: bool,
@@ -1355,7 +1357,7 @@ def db_import_wikidata_dump(
     verbose: bool,
 ):
     """
-    Import people and
+    Import people, organizations, and locations from Wikidata JSON dump.

     This uses the full Wikidata JSON dump (~100GB compressed) to import
     all humans and organizations with English Wikipedia articles. This
@@ -1370,6 +1372,7 @@ def db_import_wikidata_dump(
     - Resumable with --resume (tracks position in dump file)
     - Skip existing with --skip-updates (loads existing Q codes)
     - People like Andy Burnham are captured via occupation (P106)
+    - Locations (countries, cities, regions) with parent hierarchy

     \b
     Resume options:
@@ -1383,6 +1386,7 @@ def db_import_wikidata_dump(
         corp-extractor db import-wikidata-dump --dump /path/to/dump.json.bz2 --limit 10000
         corp-extractor db import-wikidata-dump --download --people --no-orgs --limit 50000
         corp-extractor db import-wikidata-dump --dump dump.json.bz2 --orgs --no-people
+        corp-extractor db import-wikidata-dump --dump dump.json.bz2 --locations --no-people --no-orgs # Locations only
         corp-extractor db import-wikidata-dump --dump dump.json.bz2 --resume # Resume interrupted import
         corp-extractor db import-wikidata-dump --dump dump.json.bz2 --skip-updates # Skip existing Q codes
     """
@@ -1395,8 +1399,8 @@ def db_import_wikidata_dump(
     if not dump_path and not download:
         raise click.UsageError("Either --dump path or --download is required")

-    if not people and not orgs:
-        raise click.UsageError("Must import at least one of --people or --
+    if not people and not orgs and not locations:
+        raise click.UsageError("Must import at least one of --people, --orgs, or --locations")

     # Default database path
     if db_path is None:
@@ -1538,6 +1542,121 @@ def db_import_wikidata_dump(
             return len(new_labels)
         return 0

+    # ========================================
+    # Location-only import (separate pass)
+    # ========================================
+    if locations and not people and not orgs:
+        from .database.store import get_locations_database
+
+        click.echo("\n=== Location Import ===", err=True)
+        click.echo(f" Locations: {'up to ' + str(limit) + ' records' if limit else 'unlimited'}", err=True)
+        if require_enwiki:
+            click.echo(" Filter: only locations with English Wikipedia articles", err=True)
+
+        # Initialize locations database
+        locations_database = get_locations_database(db_path=db_path_obj)
+
+        # Load existing location Q codes for skip_updates mode
+        existing_location_ids: set[str] = set()
+        if skip_updates:
+            existing_location_ids = locations_database.get_all_source_ids(source="wikidata")
+            click.echo(f" Skip updates: {len(existing_location_ids):,} existing Q codes", err=True)
+
+        if start_index > 0:
+            click.echo(f" Resuming from entity index {start_index:,}", err=True)
+
+        location_records: list = []
+        locations_count = 0
+        last_entity_index = start_index
+        last_entity_id = ""
+
+        def location_progress_callback(entity_index: int, entity_id: str, loc_count: int) -> None:
+            nonlocal last_entity_index, last_entity_id
+            last_entity_index = entity_index
+            last_entity_id = entity_id
+
+        def save_location_progress() -> None:
+            if progress:
+                progress.entity_index = last_entity_index
+                progress.last_entity_id = last_entity_id
+                progress.save()
+
+        def flush_location_batch() -> None:
+            nonlocal location_records, locations_count
+            if location_records:
+                inserted = locations_database.insert_batch(location_records)
+                locations_count += inserted
+                location_records = []
+
+        click.echo("Starting dump iteration...", err=True)
+        sys.stderr.flush()
+
+        try:
+            if limit:
+                # Use progress bar when we have limits
+                with click.progressbar(
+                    length=limit,
+                    label="Processing dump",
+                    show_percent=True,
+                    show_pos=True,
+                ) as pbar:
+                    for record in importer.import_locations(
+                        limit=limit,
+                        require_enwiki=require_enwiki,
+                        skip_ids=existing_location_ids if skip_updates else None,
+                        start_index=start_index,
+                        progress_callback=location_progress_callback,
+                    ):
+                        pbar.update(1)
+                        location_records.append(record)
+                        if len(location_records) >= batch_size:
+                            flush_location_batch()
+                            persist_new_labels()
+                            save_location_progress()
+            else:
+                # No limit - show counter updates
+                for record in importer.import_locations(
+                    limit=None,
+                    require_enwiki=require_enwiki,
+                    skip_ids=existing_location_ids if skip_updates else None,
+                    start_index=start_index,
+                    progress_callback=location_progress_callback,
+                ):
+                    location_records.append(record)
+                    if len(location_records) >= batch_size:
+                        flush_location_batch()
+                        persist_new_labels()
+                        save_location_progress()
+                        click.echo(f"\r Progress: {locations_count:,} locations...", nl=False, err=True)
+                        sys.stderr.flush()
+
+            click.echo("", err=True)  # Newline after counter
+
+            # Final batches
+            flush_location_batch()
+            persist_new_labels()
+            save_location_progress()
+
+        finally:
+            # Ensure we save progress even on interrupt
+            save_location_progress()
+
+        click.echo(f"\nLocation import complete: {locations_count:,} locations", err=True)
+
+        # Final label resolution
+        click.echo("\n=== Final QID Label Resolution ===", err=True)
+        all_labels = importer.get_label_cache()
+        click.echo(f" Total labels in cache: {len(all_labels):,}", err=True)
+
+        # Final stats
+        final_label_count = database.get_qid_labels_count()
+        click.echo(f" Total labels in DB: {final_label_count:,}", err=True)
+
+        locations_database.close()
+        database.close()
+        click.echo("\nWikidata dump import complete!", err=True)
+        return
+
     # Combined import - single pass through the dump for both people and orgs
     click.echo("\n=== Combined Import (single dump pass) ===", err=True)
     sys.stderr.flush() # Ensure output is visible immediately
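The location pass above persists resume state through a `progress` object exposing `entity_index`, `last_entity_id`, and `save()`; that class itself is not part of this diff. A minimal sketch of what such a tracker could look like, with the class name, file layout, and JSON format all being assumptions:

```python
# Minimal sketch of a resume-progress record consistent with the attributes the
# location pass uses (entity_index, last_entity_id, save()); the package's real
# implementation is not shown in this diff and may differ.
import json
from dataclasses import dataclass
from pathlib import Path

@dataclass
class DumpProgress:
    path: Path                 # where resume state is persisted (assumed JSON file)
    entity_index: int = 0      # index of the last entity seen in the dump
    last_entity_id: str = ""   # e.g. "Q42"

    def save(self) -> None:
        self.path.write_text(json.dumps({
            "entity_index": self.entity_index,
            "last_entity_id": self.last_entity_id,
        }))

    @classmethod
    def load(cls, path: Path) -> "DumpProgress":
        if path.exists():
            state = json.loads(path.read_text())
            return cls(path=path, entity_index=state["entity_index"],
                       last_entity_id=state["last_entity_id"])
        return cls(path=path)
```

Saving after every flushed batch, and again in the `finally` block, keeps the recorded index close to what has actually been committed, so `--resume` re-reads at most one batch.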
@@ -1583,8 +1702,8 @@ def db_import_wikidata_dump(
         nonlocal people_records, people_count
         if people_records:
             embedding_texts = [r.get_embedding_text() for r in people_records]
-            embeddings = embedder.
-            person_database.insert_batch(people_records, embeddings)
+            embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(embedding_texts)
+            person_database.insert_batch(people_records, embeddings, scalar_embeddings=scalar_embeddings)
             people_count += len(people_records)
             people_records = []

@@ -1592,8 +1711,8 @@ def db_import_wikidata_dump(
         nonlocal org_records, orgs_count
         if org_records and org_database:
             names = [r.name for r in org_records]
-            embeddings = embedder.
-            org_database.insert_batch(org_records, embeddings)
+            embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(names)
+            org_database.insert_batch(org_records, embeddings, scalar_embeddings=scalar_embeddings)
             orgs_count += len(org_records)
             org_records = []

@@ -1735,9 +1854,9 @@ def db_import_wikidata_dump(

     if orgs:
         org_database = get_database(db_path=db_path_obj)
-        org_updates = org_database.resolve_qid_labels(all_labels)
-        if org_updates:
-            click.echo(f"
+        org_updates, org_deletes = org_database.resolve_qid_labels(all_labels)
+        if org_updates or org_deletes:
+            click.echo(f" Orgs: {org_updates:,} updated, {org_deletes:,} duplicates deleted", err=True)
         org_database.close()

     # Final stats
@@ -1875,8 +1994,8 @@ def db_import_companies_house(

         if len(records) >= batch_size:
             names = [r.name for r in records]
-            embeddings = embedder.
-            database.insert_batch(records, embeddings)
+            embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(names)
+            database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
             count += len(records)
             click.echo(f"Imported {count} records...", err=True)
             records = []
@@ -1884,8 +2003,8 @@ def db_import_companies_house(
     # Final batch
     if records:
         names = [r.name for r in records]
-        embeddings = embedder.
-        database.insert_batch(records, embeddings)
+        embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(names)
+        database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
         count += len(records)

     click.echo(f"\nImported {count} Companies House records successfully.", err=True)
@@ -1904,6 +2023,7 @@ def db_status(db_path: Optional[str]):
         corp-extractor db status --db /path/to/entities.db
     """
     from .database import OrganizationDatabase
+    from .database.store import get_person_database

     try:
         database = OrganizationDatabase(db_path=db_path)
@@ -1921,6 +2041,27 @@ def db_status(db_path: Optional[str]):
         click.echo(f"\n⚠️ Missing embeddings: {missing_embeddings:,}")
         click.echo(" Run 'corp-extractor db repair-embeddings' to fix")

+    # Show embedding counts (float32 and scalar)
+    org_fp32 = database.get_float32_embedding_count()
+    org_int8 = database.get_scalar_embedding_count()
+    click.echo(f"\nOrganization embeddings:")
+    click.echo(f" float32: {org_fp32:,}")
+    click.echo(f" int8 (scalar): {org_int8:,}")
+    if org_fp32 > 0 and org_int8 < org_fp32:
+        click.echo(f" ⚠️ {org_fp32 - org_int8:,} missing scalar embeddings")
+        click.echo(" Run 'corp-extractor db backfill-scalar' to generate")
+
+    # Person embeddings
+    person_db = get_person_database(db_path=db_path)
+    person_fp32 = person_db.get_float32_embedding_count()
+    person_int8 = person_db.get_scalar_embedding_count()
+    if person_fp32 > 0:
+        click.echo(f"\nPerson embeddings:")
+        click.echo(f" float32: {person_fp32:,}")
+        click.echo(f" int8 (scalar): {person_int8:,}")
+        if person_int8 < person_fp32:
+            click.echo(f" ⚠️ {person_fp32 - person_int8:,} missing scalar embeddings")
+
     if stats.by_source:
         click.echo("\nRecords by source:")
         for source, count in stats.by_source.items():
@@ -2230,9 +2371,9 @@ def db_repair_embeddings(db_path: Optional[str], batch_size: int, source: Option
             names.append(name)

         if len(names) >= batch_size:
-            # Generate embeddings
-            embeddings = embedder.
-            database.
+            # Generate both float32 and int8 embeddings
+            embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(names)
+            database.insert_both_embeddings_batch(org_ids, embeddings, scalar_embeddings)
             count += len(names)
             click.echo(f"Repaired {count:,} / {missing_count:,} embeddings...", err=True)
             org_ids = []
@@ -2240,14 +2381,161 @@ def db_repair_embeddings(db_path: Optional[str], batch_size: int, source: Option

     # Final batch
     if names:
-        embeddings = embedder.
-        database.
+        embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(names)
+        database.insert_both_embeddings_batch(org_ids, embeddings, scalar_embeddings)
         count += len(names)

     click.echo(f"\nRepaired {count:,} embeddings successfully.", err=True)
     database.close()


+@db_cmd.command("backfill-scalar")
+@click.option("--db", "db_path", type=click.Path(), help="Database path")
+@click.option("--batch-size", type=int, default=10000, help="Batch size for processing (default: 10000)")
+@click.option("--embed-batch-size", type=int, default=64, help="Batch size for embedding generation (default: 64)")
+@click.option("--skip-generate", is_flag=True, help="Skip generating missing float32 embeddings (only quantize existing)")
+@click.option("-v", "--verbose", is_flag=True, help="Verbose output")
+def db_backfill_scalar(db_path: Optional[str], batch_size: int, embed_batch_size: int, skip_generate: bool, verbose: bool):
+    """
+    Backfill scalar (int8) embeddings for the entity database.
+
+    This command handles two cases:
+    1. Records with float32 but missing scalar → quantize existing
+    2. Records missing both embeddings → generate both from scratch
+
+    Scalar embeddings provide 75% storage reduction with ~92% recall at top-100.
+
+    \b
+    Examples:
+        corp-extractor db backfill-scalar
+        corp-extractor db backfill-scalar --batch-size 5000 -v
+        corp-extractor db backfill-scalar --skip-generate # Only quantize existing
+    """
+    _configure_logging(verbose)
+    import numpy as np
+
+    from .database import OrganizationDatabase, CompanyEmbedder
+    from .database.store import get_person_database
+
+    embedder = None # Lazy load only if needed
+
+    # Process organizations
+    org_db = OrganizationDatabase(db_path=db_path)
+
+    # Phase 1: Quantize existing float32 embeddings to scalar
+    org_quantized = 0
+    click.echo("Phase 1: Quantizing existing float32 embeddings to scalar...", err=True)
+    for batch_ids in org_db.get_missing_scalar_embedding_ids(batch_size=batch_size):
+        fp32_map = org_db.get_embeddings_by_ids(batch_ids)
+        if not fp32_map:
+            continue
+
+        ids = list(fp32_map.keys())
+        int8_embeddings = np.array([
+            np.clip(np.round(fp32_map[i] * 127), -127, 127).astype(np.int8)
+            for i in ids
+        ])
+
+        org_db.insert_scalar_embeddings_batch(ids, int8_embeddings)
+        org_quantized += len(ids)
+        click.echo(f" Quantized {org_quantized:,} organization embeddings...", err=True)
+
+    click.echo(f"Quantized {org_quantized:,} organization embeddings.", err=True)
+
+    # Phase 2: Generate embeddings for records missing both
+    org_generated = 0
+    if not skip_generate:
+        click.echo("\nPhase 2: Generating embeddings for organizations missing both...", err=True)
+
+        for batch in org_db.get_missing_all_embedding_ids(batch_size=batch_size):
+            if not batch:
+                continue
+
+            # Lazy load embedder
+            if embedder is None:
+                click.echo(" Loading embedding model...", err=True)
+                embedder = CompanyEmbedder()
+
+            # Process in smaller batches for embedding generation
+            for i in range(0, len(batch), embed_batch_size):
+                sub_batch = batch[i:i + embed_batch_size]
+                ids = [item[0] for item in sub_batch]
+                names = [item[1] for item in sub_batch]
+
+                # Generate both float32 and int8 embeddings
+                fp32_batch, int8_batch = embedder.embed_batch_and_quantize(names, batch_size=embed_batch_size)
+
+                # Insert both
+                org_db.insert_both_embeddings_batch(ids, fp32_batch, int8_batch)
+                org_generated += len(ids)
+
+                if org_generated % 10000 == 0:
+                    click.echo(f" Generated {org_generated:,} organization embeddings...", err=True)
+
+        click.echo(f"Generated {org_generated:,} organization embeddings.", err=True)
+
+    # Process people
+    person_db = get_person_database(db_path=db_path)
+
+    # Phase 1: Quantize existing float32 embeddings to scalar
+    person_quantized = 0
+    click.echo("\nPhase 1: Quantizing existing float32 person embeddings to scalar...", err=True)
+    for batch_ids in person_db.get_missing_scalar_embedding_ids(batch_size=batch_size):
+        fp32_map = person_db.get_embeddings_by_ids(batch_ids)
+        if not fp32_map:
+            continue
+
+        ids = list(fp32_map.keys())
+        int8_embeddings = np.array([
+            np.clip(np.round(fp32_map[i] * 127), -127, 127).astype(np.int8)
+            for i in ids
+        ])
+
+        person_db.insert_scalar_embeddings_batch(ids, int8_embeddings)
+        person_quantized += len(ids)
+        click.echo(f" Quantized {person_quantized:,} person embeddings...", err=True)
+
+    click.echo(f"Quantized {person_quantized:,} person embeddings.", err=True)
+
+    # Phase 2: Generate embeddings for records missing both
+    person_generated = 0
+    if not skip_generate:
+        click.echo("\nPhase 2: Generating embeddings for people missing both...", err=True)
+
+        for batch in person_db.get_missing_all_embedding_ids(batch_size=batch_size):
+            if not batch:
+                continue
+
+            # Lazy load embedder
+            if embedder is None:
+                click.echo(" Loading embedding model...", err=True)
+                embedder = CompanyEmbedder()
+
+            # Process in smaller batches for embedding generation
+            for i in range(0, len(batch), embed_batch_size):
+                sub_batch = batch[i:i + embed_batch_size]
+                ids = [item[0] for item in sub_batch]
+                names = [item[1] for item in sub_batch]
+
+                # Generate both float32 and int8 embeddings
+                fp32_batch, int8_batch = embedder.embed_batch_and_quantize(names, batch_size=embed_batch_size)
+
+                # Insert both
+                person_db.insert_both_embeddings_batch(ids, fp32_batch, int8_batch)
+                person_generated += len(ids)
+
+                if person_generated % 10000 == 0:
+                    click.echo(f" Generated {person_generated:,} person embeddings...", err=True)
+
+        click.echo(f"Generated {person_generated:,} person embeddings.", err=True)
+
+    # Summary
+    click.echo(f"\nSummary:", err=True)
+    click.echo(f" Organizations: {org_quantized:,} quantized, {org_generated:,} generated", err=True)
+    click.echo(f" People: {person_quantized:,} quantized, {person_generated:,} generated", err=True)
+    click.echo(f" Total: {org_quantized + org_generated + person_quantized + person_generated:,} embeddings processed", err=True)
+
+
 @db_cmd.command("migrate")
 @click.argument("db_path", type=click.Path(exists=True))
 @click.option("--rename-file", is_flag=True, help="Also rename companies.db to entities.db")
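The `backfill-scalar` docstring claims a 75% storage reduction with roughly 92% recall at top-100. The storage figure follows directly from the quantization used above: each float32 dimension (4 bytes) becomes one int8 byte. The snippet below is a self-contained check of that round-trip and of int8 dot-product scoring, using random stand-in vectors rather than real entity embeddings (the 384-dim size is an assumption):

```python
# Why int8 cuts storage by 75%: each dimension shrinks from 4 bytes (float32)
# to 1 byte. Toy check of the quantization used in backfill-scalar; vectors
# here are random stand-ins, not data from the entity database.
import numpy as np

rng = np.random.default_rng(0)
v = rng.normal(size=(1000, 384)).astype(np.float32)
v /= np.linalg.norm(v, axis=1, keepdims=True)  # unit-normalize, as cosine search assumes

q = np.clip(np.round(v * 127), -127, 127).astype(np.int8)
print(v.nbytes, q.nbytes)  # 1,536,000 vs 384,000 bytes -> 75% smaller

# Approximate cosine similarity via integer dot products, rescaled by 127^2.
query, corpus = q[0].astype(np.int32), q[1:].astype(np.int32)
approx = (corpus @ query) / (127 * 127)
exact = v[1:] @ v[0]
print(np.abs(approx - exact).max())  # small per-pair error; rankings mostly survive
```

Per-pair error is small but nonzero, which is why the quoted recall is ~92% rather than 100%: a few near-ties swap order after quantization.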
@@ -2309,6 +2597,145 @@ def db_migrate(db_path: str, rename_file: bool, yes: bool, verbose: bool):
         raise click.ClickException(f"Migration failed: {e}")


+@db_cmd.command("migrate-v2")
+@click.argument("source_db", type=click.Path(exists=True))
+@click.argument("target_db", type=click.Path())
+@click.option("-v", "--verbose", is_flag=True, help="Verbose output")
+@click.option("--resume", is_flag=True, help="Resume from last completed step")
+def db_migrate_v2(source_db: str, target_db: str, verbose: bool, resume: bool):
+    """
+    Migrate database from v1 schema to v2 normalized schema.
+
+    Creates a NEW database file with the v2 normalized schema.
+    The original database is preserved unchanged.
+
+    Use --resume to continue a migration that was interrupted.
+
+    \b
+    V2 changes:
+    - TEXT enum fields replaced with INTEGER foreign keys
+    - New enum lookup tables (source_types, people_types, etc.)
+    - New roles and locations tables
+    - QIDs stored as integers (Q prefix stripped)
+    - Human-readable views for queries
+
+    \b
+    Examples:
+        corp-extractor db migrate-v2 entities.db entities-v2.db
+        corp-extractor db migrate-v2 entities.db entities-v2.db --resume
+        corp-extractor db migrate-v2 ~/.cache/corp-extractor/entities.db ./entities-v2.db -v
+    """
+    _configure_logging(verbose)
+
+    from pathlib import Path
+    from .database.migrate_v2 import migrate_database
+
+    source_path = Path(source_db)
+    target_path = Path(target_db)
+
+    if target_path.exists() and not resume:
+        raise click.ClickException(
+            f"Target database already exists: {target_path}\n"
+            "Use --resume to continue an interrupted migration."
+        )
+
+    if resume:
+        click.echo(f"Resuming migration from {source_path} to {target_path}...")
+    else:
+        click.echo(f"Migrating {source_path} to {target_path}...")
+
+    try:
+        stats = migrate_database(source_path, target_path, resume=resume)
+
+        click.echo("\nMigration complete:")
+        for key, value in stats.items():
+            click.echo(f" {key}: {value:,}")
+
+    except Exception as e:
+        raise click.ClickException(f"Migration failed: {e}")
+
+
+@db_cmd.command("search-roles")
+@click.argument("query")
+@click.option("--db", "db_path", type=click.Path(), help="Database path")
+@click.option("--limit", default=10, help="Maximum results to return")
+def db_search_roles(query: str, db_path: Optional[str], limit: int):
+    """
+    Search for roles by name.
+
+    \b
+    Examples:
+        corp-extractor db search-roles "CEO"
+        corp-extractor db search-roles "Chief Executive" --limit 5
+    """
+    from .database.store import get_roles_database
+
+    roles_db = get_roles_database(db_path)
+    results = roles_db.search(query, top_k=limit)
+
+    if not results:
+        click.echo(f"No roles found matching '{query}'")
+        return
+
+    click.echo(f"Found {len(results)} role(s) matching '{query}':")
+    for role_id, name, score in results:
+        click.echo(f" [{role_id}] {name} (score: {score:.2f})")
+
+
+@db_cmd.command("search-locations")
+@click.argument("query")
+@click.option("--db", "db_path", type=click.Path(), help="Database path")
+@click.option("--type", "location_type", type=str, help="Filter by simplified type (country, city, etc.)")
+@click.option("--limit", default=10, help="Maximum results to return")
+def db_search_locations(query: str, db_path: Optional[str], location_type: Optional[str], limit: int):
+    """
+    Search for locations by name.
+
+    \b
+    Examples:
+        corp-extractor db search-locations "California"
+        corp-extractor db search-locations "Paris" --type city
+        corp-extractor db search-locations "Germany" --type country
+    """
+    from .database.store import get_locations_database
+
+    locations_db = get_locations_database(db_path)
+    results = locations_db.search(query, top_k=limit, simplified_type=location_type)
+
+    if not results:
+        click.echo(f"No locations found matching '{query}'")
+        return
+
+    click.echo(f"Found {len(results)} location(s) matching '{query}':")
+    for loc_id, name, score in results:
+        click.echo(f" [{loc_id}] {name} (score: {score:.2f})")
+
+
+@db_cmd.command("import-locations")
+@click.option("--from-pycountry", is_flag=True, help="Import countries from pycountry")
+@click.option("--db", "db_path", type=click.Path(), help="Database path")
+@click.option("-v", "--verbose", is_flag=True, help="Verbose output")
+def db_import_locations(from_pycountry: bool, db_path: Optional[str], verbose: bool):
+    """
+    Import locations into the database.
+
+    \b
+    Examples:
+        corp-extractor db import-locations --from-pycountry
+    """
+    _configure_logging(verbose)
+
+    if not from_pycountry:
+        raise click.UsageError("Must specify --from-pycountry")
+
+    from .database.store import get_locations_database
+
+    locations_db = get_locations_database(db_path)
+    count = locations_db.import_from_pycountry()
+
+    click.echo(f"Imported {count:,} locations from pycountry")
+
+
 # =============================================================================
 # Document commands
 # =============================================================================