corp-extractor 0.9.0-py3-none-any.whl → 0.9.4-py3-none-any.whl
This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only and reflects the changes between those published versions.
- {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/METADATA +72 -11
- {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/RECORD +34 -27
- statement_extractor/cli.py +1317 -101
- statement_extractor/database/embeddings.py +45 -0
- statement_extractor/database/hub.py +86 -136
- statement_extractor/database/importers/__init__.py +10 -2
- statement_extractor/database/importers/companies_house.py +16 -2
- statement_extractor/database/importers/companies_house_officers.py +431 -0
- statement_extractor/database/importers/gleif.py +23 -0
- statement_extractor/database/importers/import_utils.py +264 -0
- statement_extractor/database/importers/sec_edgar.py +17 -0
- statement_extractor/database/importers/sec_form4.py +512 -0
- statement_extractor/database/importers/wikidata.py +151 -43
- statement_extractor/database/importers/wikidata_dump.py +2282 -0
- statement_extractor/database/importers/wikidata_people.py +867 -325
- statement_extractor/database/migrate_v2.py +852 -0
- statement_extractor/database/models.py +155 -7
- statement_extractor/database/schema_v2.py +409 -0
- statement_extractor/database/seed_data.py +359 -0
- statement_extractor/database/store.py +3449 -233
- statement_extractor/document/deduplicator.py +10 -12
- statement_extractor/extractor.py +1 -1
- statement_extractor/models/__init__.py +3 -2
- statement_extractor/models/statement.py +15 -17
- statement_extractor/models.py +1 -1
- statement_extractor/pipeline/context.py +5 -5
- statement_extractor/pipeline/orchestrator.py +12 -12
- statement_extractor/plugins/base.py +17 -17
- statement_extractor/plugins/extractors/gliner2.py +28 -28
- statement_extractor/plugins/qualifiers/embedding_company.py +7 -5
- statement_extractor/plugins/qualifiers/person.py +120 -53
- statement_extractor/plugins/splitters/t5_gemma.py +35 -39
- {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/WHEEL +0 -0
- {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/entry_points.txt +0 -0
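The bulk of the cli.py changes below are mechanical: every importer command that used to embed a batch and call insert_batch(records, embeddings) now asks the embedder for float32 and int8 vectors together and passes both to the store. A minimal sketch of the new batching shape, with the iterator name iter_records standing in for whatever source each command actually reads:

    records, count, batch_size = [], 0, 1000
    for record in importer.iter_records():  # hypothetical iterator; each command streams its own source
        records.append(record)
        if len(records) >= batch_size:
            names = [r.name for r in records]
            # New in 0.9.4: one call returns float32 and int8 (scalar-quantized) embeddings
            embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(names)
            database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
            count += len(records)
            records = []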
statement_extractor/cli.py
CHANGED

@@ -439,7 +439,7 @@ def _print_pipeline_json(ctx):
     """Print pipeline results as JSON."""
     output = {
         "statement_count": ctx.statement_count,
-        "
+        "split_sentences": [s.model_dump() for s in ctx.split_sentences],
         "statements": [s.model_dump() for s in ctx.statements],
         "labeled_statements": [stmt.as_dict() for stmt in ctx.labeled_statements],
         "timings": ctx.stage_timings,
@@ -472,9 +472,10 @@ def _print_pipeline_triples(ctx):
     elif ctx.statements:
         for stmt in ctx.statements:
             click.echo(f"{stmt.subject.text}\t{stmt.predicate}\t{stmt.object.text}")
-    elif ctx.
-
-
+    elif ctx.split_sentences:
+        # Stage 1 only output - just show the split sentences (no triples yet)
+        for sentence in ctx.split_sentences:
+            click.echo(sentence.text)


 def _print_pipeline_table(ctx, verbose: bool):
@@ -528,20 +529,16 @@ def _print_pipeline_table(ctx, verbose: bool):

         click.echo("-" * 80)

-    elif ctx.
-        click.echo(f"\
+    elif ctx.split_sentences:
+        click.echo(f"\nSplit into {len(ctx.split_sentences)} atomic sentence(s):\n")
         click.echo("-" * 80)

-        for i,
-
-            click.echo(f"
-            click.echo(f" {triple.object_text}")
+        for i, sentence in enumerate(ctx.split_sentences, 1):
+            text_preview = sentence.text[:100] + "..." if len(sentence.text) > 100 else sentence.text
+            click.echo(f"{i}. {text_preview}")

             if verbose:
-                click.echo(f" Confidence: {
-                if triple.source_sentence:
-                    source = triple.source_sentence[:60] + "..." if len(triple.source_sentence) > 60 else triple.source_sentence
-                    click.echo(f" Source: \"{source}\"")
+                click.echo(f" Confidence: {sentence.confidence:.2f}")

             click.echo("-" * 80)

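With split_sentences now serialized in the JSON output and used as the stage-1-only fallback in the triples and table printers, a run that stops after sentence splitting produces output shaped roughly like the dict below. Values are illustrative; only the text and confidence fields are actually referenced elsewhere in this diff:

    output = {
        "statement_count": 0,
        "split_sentences": [
            {"text": "Apple acquired Shazam in 2018.", "confidence": 0.93},
        ],
        "statements": [],
        "labeled_statements": [],
        "timings": {"split": 0.42},  # key name illustrative; real keys come from ctx.stage_timings
    }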
@@ -666,22 +663,27 @@ def db_cmd():
     Commands:
       import-gleif Import GLEIF LEI data (~3M records)
       import-sec Import SEC Edgar bulk data (~100K+ filers)
+      import-sec-officers Import SEC Form 4 officers/directors
+      import-ch-officers Import UK Companies House officers (Prod195)
       import-companies-house Import UK Companies House (~5M records)
-      import-wikidata Import Wikidata organizations
-      import-people Import Wikidata notable people
+      import-wikidata Import Wikidata organizations (SPARQL, may timeout)
+      import-people Import Wikidata notable people (SPARQL, may timeout)
+      import-wikidata-dump Import from Wikidata JSON dump (recommended)
+      canonicalize Link equivalent records across sources
       status Show database status
       search Search for an organization
       search-people Search for a person
       download Download database from HuggingFace
-      upload Upload database with lite
+      upload Upload database with lite variant
       create-lite Create lite version (no record data)
-      compress Compress database with gzip

     \b
     Examples:
       corp-extractor db import-sec --download
+      corp-extractor db import-sec-officers --start-year 2023 --limit 10000
       corp-extractor db import-gleif --download --limit 100000
-      corp-extractor db import-
+      corp-extractor db import-wikidata-dump --download --limit 50000
+      corp-extractor db canonicalize
       corp-extractor db status
       corp-extractor db search "Apple Inc"
       corp-extractor db search-people "Tim Cook"
@@ -781,10 +783,10 @@ def db_import_gleif(file_path: Optional[str], download: bool, force: bool, db_pa
         records.append(record)

         if len(records) >= batch_size:
-            # Embed and insert batch
+            # Embed and insert batch (both float32 and int8)
             names = [r.name for r in records]
-            embeddings = embedder.
-            database.insert_batch(records, embeddings)
+            embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(names)
+            database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
             count += len(records)
             click.echo(f"Imported {count} records...", err=True)
             records = []
@@ -792,8 +794,8 @@ def db_import_gleif(file_path: Optional[str], download: bool, force: bool, db_pa
     # Final batch
     if records:
         names = [r.name for r in records]
-        embeddings = embedder.
-        database.insert_batch(records, embeddings)
+        embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(names)
+        database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
         count += len(records)

     click.echo(f"\nImported {count} GLEIF records successfully.", err=True)
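The embed_batch_and_quantize helper itself lives in database/embeddings.py (+45 lines in the summary above, not shown in this hunk), so its implementation is an assumption here. A common way to derive the int8 "scalar" vectors from the float32 ones is per-vector absmax quantization, sketched below:

    import numpy as np

    def embed_batch_and_quantize(self, texts):
        # Sketch only; the real CompanyEmbedder internals are not part of this diff.
        vecs = np.asarray(self.embed_batch(texts), dtype=np.float32)  # assumes an existing float32 batch method
        scales = np.abs(vecs).max(axis=1, keepdims=True) + 1e-12      # one scale per vector
        int8 = np.clip(np.round(vecs / scales * 127), -127, 127).astype(np.int8)
        return vecs, int8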
@@ -851,8 +853,8 @@ def db_import_sec(download: bool, file_path: Optional[str], db_path: Optional[st

         if len(records) >= batch_size:
             names = [r.name for r in records]
-            embeddings = embedder.
-            database.insert_batch(records, embeddings)
+            embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(names)
+            database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
             count += len(records)
             click.echo(f"Imported {count} records...", err=True)
             records = []
@@ -860,14 +862,221 @@ def db_import_sec(download: bool, file_path: Optional[str], db_path: Optional[st
     # Final batch
     if records:
         names = [r.name for r in records]
-        embeddings = embedder.
-        database.insert_batch(records, embeddings)
+        embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(names)
+        database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
         count += len(records)

     click.echo(f"\nImported {count} SEC Edgar records successfully.", err=True)
     database.close()


873
|
+
@db_cmd.command("import-sec-officers")
|
|
874
|
+
@click.option("--db", "db_path", type=click.Path(), help="Database path")
|
|
875
|
+
@click.option("--start-year", type=int, default=2020, help="Start year (default: 2020)")
|
|
876
|
+
@click.option("--end-year", type=int, help="End year (default: current year)")
|
|
877
|
+
@click.option("--limit", type=int, help="Limit number of records")
|
|
878
|
+
@click.option("--batch-size", type=int, default=1000, help="Batch size for commits (default: 1000)")
|
|
879
|
+
@click.option("--resume", is_flag=True, help="Resume from saved progress")
|
|
880
|
+
@click.option("--skip-existing", is_flag=True, help="Skip records that already exist")
|
|
881
|
+
@click.option("-v", "--verbose", is_flag=True, help="Verbose output")
|
|
882
|
+
def db_import_sec_officers(db_path: Optional[str], start_year: int, end_year: Optional[int], limit: Optional[int], batch_size: int, resume: bool, skip_existing: bool, verbose: bool):
|
|
883
|
+
"""
|
|
884
|
+
Import SEC Form 4 insider data into the people database.
|
|
885
|
+
|
|
886
|
+
Downloads Form 4 filings from SEC EDGAR and extracts officers, directors,
|
|
887
|
+
and significant investors (10%+ owners) from each company.
|
|
888
|
+
|
|
889
|
+
Form 4 filings are submitted when insiders buy or sell company stock.
|
|
890
|
+
They contain the person's name, role (officer/director), and company.
|
|
891
|
+
|
|
892
|
+
Rate limited to 5 requests/second to comply with SEC guidelines.
|
|
893
|
+
|
|
894
|
+
\b
|
|
895
|
+
Examples:
|
|
896
|
+
corp-extractor db import-sec-officers --limit 1000
|
|
897
|
+
corp-extractor db import-sec-officers --start-year 2023
|
|
898
|
+
corp-extractor db import-sec-officers --resume
|
|
899
|
+
corp-extractor db import-sec-officers --skip-existing -v
|
|
900
|
+
"""
|
|
901
|
+
_configure_logging(verbose)
|
|
902
|
+
|
|
903
|
+
from .database.store import get_person_database, get_database, DEFAULT_DB_PATH
|
|
904
|
+
from .database.embeddings import CompanyEmbedder
|
|
905
|
+
from .database.importers.sec_form4 import SecForm4Importer
|
|
906
|
+
|
|
907
|
+
# Default database path
|
|
908
|
+
if db_path is None:
|
|
909
|
+
db_path_obj = DEFAULT_DB_PATH
|
|
910
|
+
else:
|
|
911
|
+
db_path_obj = Path(db_path)
|
|
912
|
+
|
|
913
|
+
click.echo(f"Importing SEC Form 4 officers/directors to {db_path_obj}...", err=True)
|
|
914
|
+
click.echo(f"Year range: {start_year} - {end_year or 'current'}", err=True)
|
|
915
|
+
if resume:
|
|
916
|
+
click.echo("Resuming from saved progress...", err=True)
|
|
917
|
+
|
|
918
|
+
# Initialize components
|
|
919
|
+
database = get_person_database(db_path=db_path_obj)
|
|
920
|
+
org_database = get_database(db_path=db_path_obj)
|
|
921
|
+
embedder = CompanyEmbedder()
|
|
922
|
+
importer = SecForm4Importer()
|
|
923
|
+
|
|
924
|
+
# Import records in batches
|
|
925
|
+
records = []
|
|
926
|
+
count = 0
|
|
927
|
+
skipped_existing = 0
|
|
928
|
+
|
|
929
|
+
def progress_callback(year: int, quarter: int, filing_idx: int, accession: str, total: int) -> None:
|
|
930
|
+
if verbose and filing_idx % 100 == 0:
|
|
931
|
+
click.echo(f" {year} Q{quarter}: {filing_idx} filings, {total} records", err=True)
|
|
932
|
+
|
|
933
|
+
for record in importer.import_range(
|
|
934
|
+
start_year=start_year,
|
|
935
|
+
end_year=end_year,
|
|
936
|
+
limit=limit,
|
|
937
|
+
resume=resume,
|
|
938
|
+
progress_callback=progress_callback,
|
|
939
|
+
):
|
|
940
|
+
# Skip existing records if flag is set
|
|
941
|
+
if skip_existing:
|
|
942
|
+
existing = database.get_by_source_id(record.source, record.source_id)
|
|
943
|
+
if existing is not None:
|
|
944
|
+
skipped_existing += 1
|
|
945
|
+
continue
|
|
946
|
+
|
|
947
|
+
# Look up org ID by CIK if available
|
|
948
|
+
issuer_cik = record.record.get("issuer_cik", "")
|
|
949
|
+
if issuer_cik:
|
|
950
|
+
org_id = org_database.get_id_by_source_id("sec_edgar", issuer_cik.zfill(10))
|
|
951
|
+
if org_id is not None:
|
|
952
|
+
record.known_for_org_id = org_id
|
|
953
|
+
|
|
954
|
+
records.append(record)
|
|
955
|
+
|
|
956
|
+
if len(records) >= batch_size:
|
|
957
|
+
embedding_texts = [r.get_embedding_text() for r in records]
|
|
958
|
+
embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(embedding_texts)
|
|
959
|
+
database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
|
|
960
|
+
count += len(records)
|
|
961
|
+
click.echo(f"Imported {count} records...", err=True)
|
|
962
|
+
records = []
|
|
963
|
+
|
|
964
|
+
# Final batch
|
|
965
|
+
if records:
|
|
966
|
+
embedding_texts = [r.get_embedding_text() for r in records]
|
|
967
|
+
embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(embedding_texts)
|
|
968
|
+
database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
|
|
969
|
+
count += len(records)
|
|
970
|
+
|
|
971
|
+
if skip_existing and skipped_existing > 0:
|
|
972
|
+
click.echo(f"\nImported {count} SEC officers/directors (skipped {skipped_existing} existing).", err=True)
|
|
973
|
+
else:
|
|
974
|
+
click.echo(f"\nImported {count} SEC officers/directors successfully.", err=True)
|
|
975
|
+
|
|
976
|
+
org_database.close()
|
|
977
|
+
database.close()
|
|
978
|
+
|
|
979
|
+
|
|
980
|
+
@db_cmd.command("import-ch-officers")
|
|
981
|
+
@click.option("--file", "file_path", type=click.Path(exists=True), required=True, help="Path to CH officers zip file (Prod195)")
|
|
982
|
+
@click.option("--db", "db_path", type=click.Path(), help="Database path")
|
|
983
|
+
@click.option("--limit", type=int, help="Limit number of records")
|
|
984
|
+
@click.option("--batch-size", type=int, default=1000, help="Batch size for commits (default: 1000)")
|
|
985
|
+
@click.option("--resume", is_flag=True, help="Resume from saved progress")
|
|
986
|
+
@click.option("--include-resigned", is_flag=True, help="Include resigned officers (default: current only)")
|
|
987
|
+
@click.option("--skip-existing", is_flag=True, help="Skip records that already exist")
|
|
988
|
+
@click.option("-v", "--verbose", is_flag=True, help="Verbose output")
|
|
989
|
+
def db_import_ch_officers(file_path: str, db_path: Optional[str], limit: Optional[int], batch_size: int, resume: bool, include_resigned: bool, skip_existing: bool, verbose: bool):
|
|
990
|
+
"""
|
|
991
|
+
Import Companies House officers data into the people database.
|
|
992
|
+
|
|
993
|
+
Requires the Prod195 bulk officers zip file from Companies House.
|
|
994
|
+
Request access via BulkProducts@companieshouse.gov.uk.
|
|
995
|
+
|
|
996
|
+
\b
|
|
997
|
+
Examples:
|
|
998
|
+
corp-extractor db import-ch-officers --file officers.zip --limit 10000
|
|
999
|
+
corp-extractor db import-ch-officers --file officers.zip --resume
|
|
1000
|
+
corp-extractor db import-ch-officers --file officers.zip --include-resigned
|
|
1001
|
+
"""
|
|
1002
|
+
_configure_logging(verbose)
|
|
1003
|
+
|
|
1004
|
+
from .database.store import get_person_database, get_database, DEFAULT_DB_PATH
|
|
1005
|
+
from .database.embeddings import CompanyEmbedder
|
|
1006
|
+
from .database.importers.companies_house_officers import CompaniesHouseOfficersImporter
|
|
1007
|
+
|
|
1008
|
+
# Default database path
|
|
1009
|
+
if db_path is None:
|
|
1010
|
+
db_path_obj = DEFAULT_DB_PATH
|
|
1011
|
+
else:
|
|
1012
|
+
db_path_obj = Path(db_path)
|
|
1013
|
+
|
|
1014
|
+
click.echo(f"Importing Companies House officers to {db_path_obj}...", err=True)
|
|
1015
|
+
if resume:
|
|
1016
|
+
click.echo("Resuming from saved progress...", err=True)
|
|
1017
|
+
|
|
1018
|
+
# Initialize components
|
|
1019
|
+
database = get_person_database(db_path=db_path_obj)
|
|
1020
|
+
org_database = get_database(db_path=db_path_obj)
|
|
1021
|
+
embedder = CompanyEmbedder()
|
|
1022
|
+
importer = CompaniesHouseOfficersImporter()
|
|
1023
|
+
|
|
1024
|
+
# Import records in batches
|
|
1025
|
+
records = []
|
|
1026
|
+
count = 0
|
|
1027
|
+
skipped_existing = 0
|
|
1028
|
+
|
|
1029
|
+
def progress_callback(file_idx: int, line_num: int, total: int) -> None:
|
|
1030
|
+
if verbose:
|
|
1031
|
+
click.echo(f" File {file_idx}: line {line_num}, {total} records", err=True)
|
|
1032
|
+
|
|
1033
|
+
for record in importer.import_from_zip(
|
|
1034
|
+
file_path,
|
|
1035
|
+
limit=limit,
|
|
1036
|
+
resume=resume,
|
|
1037
|
+
current_only=not include_resigned,
|
|
1038
|
+
progress_callback=progress_callback,
|
|
1039
|
+
):
|
|
1040
|
+
# Skip existing records if flag is set
|
|
1041
|
+
if skip_existing:
|
|
1042
|
+
existing = database.get_by_source_id(record.source, record.source_id)
|
|
1043
|
+
if existing is not None:
|
|
1044
|
+
skipped_existing += 1
|
|
1045
|
+
continue
|
|
1046
|
+
|
|
1047
|
+
# Look up org ID by company number if available
|
|
1048
|
+
company_number = record.record.get("company_number", "")
|
|
1049
|
+
if company_number:
|
|
1050
|
+
org_id = org_database.get_id_by_source_id("companies_house", company_number)
|
|
1051
|
+
if org_id is not None:
|
|
1052
|
+
record.known_for_org_id = org_id
|
|
1053
|
+
|
|
1054
|
+
records.append(record)
|
|
1055
|
+
|
|
1056
|
+
if len(records) >= batch_size:
|
|
1057
|
+
embedding_texts = [r.get_embedding_text() for r in records]
|
|
1058
|
+
embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(embedding_texts)
|
|
1059
|
+
database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
|
|
1060
|
+
count += len(records)
|
|
1061
|
+
click.echo(f"Imported {count} records...", err=True)
|
|
1062
|
+
records = []
|
|
1063
|
+
|
|
1064
|
+
# Final batch
|
|
1065
|
+
if records:
|
|
1066
|
+
embedding_texts = [r.get_embedding_text() for r in records]
|
|
1067
|
+
embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(embedding_texts)
|
|
1068
|
+
database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
|
|
1069
|
+
count += len(records)
|
|
1070
|
+
|
|
1071
|
+
if skip_existing and skipped_existing > 0:
|
|
1072
|
+
click.echo(f"\nImported {count} CH officers (skipped {skipped_existing} existing).", err=True)
|
|
1073
|
+
else:
|
|
1074
|
+
click.echo(f"\nImported {count} CH officers successfully.", err=True)
|
|
1075
|
+
|
|
1076
|
+
org_database.close()
|
|
1077
|
+
database.close()
|
|
1078
|
+
|
|
1079
|
+
|
|
871
1080
|
@db_cmd.command("import-wikidata")
|
|
872
1081
|
@click.option("--db", "db_path", type=click.Path(), help="Database path")
|
|
873
1082
|
@click.option("--limit", type=int, help="Limit number of records")
|
|
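The two officer importers added above (import-sec-officers and import-ch-officers) share one skeleton that is hard to follow in this rendering: stream records from the source, optionally skip ones already stored, link each person to an organization row when the company identifier resolves, then embed and insert in batches. Condensed from the diff, with database, org_database and embedder coming from the surrounding command:

    for record in importer.import_range(start_year=start_year, end_year=end_year, limit=limit):
        if skip_existing and database.get_by_source_id(record.source, record.source_id):
            skipped_existing += 1
            continue
        issuer_cik = record.record.get("issuer_cik", "")
        if issuer_cik:
            org_id = org_database.get_id_by_source_id("sec_edgar", issuer_cik.zfill(10))
            if org_id is not None:
                record.known_for_org_id = org_id  # link the person to the issuing company
        records.append(record)
        if len(records) >= batch_size:
            embedding_texts = [r.get_embedding_text() for r in records]
            embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(embedding_texts)
            database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
            records = []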
@@ -921,8 +1130,8 @@ def db_import_wikidata(db_path: Optional[str], limit: Optional[int], batch_size:

         if len(records) >= batch_size:
             names = [r.name for r in records]
-            embeddings = embedder.
-            database.insert_batch(records, embeddings)
+            embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(names)
+            database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
             count += len(records)
             click.echo(f"Imported {count} records...", err=True)
             records = []
@@ -930,8 +1139,8 @@ def db_import_wikidata(db_path: Optional[str], limit: Optional[int], batch_size:
     # Final batch
     if records:
         names = [r.name for r in records]
-        embeddings = embedder.
-        database.insert_batch(records, embeddings)
+        embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(names)
+        database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
         count += len(records)

     click.echo(f"\nImported {count} Wikidata records successfully.", err=True)
@@ -947,23 +1156,32 @@ def db_import_people(db_path: Optional[str], limit: Optional[int], batch_size: i
     "academic", "scientist", "journalist", "entrepreneur", "activist"
 ]), default="executive", help="Person type to import")
 @click.option("--all", "import_all", is_flag=True, help="Run all person type queries sequentially")
+@click.option("--enrich", is_flag=True, help="Query individual people to get role/org data (slower, resumable)")
+@click.option("--enrich-only", is_flag=True, help="Only enrich existing people (skip bulk import)")
+@click.option("--enrich-dates", is_flag=True, help="Query individual people to get start/end dates (slower)")
+@click.option("--skip-existing", is_flag=True, help="Skip records that already exist (default: update them)")
 @click.option("-v", "--verbose", is_flag=True, help="Verbose output")
-def db_import_people(db_path: Optional[str], limit: Optional[int], batch_size: int, query_type: str, import_all: bool, verbose: bool):
+def db_import_people(db_path: Optional[str], limit: Optional[int], batch_size: int, query_type: str, import_all: bool, enrich: bool, enrich_only: bool, enrich_dates: bool, skip_existing: bool, verbose: bool):
     """
     Import notable people data from Wikidata via SPARQL.

+    Uses a two-phase approach for reliability:
+    1. Bulk import: Fast fetch of QID, name, country (no timeouts)
+    2. Enrich (optional): Per-person queries for role/org/dates
+
     Imports people with English Wikipedia articles (ensures notability).
-    Includes executives, politicians, athletes, artists, academics, and more.

     \b
     Examples:
       corp-extractor db import-people --type executive --limit 5000
       corp-extractor db import-people --all --limit 10000
+      corp-extractor db import-people --type executive --enrich
+      corp-extractor db import-people --enrich-only --limit 100
       corp-extractor db import-people --type politician -v
     """
     _configure_logging(verbose)

-    from .database.store import get_person_database, DEFAULT_DB_PATH
+    from .database.store import get_person_database, get_database, DEFAULT_DB_PATH
     from .database.embeddings import CompanyEmbedder
     from .database.importers.wikidata_people import WikidataPeopleImporter

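The body of db_import_people (next hunk, largely unreadable in this rendering) implements the two-phase flow described in the docstring. Phase 2 reduces to: collect people that still lack a role/org, enrich them in parallel via per-person SPARQL queries, then persist the new fields together with a fresh embedding. Condensed from the diff:

    people_to_enrich = [p for p in database.iter_records()
                        if not p.known_for_role and not p.known_for_org]
    importer.enrich_people_role_org_batch(people_to_enrich, delay_seconds=0.1, max_workers=5)
    for person in people_to_enrich:
        if person.known_for_role or person.known_for_org:
            new_embedding = embedder.embed(person.get_embedding_text())
            database.update_role_org(
                person.source, person.source_id,
                person.known_for_role, person.known_for_org,
                person.known_for_org_id, new_embedding,
                person.from_date, person.to_date,
            )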
@@ -977,35 +1195,677 @@ def db_import_people(db_path: Optional[str], limit: Optional[int], batch_size: i
|
|
|
977
1195
|
|
|
978
1196
|
# Initialize components
|
|
979
1197
|
database = get_person_database(db_path=db_path_obj)
|
|
1198
|
+
org_database = get_database(db_path=db_path_obj)
|
|
980
1199
|
embedder = CompanyEmbedder()
|
|
981
1200
|
importer = WikidataPeopleImporter(batch_size=batch_size)
|
|
982
1201
|
|
|
983
|
-
# Batch processing
|
|
984
|
-
records = []
|
|
985
1202
|
count = 0
|
|
986
1203
|
|
|
987
|
-
|
|
988
|
-
|
|
1204
|
+
# Phase 1: Bulk import (fast, minimal data) - skip if --enrich-only
|
|
1205
|
+
if not enrich_only:
|
|
1206
|
+
records = []
|
|
1207
|
+
skipped_existing = 0
|
|
989
1208
|
|
|
990
|
-
|
|
991
|
-
|
|
1209
|
+
click.echo("Phase 1: Bulk import (QID, name, country)...", err=True)
|
|
1210
|
+
|
|
1211
|
+
for record in importer.import_from_sparql(limit=limit, query_type=query_type, import_all=import_all):
|
|
1212
|
+
# Skip existing records if flag is set
|
|
1213
|
+
if skip_existing:
|
|
1214
|
+
existing = database.get_by_source_id(record.source, record.source_id)
|
|
1215
|
+
if existing is not None:
|
|
1216
|
+
skipped_existing += 1
|
|
1217
|
+
continue
|
|
1218
|
+
|
|
1219
|
+
records.append(record)
|
|
1220
|
+
|
|
1221
|
+
if len(records) >= batch_size:
|
|
1222
|
+
# Generate embeddings (both float32 and int8)
|
|
1223
|
+
embedding_texts = [r.get_embedding_text() for r in records]
|
|
1224
|
+
embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(embedding_texts)
|
|
1225
|
+
database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
|
|
1226
|
+
count += len(records)
|
|
1227
|
+
|
|
1228
|
+
click.echo(f" Imported {count} people...", err=True)
|
|
1229
|
+
records = []
|
|
1230
|
+
|
|
1231
|
+
# Final batch
|
|
1232
|
+
if records:
|
|
992
1233
|
embedding_texts = [r.get_embedding_text() for r in records]
|
|
993
|
-
embeddings = embedder.
|
|
994
|
-
database.insert_batch(records, embeddings)
|
|
1234
|
+
embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(embedding_texts)
|
|
1235
|
+
database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
|
|
995
1236
|
count += len(records)
|
|
996
|
-
click.echo(f" Imported {count} people...", err=True)
|
|
997
|
-
records = []
|
|
998
1237
|
|
|
999
|
-
|
|
1000
|
-
|
|
1001
|
-
|
|
1002
|
-
|
|
1003
|
-
|
|
1004
|
-
|
|
1238
|
+
if skip_existing and skipped_existing > 0:
|
|
1239
|
+
click.echo(f"\nPhase 1 complete: {count} people imported (skipped {skipped_existing} existing).", err=True)
|
|
1240
|
+
else:
|
|
1241
|
+
click.echo(f"\nPhase 1 complete: {count} people imported.", err=True)
|
|
1242
|
+
else:
|
|
1243
|
+
click.echo("Skipping Phase 1 (bulk import) - using existing database records.", err=True)
|
|
1244
|
+
# Enable enrich if enrich_only is set
|
|
1245
|
+
enrich = True
|
|
1246
|
+
|
|
1247
|
+
# Phase 2: Enrich with role/org/dates (optional, slower but resumable)
|
|
1248
|
+
if enrich:
|
|
1249
|
+
click.echo("\nPhase 2: Enriching with role/org/dates (parallel queries)...", err=True)
|
|
1250
|
+
# Get all people without role/org
|
|
1251
|
+
people_to_enrich = []
|
|
1252
|
+
enriched_count = 0
|
|
1253
|
+
for record in database.iter_records():
|
|
1254
|
+
if not record.known_for_role and not record.known_for_org:
|
|
1255
|
+
people_to_enrich.append(record)
|
|
1256
|
+
enriched_count += 1
|
|
1257
|
+
# Apply limit if --enrich-only
|
|
1258
|
+
if enrich_only and limit and enriched_count >= limit:
|
|
1259
|
+
break
|
|
1260
|
+
|
|
1261
|
+
if people_to_enrich:
|
|
1262
|
+
click.echo(f"Found {len(people_to_enrich)} people to enrich...", err=True)
|
|
1263
|
+
importer.enrich_people_role_org_batch(people_to_enrich, delay_seconds=0.1, max_workers=5)
|
|
1264
|
+
|
|
1265
|
+
# Persist the enriched data and re-generate embeddings
|
|
1266
|
+
updated = 0
|
|
1267
|
+
org_count = 0
|
|
1268
|
+
date_count = 0
|
|
1269
|
+
for person in people_to_enrich:
|
|
1270
|
+
if person.known_for_role or person.known_for_org:
|
|
1271
|
+
# Look up org ID if we have org_qid
|
|
1272
|
+
org_qid = person.record.get("org_qid", "")
|
|
1273
|
+
if org_qid:
|
|
1274
|
+
org_id = org_database.get_id_by_source_id("wikipedia", org_qid)
|
|
1275
|
+
if org_id is not None:
|
|
1276
|
+
person.known_for_org_id = org_id
|
|
1277
|
+
|
|
1278
|
+
# Update the record with new role/org/dates and re-embed
|
|
1279
|
+
new_embedding_text = person.get_embedding_text()
|
|
1280
|
+
new_embedding = embedder.embed(new_embedding_text)
|
|
1281
|
+
if database.update_role_org(
|
|
1282
|
+
person.source, person.source_id,
|
|
1283
|
+
person.known_for_role, person.known_for_org,
|
|
1284
|
+
person.known_for_org_id, new_embedding,
|
|
1285
|
+
person.from_date, person.to_date,
|
|
1286
|
+
):
|
|
1287
|
+
updated += 1
|
|
1288
|
+
if person.known_for_org:
|
|
1289
|
+
org_count += 1
|
|
1290
|
+
if person.from_date or person.to_date:
|
|
1291
|
+
date_count += 1
|
|
1292
|
+
if verbose:
|
|
1293
|
+
date_str = ""
|
|
1294
|
+
if person.from_date or person.to_date:
|
|
1295
|
+
date_str = f" ({person.from_date or '?'} - {person.to_date or '?'})"
|
|
1296
|
+
click.echo(f" {person.name}: {person.known_for_role} at {person.known_for_org}{date_str}", err=True)
|
|
1297
|
+
|
|
1298
|
+
click.echo(f"Updated {updated} people ({org_count} with orgs, {date_count} with dates).", err=True)
|
|
1299
|
+
|
|
1300
|
+
# Phase 3: Enrich with dates (optional, even slower)
|
|
1301
|
+
if enrich_dates:
|
|
1302
|
+
click.echo("\nPhase 3: Enriching with dates...", err=True)
|
|
1303
|
+
# Get all people without dates but with role (dates are associated with positions)
|
|
1304
|
+
people_to_enrich = []
|
|
1305
|
+
for record in database.iter_records():
|
|
1306
|
+
if not record.from_date and not record.to_date and record.known_for_role:
|
|
1307
|
+
people_to_enrich.append(record)
|
|
1308
|
+
|
|
1309
|
+
if people_to_enrich:
|
|
1310
|
+
click.echo(f"Found {len(people_to_enrich)} people to enrich with dates...", err=True)
|
|
1311
|
+
enriched = importer.enrich_people_batch(people_to_enrich, delay_seconds=0.3)
|
|
1312
|
+
|
|
1313
|
+
# Persist the enriched dates
|
|
1314
|
+
updated = 0
|
|
1315
|
+
for person in people_to_enrich:
|
|
1316
|
+
if person.from_date or person.to_date:
|
|
1317
|
+
if database.update_dates(person.source, person.source_id, person.from_date, person.to_date):
|
|
1318
|
+
updated += 1
|
|
1319
|
+
if verbose:
|
|
1320
|
+
click.echo(f" {person.name}: {person.from_date or '?'} - {person.to_date or '?'}", err=True)
|
|
1321
|
+
|
|
1322
|
+
click.echo(f"Updated {updated} people with dates.", err=True)
|
|
1323
|
+
|
|
1324
|
+
org_database.close()
|
|
1325
|
+
database.close()
|
|
1326
|
+
|
|
1327
|
+
|
|
1328
|
+
@db_cmd.command("import-wikidata-dump")
|
|
1329
|
+
@click.option("--dump", "dump_path", type=click.Path(exists=True), help="Path to Wikidata JSON dump file (.bz2 or .gz)")
|
|
1330
|
+
@click.option("--download", is_flag=True, help="Download latest dump first (~100GB)")
|
|
1331
|
+
@click.option("--force", is_flag=True, help="Force re-download even if cached")
|
|
1332
|
+
@click.option("--no-aria2", is_flag=True, help="Don't use aria2c even if available (slower)")
|
|
1333
|
+
@click.option("--db", "db_path", type=click.Path(), help="Database path")
|
|
1334
|
+
@click.option("--people/--no-people", default=True, help="Import people (default: yes)")
|
|
1335
|
+
@click.option("--orgs/--no-orgs", default=True, help="Import organizations (default: yes)")
|
|
1336
|
+
@click.option("--locations/--no-locations", default=False, help="Import locations (default: no)")
|
|
1337
|
+
@click.option("--require-enwiki", is_flag=True, help="Only import orgs with English Wikipedia articles")
|
|
1338
|
+
@click.option("--resume", is_flag=True, help="Resume from last position in dump file (tracks entity index)")
|
|
1339
|
+
@click.option("--skip-updates", is_flag=True, help="Skip Q codes already in database (no updates)")
|
|
1340
|
+
@click.option("--limit", type=int, help="Max records per type (people and/or orgs)")
|
|
1341
|
+
@click.option("--batch-size", type=int, default=10000, help="Batch size for commits (default: 10000)")
|
|
1342
|
+
@click.option("-v", "--verbose", is_flag=True, help="Verbose output")
|
|
1343
|
+
def db_import_wikidata_dump(
|
|
1344
|
+
dump_path: Optional[str],
|
|
1345
|
+
download: bool,
|
|
1346
|
+
force: bool,
|
|
1347
|
+
no_aria2: bool,
|
|
1348
|
+
db_path: Optional[str],
|
|
1349
|
+
people: bool,
|
|
1350
|
+
orgs: bool,
|
|
1351
|
+
locations: bool,
|
|
1352
|
+
require_enwiki: bool,
|
|
1353
|
+
resume: bool,
|
|
1354
|
+
skip_updates: bool,
|
|
1355
|
+
limit: Optional[int],
|
|
1356
|
+
batch_size: int,
|
|
1357
|
+
verbose: bool,
|
|
1358
|
+
):
|
|
1359
|
+
"""
|
|
1360
|
+
Import people, organizations, and locations from Wikidata JSON dump.
|
|
1361
|
+
|
|
1362
|
+
This uses the full Wikidata JSON dump (~100GB compressed) to import
|
|
1363
|
+
all humans and organizations with English Wikipedia articles. This
|
|
1364
|
+
avoids SPARQL query timeouts that occur with large result sets.
|
|
1365
|
+
|
|
1366
|
+
The dump is streamed line-by-line to minimize memory usage.
|
|
1367
|
+
|
|
1368
|
+
\b
|
|
1369
|
+
Features:
|
|
1370
|
+
- No timeouts (processes locally)
|
|
1371
|
+
- Complete coverage (all notable people/orgs)
|
|
1372
|
+
- Resumable with --resume (tracks position in dump file)
|
|
1373
|
+
- Skip existing with --skip-updates (loads existing Q codes)
|
|
1374
|
+
- People like Andy Burnham are captured via occupation (P106)
|
|
1375
|
+
- Locations (countries, cities, regions) with parent hierarchy
|
|
1376
|
+
|
|
1377
|
+
\b
|
|
1378
|
+
Resume options:
|
|
1379
|
+
- --resume: Resume from where the dump processing left off (tracks entity index).
|
|
1380
|
+
Progress is saved after each batch. Use this if import was interrupted.
|
|
1381
|
+
- --skip-updates: Skip Q codes already in database (no updates to existing records).
|
|
1382
|
+
Use this to add new records without re-processing existing ones.
|
|
1383
|
+
|
|
1384
|
+
\b
|
|
1385
|
+
Examples:
|
|
1386
|
+
corp-extractor db import-wikidata-dump --dump /path/to/dump.json.bz2 --limit 10000
|
|
1387
|
+
corp-extractor db import-wikidata-dump --download --people --no-orgs --limit 50000
|
|
1388
|
+
corp-extractor db import-wikidata-dump --dump dump.json.bz2 --orgs --no-people
|
|
1389
|
+
corp-extractor db import-wikidata-dump --dump dump.json.bz2 --locations --no-people --no-orgs # Locations only
|
|
1390
|
+
corp-extractor db import-wikidata-dump --dump dump.json.bz2 --resume # Resume interrupted import
|
|
1391
|
+
corp-extractor db import-wikidata-dump --dump dump.json.bz2 --skip-updates # Skip existing Q codes
|
|
1392
|
+
"""
|
|
1393
|
+
_configure_logging(verbose)
|
|
1394
|
+
|
|
1395
|
+
from .database.store import get_person_database, get_database, DEFAULT_DB_PATH
|
|
1396
|
+
from .database.embeddings import CompanyEmbedder
|
|
1397
|
+
from .database.importers.wikidata_dump import WikidataDumpImporter, DumpProgress
|
|
1398
|
+
|
|
1399
|
+
if not dump_path and not download:
|
|
1400
|
+
raise click.UsageError("Either --dump path or --download is required")
|
|
1401
|
+
|
|
1402
|
+
if not people and not orgs and not locations:
|
|
1403
|
+
raise click.UsageError("Must import at least one of --people, --orgs, or --locations")
|
|
1404
|
+
|
|
1405
|
+
# Default database path
|
|
1406
|
+
if db_path is None:
|
|
1407
|
+
db_path_obj = DEFAULT_DB_PATH
|
|
1408
|
+
else:
|
|
1409
|
+
db_path_obj = Path(db_path)
|
|
1410
|
+
|
|
1411
|
+
click.echo(f"Importing Wikidata dump to {db_path_obj}...", err=True)
|
|
1412
|
+
|
|
1413
|
+
# Initialize importer
|
|
1414
|
+
importer = WikidataDumpImporter(dump_path=dump_path)
|
|
1415
|
+
|
|
1416
|
+
# Download if requested
|
|
1417
|
+
if download:
|
|
1418
|
+
import shutil
|
|
1419
|
+
dump_target = importer.get_dump_path()
|
|
1420
|
+
click.echo(f"Downloading Wikidata dump (~100GB) to:", err=True)
|
|
1421
|
+
click.echo(f" {dump_target}", err=True)
|
|
1422
|
+
|
|
1423
|
+
# Check for aria2c
|
|
1424
|
+
has_aria2 = shutil.which("aria2c") is not None
|
|
1425
|
+
use_aria2 = has_aria2 and not no_aria2
|
|
1426
|
+
|
|
1427
|
+
if use_aria2:
|
|
1428
|
+
click.echo(" Using aria2c for fast parallel download (16 connections)", err=True)
|
|
1429
|
+
dump_file = importer.download_dump(force=force, use_aria2=True)
|
|
1430
|
+
click.echo(f"\nUsing dump: {dump_file}", err=True)
|
|
1431
|
+
else:
|
|
1432
|
+
if not has_aria2:
|
|
1433
|
+
click.echo("", err=True)
|
|
1434
|
+
click.echo(" TIP: Install aria2c for 10-20x faster downloads:", err=True)
|
|
1435
|
+
click.echo(" brew install aria2 (macOS)", err=True)
|
|
1436
|
+
click.echo(" apt install aria2 (Ubuntu/Debian)", err=True)
|
|
1437
|
+
click.echo("", err=True)
|
|
1438
|
+
|
|
1439
|
+
# Use urllib to get content length first
|
|
1440
|
+
import urllib.request
|
|
1441
|
+
req = urllib.request.Request(
|
|
1442
|
+
"https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2",
|
|
1443
|
+
headers={"User-Agent": "corp-extractor/1.0"},
|
|
1444
|
+
method="HEAD"
|
|
1445
|
+
)
|
|
1446
|
+
with urllib.request.urlopen(req) as response:
|
|
1447
|
+
total_size = int(response.headers.get("content-length", 0))
|
|
1448
|
+
|
|
1449
|
+
if total_size:
|
|
1450
|
+
total_gb = total_size / (1024 ** 3)
|
|
1451
|
+
click.echo(f" Size: {total_gb:.1f} GB", err=True)
|
|
1452
|
+
|
|
1453
|
+
# Download with progress bar
|
|
1454
|
+
progress_bar = None
|
|
1455
|
+
|
|
1456
|
+
def update_progress(downloaded: int, total: int) -> None:
|
|
1457
|
+
nonlocal progress_bar
|
|
1458
|
+
if progress_bar is None and total > 0:
|
|
1459
|
+
progress_bar = click.progressbar(
|
|
1460
|
+
length=total,
|
|
1461
|
+
label="Downloading",
|
|
1462
|
+
show_percent=True,
|
|
1463
|
+
show_pos=True,
|
|
1464
|
+
item_show_func=lambda x: f"{(x or 0) / (1024**3):.1f} GB" if x else "",
|
|
1465
|
+
)
|
|
1466
|
+
progress_bar.__enter__()
|
|
1467
|
+
if progress_bar:
|
|
1468
|
+
# Update to absolute position
|
|
1469
|
+
progress_bar.update(downloaded - progress_bar.pos)
|
|
1470
|
+
|
|
1471
|
+
try:
|
|
1472
|
+
dump_file = importer.download_dump(force=force, use_aria2=False, progress_callback=update_progress)
|
|
1473
|
+
finally:
|
|
1474
|
+
if progress_bar:
|
|
1475
|
+
progress_bar.__exit__(None, None, None)
|
|
1476
|
+
|
|
1477
|
+
click.echo(f"\nUsing dump: {dump_file}", err=True)
|
|
1478
|
+
elif dump_path:
|
|
1479
|
+
click.echo(f"Using dump: {dump_path}", err=True)
|
|
1480
|
+
|
|
1481
|
+
# Initialize embedder (loads model, may take time on first run)
|
|
1482
|
+
click.echo("Loading embedding model...", err=True)
|
|
1483
|
+
sys.stderr.flush()
|
|
1484
|
+
embedder = CompanyEmbedder()
|
|
1485
|
+
click.echo("Embedding model loaded.", err=True)
|
|
1486
|
+
sys.stderr.flush()
|
|
1487
|
+
|
|
1488
|
+
# Load existing QID labels from database and seed the importer's cache
|
|
1489
|
+
database = get_person_database(db_path=db_path_obj)
|
|
1490
|
+
existing_labels = database.get_all_qid_labels()
|
|
1491
|
+
if existing_labels:
|
|
1492
|
+
click.echo(f"Loaded {len(existing_labels):,} existing QID labels from DB", err=True)
|
|
1493
|
+
importer.set_label_cache(existing_labels)
|
|
1494
|
+
known_qids_at_start = set(existing_labels.keys())
|
|
1495
|
+
|
|
1496
|
+
# Load existing source_ids for skip_updates mode
|
|
1497
|
+
existing_people_ids: set[str] = set()
|
|
1498
|
+
existing_org_ids: set[str] = set()
|
|
1499
|
+
if skip_updates:
|
|
1500
|
+
click.echo("Loading existing records for --skip-updates...", err=True)
|
|
1501
|
+
if people:
|
|
1502
|
+
existing_people_ids = database.get_all_source_ids(source="wikidata")
|
|
1503
|
+
click.echo(f" Found {len(existing_people_ids):,} existing people Q codes", err=True)
|
|
1504
|
+
if orgs:
|
|
1505
|
+
org_database = get_database(db_path=db_path_obj)
|
|
1506
|
+
existing_org_ids = org_database.get_all_source_ids(source="wikipedia")
|
|
1507
|
+
click.echo(f" Found {len(existing_org_ids):,} existing org Q codes", err=True)
|
|
1508
|
+
|
|
1509
|
+
# Load progress for resume mode (position-based resume)
|
|
1510
|
+
progress: Optional[DumpProgress] = None
|
|
1511
|
+
start_index = 0
|
|
1512
|
+
if resume:
|
|
1513
|
+
progress = DumpProgress.load()
|
|
1514
|
+
if progress:
|
|
1515
|
+
# Verify the progress is for the same dump file
|
|
1516
|
+
actual_dump_path = importer._dump_path or Path(dump_path) if dump_path else importer.get_dump_path()
|
|
1517
|
+
if progress.matches_dump(actual_dump_path):
|
|
1518
|
+
start_index = progress.entity_index
|
|
1519
|
+
click.echo(f"Resuming from entity index {start_index:,}", err=True)
|
|
1520
|
+
click.echo(f" Last entity: {progress.last_entity_id}", err=True)
|
|
1521
|
+
click.echo(f" Last updated: {progress.last_updated}", err=True)
|
|
1522
|
+
else:
|
|
1523
|
+
click.echo("Warning: Progress file is for a different dump, starting from beginning", err=True)
|
|
1524
|
+
progress = None
|
|
1525
|
+
else:
|
|
1526
|
+
click.echo("No progress file found, starting from beginning", err=True)
|
|
1527
|
+
|
|
1528
|
+
# Initialize progress tracking
|
|
1529
|
+
if progress is None:
|
|
1530
|
+
actual_dump_path = importer._dump_path or Path(dump_path) if dump_path else importer.get_dump_path()
|
|
1531
|
+
progress = DumpProgress(
|
|
1532
|
+
dump_path=str(actual_dump_path),
|
|
1533
|
+
dump_size=actual_dump_path.stat().st_size if actual_dump_path.exists() else 0,
|
|
1534
|
+
)
|
|
1535
|
+
|
|
1536
|
+
# Helper to persist new labels after each batch
|
|
1537
|
+
def persist_new_labels() -> int:
|
|
1538
|
+
new_labels = importer.get_new_labels_since(known_qids_at_start)
|
|
1539
|
+
if new_labels:
|
|
1540
|
+
database.insert_qid_labels(new_labels)
|
|
1541
|
+
known_qids_at_start.update(new_labels.keys())
|
|
1542
|
+
return len(new_labels)
|
|
1543
|
+
return 0
|
|
1544
|
+
|
|
1545
|
+
# ========================================
|
|
1546
|
+
# Location-only import (separate pass)
|
|
1547
|
+
# ========================================
|
|
1548
|
+
if locations and not people and not orgs:
|
|
1549
|
+
from .database.store import get_locations_database
|
|
1550
|
+
|
|
1551
|
+
click.echo("\n=== Location Import ===", err=True)
|
|
1552
|
+
click.echo(f" Locations: {'up to ' + str(limit) + ' records' if limit else 'unlimited'}", err=True)
|
|
1553
|
+
if require_enwiki:
|
|
1554
|
+
click.echo(" Filter: only locations with English Wikipedia articles", err=True)
|
|
1555
|
+
|
|
1556
|
+
# Initialize locations database
|
|
1557
|
+
locations_database = get_locations_database(db_path=db_path_obj)
|
|
1558
|
+
|
|
1559
|
+
# Load existing location Q codes for skip_updates mode
|
|
1560
|
+
existing_location_ids: set[str] = set()
|
|
1561
|
+
if skip_updates:
|
|
1562
|
+
existing_location_ids = locations_database.get_all_source_ids(source="wikidata")
|
|
1563
|
+
click.echo(f" Skip updates: {len(existing_location_ids):,} existing Q codes", err=True)
|
|
1564
|
+
|
|
1565
|
+
if start_index > 0:
|
|
1566
|
+
click.echo(f" Resuming from entity index {start_index:,}", err=True)
|
|
1567
|
+
|
|
1568
|
+
location_records: list = []
|
|
1569
|
+
locations_count = 0
|
|
1570
|
+
last_entity_index = start_index
|
|
1571
|
+
last_entity_id = ""
|
|
1572
|
+
|
|
1573
|
+
def location_progress_callback(entity_index: int, entity_id: str, loc_count: int) -> None:
|
|
1574
|
+
nonlocal last_entity_index, last_entity_id
|
|
1575
|
+
last_entity_index = entity_index
|
|
1576
|
+
last_entity_id = entity_id
|
|
1577
|
+
|
|
1578
|
+
def save_location_progress() -> None:
|
|
1579
|
+
if progress:
|
|
1580
|
+
progress.entity_index = last_entity_index
|
|
1581
|
+
progress.last_entity_id = last_entity_id
|
|
1582
|
+
progress.save()
|
|
1583
|
+
|
|
1584
|
+
def flush_location_batch() -> None:
|
|
1585
|
+
nonlocal location_records, locations_count
|
|
1586
|
+
if location_records:
|
|
1587
|
+
inserted = locations_database.insert_batch(location_records)
|
|
1588
|
+
locations_count += inserted
|
|
1589
|
+
location_records = []
|
|
1590
|
+
|
|
1591
|
+
click.echo("Starting dump iteration...", err=True)
|
|
1592
|
+
sys.stderr.flush()
|
|
1593
|
+
|
|
1594
|
+
try:
|
|
1595
|
+
if limit:
|
|
1596
|
+
# Use progress bar when we have limits
|
|
1597
|
+
with click.progressbar(
|
|
1598
|
+
length=limit,
|
|
1599
|
+
label="Processing dump",
|
|
1600
|
+
show_percent=True,
|
|
1601
|
+
show_pos=True,
|
|
1602
|
+
) as pbar:
|
|
1603
|
+
for record in importer.import_locations(
|
|
1604
|
+
limit=limit,
|
|
1605
|
+
require_enwiki=require_enwiki,
|
|
1606
|
+
skip_ids=existing_location_ids if skip_updates else None,
|
|
1607
|
+
start_index=start_index,
|
|
1608
|
+
progress_callback=location_progress_callback,
|
|
1609
|
+
):
|
|
1610
|
+
pbar.update(1)
|
|
1611
|
+
location_records.append(record)
|
|
1612
|
+
if len(location_records) >= batch_size:
|
|
1613
|
+
flush_location_batch()
|
|
1614
|
+
persist_new_labels()
|
|
1615
|
+
save_location_progress()
|
|
1616
|
+
else:
|
|
1617
|
+
# No limit - show counter updates
|
|
1618
|
+
for record in importer.import_locations(
|
|
1619
|
+
limit=None,
|
|
1620
|
+
require_enwiki=require_enwiki,
|
|
1621
|
+
skip_ids=existing_location_ids if skip_updates else None,
|
|
1622
|
+
start_index=start_index,
|
|
1623
|
+
progress_callback=location_progress_callback,
|
|
1624
|
+
):
|
|
1625
|
+
location_records.append(record)
|
|
1626
|
+
if len(location_records) >= batch_size:
|
|
1627
|
+
flush_location_batch()
|
|
1628
|
+
persist_new_labels()
|
|
1629
|
+
save_location_progress()
|
|
1630
|
+
click.echo(f"\r Progress: {locations_count:,} locations...", nl=False, err=True)
|
|
1631
|
+
sys.stderr.flush()
|
|
1632
|
+
|
|
1633
|
+
click.echo("", err=True) # Newline after counter
|
|
1634
|
+
|
|
1635
|
+
# Final batches
|
|
1636
|
+
flush_location_batch()
|
|
1637
|
+
persist_new_labels()
|
|
1638
|
+
save_location_progress()
|
|
1639
|
+
|
|
1640
|
+
finally:
|
|
1641
|
+
# Ensure we save progress even on interrupt
|
|
1642
|
+
save_location_progress()
|
|
1643
|
+
|
|
1644
|
+
click.echo(f"\nLocation import complete: {locations_count:,} locations", err=True)
|
|
1645
|
+
|
|
1646
|
+
# Final label resolution
|
|
1647
|
+
click.echo("\n=== Final QID Label Resolution ===", err=True)
|
|
1648
|
+
all_labels = importer.get_label_cache()
|
|
1649
|
+
click.echo(f" Total labels in cache: {len(all_labels):,}", err=True)
|
|
1650
|
+
|
|
1651
|
+
# Final stats
|
|
1652
|
+
final_label_count = database.get_qid_labels_count()
|
|
1653
|
+
click.echo(f" Total labels in DB: {final_label_count:,}", err=True)
|
|
1654
|
+
|
|
1655
|
+
locations_database.close()
|
|
1656
|
+
database.close()
|
|
1657
|
+
click.echo("\nWikidata dump import complete!", err=True)
|
|
1658
|
+
return
|
|
1005
1659
|
|
|
1006
|
-
|
|
1660
|
+
# Combined import - single pass through the dump for both people and orgs
|
|
1661
|
+
click.echo("\n=== Combined Import (single dump pass) ===", err=True)
|
|
1662
|
+
sys.stderr.flush() # Ensure output is visible immediately
|
|
1663
|
+
if people:
|
|
1664
|
+
click.echo(f" People: {'up to ' + str(limit) + ' records' if limit else 'unlimited'}", err=True)
|
|
1665
|
+
if skip_updates and existing_people_ids:
|
|
1666
|
+
click.echo(f" Skip updates: {len(existing_people_ids):,} existing Q codes", err=True)
|
|
1667
|
+
if orgs:
|
|
1668
|
+
click.echo(f" Orgs: {'up to ' + str(limit) + ' records' if limit else 'unlimited'}", err=True)
|
|
1669
|
+
if require_enwiki:
|
|
1670
|
+
click.echo(" Filter: only orgs with English Wikipedia articles", err=True)
|
|
1671
|
+
if skip_updates and existing_org_ids:
|
|
1672
|
+
click.echo(f" Skip updates: {len(existing_org_ids):,} existing Q codes", err=True)
|
|
1673
|
+
if start_index > 0:
|
|
1674
|
+
click.echo(f" Resuming from entity index {start_index:,}", err=True)
|
|
1675
|
+
|
|
1676
|
+
# Initialize databases
|
|
1677
|
+
person_database = get_person_database(db_path=db_path_obj)
|
|
1678
|
+
org_database = get_database(db_path=db_path_obj) if orgs else None
|
|
1679
|
+
|
|
1680
|
+
# Batches for each type
|
|
1681
|
+
people_records: list = []
|
|
1682
|
+
org_records: list = []
|
|
1683
|
+
people_count = 0
|
|
1684
|
+
orgs_count = 0
|
|
1685
|
+
last_entity_index = start_index
|
|
1686
|
+
last_entity_id = ""
|
|
1687
|
+
|
|
1688
|
+
def combined_progress_callback(entity_index: int, entity_id: str, ppl_count: int, org_count: int) -> None:
|
|
1689
|
+
nonlocal last_entity_index, last_entity_id
|
|
1690
|
+
last_entity_index = entity_index
|
|
1691
|
+
last_entity_id = entity_id
|
|
1692
|
+
|
|
1693
|
+
def save_progress() -> None:
|
|
1694
|
+
if progress:
|
|
1695
|
+
progress.entity_index = last_entity_index
|
|
1696
|
+
progress.last_entity_id = last_entity_id
|
|
1697
|
+
progress.people_yielded = people_count
|
|
1698
|
+
progress.orgs_yielded = orgs_count
|
|
1699
|
+
progress.save()
|
|
1700
|
+
|
|
1701
|
+
def flush_people_batch() -> None:
|
|
1702
|
+
nonlocal people_records, people_count
|
|
1703
|
+
if people_records:
|
|
1704
|
+
embedding_texts = [r.get_embedding_text() for r in people_records]
|
|
1705
|
+
embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(embedding_texts)
|
|
1706
|
+
person_database.insert_batch(people_records, embeddings, scalar_embeddings=scalar_embeddings)
|
|
1707
|
+
people_count += len(people_records)
|
|
1708
|
+
people_records = []
|
|
1709
|
+
|
|
1710
|
+
def flush_org_batch() -> None:
|
|
1711
|
+
nonlocal org_records, orgs_count
|
|
1712
|
+
if org_records and org_database:
|
|
1713
|
+
names = [r.name for r in org_records]
|
|
1714
|
+
embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(names)
|
|
1715
|
+
org_database.insert_batch(org_records, embeddings, scalar_embeddings=scalar_embeddings)
|
|
1716
|
+
orgs_count += len(org_records)
|
|
1717
|
+
org_records = []
|
|
1718
|
+
|
|
1719
|
+
# Calculate total for progress bar (if limits set for both)
|
|
1720
|
+
total_limit = None
|
|
1721
|
+
if limit and people and orgs:
|
|
1722
|
+
total_limit = limit * 2 # Rough estimate
|
|
1723
|
+
elif limit:
|
|
1724
|
+
total_limit = limit
|
|
1725
|
+
|
|
1726
|
+
click.echo("Starting dump iteration...", err=True)
|
|
1727
|
+
sys.stderr.flush()
|
|
1728
|
+
|
|
1729
|
+
records_seen = 0
|
|
1730
|
+
try:
|
|
1731
|
+
if total_limit:
|
|
1732
|
+
# Use progress bar when we have limits
|
|
1733
|
+
with click.progressbar(
|
|
1734
|
+
length=total_limit,
|
|
1735
|
+
label="Processing dump",
|
|
1736
|
+
show_percent=True,
|
|
1737
|
+
show_pos=True,
|
|
1738
|
+
) as pbar:
|
|
1739
|
+
for record_type, record in importer.import_all(
|
|
1740
|
+
people_limit=limit if people else 0,
|
|
1741
|
+
orgs_limit=limit if orgs else 0,
|
|
1742
|
+
import_people=people,
|
|
1743
|
+
import_orgs=orgs,
|
|
1744
|
+
require_enwiki=require_enwiki,
|
|
1745
|
+
skip_people_ids=existing_people_ids if skip_updates else None,
|
|
1746
|
+
skip_org_ids=existing_org_ids if skip_updates else None,
|
|
1747
|
+
start_index=start_index,
|
|
1748
|
+
progress_callback=combined_progress_callback,
|
|
1749
|
+
):
|
|
1750
|
+
records_seen += 1
|
|
1751
|
+
pbar.update(1)
|
|
1752
|
+
|
|
1753
|
+
if record_type == "person":
|
|
1754
|
+
people_records.append(record)
|
|
1755
|
+
if len(people_records) >= batch_size:
|
|
1756
|
+
flush_people_batch()
|
|
1757
|
+
persist_new_labels()
|
|
1758
|
+
save_progress()
|
|
1759
|
+
else: # org
|
|
1760
|
+
org_records.append(record)
|
|
1761
|
+
if len(org_records) >= batch_size:
|
|
1762
|
+
flush_org_batch()
|
|
1763
|
+
persist_new_labels()
|
|
1764
|
+
save_progress()
|
|
1765
|
+
else:
|
|
1766
|
+
# No limit - show counter updates
|
|
1767
|
+
for record_type, record in importer.import_all(
|
|
1768
|
+
people_limit=None,
|
|
1769
|
+
orgs_limit=None,
|
|
1770
|
+
import_people=people,
|
|
1771
|
+
import_orgs=orgs,
|
|
1772
|
+
require_enwiki=require_enwiki,
|
|
1773
|
+
skip_people_ids=existing_people_ids if skip_updates else None,
|
|
1774
|
+
skip_org_ids=existing_org_ids if skip_updates else None,
|
|
1775
|
+
start_index=start_index,
|
|
1776
|
+
progress_callback=combined_progress_callback,
|
|
1777
|
+
):
|
|
1778
|
+
records_seen += 1
|
|
1779
|
+
# Show first record immediately as proof of life
|
|
1780
|
+
if records_seen == 1:
|
|
1781
|
+
click.echo(f" First record found: {record.name}", err=True)
|
|
1782
|
+
sys.stderr.flush()
|
|
1783
|
+
|
|
1784
|
+
if record_type == "person":
|
|
1785
|
+
people_records.append(record)
|
|
1786
|
+
if len(people_records) >= batch_size:
|
|
1787
|
+
flush_people_batch()
|
|
1788
|
+
persist_new_labels()
|
|
1789
|
+
save_progress()
|
|
1790
|
+
click.echo(f"\r Progress: {people_count:,} people, {orgs_count:,} orgs...", nl=False, err=True)
|
|
1791
|
+
sys.stderr.flush()
|
|
1792
|
+
else: # org
|
|
1793
|
+
org_records.append(record)
|
|
1794
|
+
if len(org_records) >= batch_size:
|
|
1795
|
+
flush_org_batch()
|
|
1796
|
+
persist_new_labels()
|
|
1797
|
+
save_progress()
|
|
1798
|
+
click.echo(f"\r Progress: {people_count:,} people, {orgs_count:,} orgs...", nl=False, err=True)
|
|
1799
|
+
sys.stderr.flush()
|
|
1800
|
+
|
|
1801
|
+
click.echo("", err=True) # Newline after counter
|
|
1802
|
+
|
|
1803
|
+
# Final batches
|
|
1804
|
+
flush_people_batch()
|
|
1805
|
+
flush_org_batch()
|
|
1806
|
+
persist_new_labels()
|
|
1807
|
+
save_progress()
|
|
1808
|
+
|
|
1809
|
+
finally:
|
|
1810
|
+
# Ensure we save progress even on interrupt
|
|
1811
|
+
save_progress()
|
|
1812
|
+
|
|
1813
|
+
click.echo(f"Import complete: {people_count:,} people, {orgs_count:,} orgs", err=True)
|
|
1814
|
+
|
|
1815
|
+
# Keep references for final label resolution
|
|
1816
|
+
database = person_database
|
|
1817
|
+
if org_database:
|
|
1818
|
+
org_database.close()
|
|
1819
|
+
|
|
1820
|
+
# Final label resolution pass for any remaining unresolved QIDs
|
|
1821
|
+
click.echo("\n=== Final QID Label Resolution ===", err=True)
|
|
1822
|
+
|
|
1823
|
+
# Get the full label cache (includes labels from DB + new ones from import)
|
|
1824
|
+
all_labels = importer.get_label_cache()
|
|
1825
|
+
click.echo(f" Total labels in cache: {len(all_labels):,}", err=True)
|
|
1826
|
+
|
|
1827
|
+
# Check for any remaining unresolved QIDs in the database
|
|
1828
|
+
people_unresolved = database.get_unresolved_qids()
|
|
1829
|
+
click.echo(f" Unresolved QIDs in people: {len(people_unresolved):,}", err=True)
|
|
1830
|
+
|
|
1831
|
+
org_unresolved: set[str] = set()
|
|
1832
|
+
if orgs:
|
|
1833
|
+
org_database = get_database(db_path=db_path_obj)
|
|
1834
|
+
org_unresolved = org_database.get_unresolved_qids()
|
|
1835
|
+
click.echo(f" Unresolved QIDs in orgs: {len(org_unresolved):,}", err=True)
|
|
1836
|
+
|
|
1837
|
+
all_unresolved = people_unresolved | org_unresolved
|
|
1838
|
+
need_sparql = all_unresolved - set(all_labels.keys())
|
|
1839
|
+
|
|
1840
|
+
if need_sparql:
|
|
1841
|
+
click.echo(f" Resolving {len(need_sparql):,} remaining QIDs via SPARQL...", err=True)
|
|
1842
|
+
sparql_resolved = importer.resolve_qids_via_sparql(need_sparql)
|
|
1843
|
+
all_labels.update(sparql_resolved)
|
|
1844
|
+
# Persist newly resolved labels
|
|
1845
|
+
if sparql_resolved:
|
|
1846
|
+
database.insert_qid_labels(sparql_resolved)
|
|
1847
|
+
click.echo(f" SPARQL resolved and stored: {len(sparql_resolved):,}", err=True)
|
|
1848
|
+
|
|
1849
|
+
# Update records with any newly resolved labels
|
|
1850
|
+
if all_labels:
|
|
1851
|
+
updates, deletes = database.resolve_qid_labels(all_labels)
|
|
1852
|
+
if updates or deletes:
|
|
1853
|
+
click.echo(f" People: {updates:,} updated, {deletes:,} duplicates deleted", err=True)
|
|
1854
|
+
|
|
1855
|
+
if orgs:
|
|
1856
|
+
org_database = get_database(db_path=db_path_obj)
|
|
1857
|
+
org_updates, org_deletes = org_database.resolve_qid_labels(all_labels)
|
|
1858
|
+
if org_updates or org_deletes:
|
|
1859
|
+
click.echo(f" Orgs: {org_updates:,} updated, {org_deletes:,} duplicates deleted", err=True)
|
|
1860
|
+
org_database.close()
|
|
1861
|
+
|
|
1862
|
+
# Final stats
|
|
1863
|
+
final_label_count = database.get_qid_labels_count()
|
|
1864
|
+
click.echo(f" Total labels in DB: {final_label_count:,}", err=True)
|
|
1007
1865
|
database.close()
|
|
1008
1866
|
|
|
1867
|
+
click.echo("\nWikidata dump import complete!", err=True)
|
|
1868
|
+
|
|
1009
1869
|
|
|
1010
1870
|
@db_cmd.command("search-people")
|
|
1011
1871
|
@click.argument("query")
|
|
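The import-wikidata-dump command above is the largest addition and is heavily garbled in this view. Its core loop streams the dump once for both people and orgs, flushes each batch, and checkpoints a DumpProgress record so --resume can pick up mid-file. Condensed from the diff; the flush helpers and combined_progress_callback are defined inside the command:

    progress = DumpProgress.load() if resume else None
    start_index = progress.entity_index if progress and progress.matches_dump(dump_file) else 0
    for record_type, record in importer.import_all(
        import_people=people, import_orgs=orgs,
        require_enwiki=require_enwiki, start_index=start_index,
        progress_callback=combined_progress_callback,
    ):
        if record_type == "person":
            people_records.append(record)
            if len(people_records) >= batch_size:
                flush_people_batch()   # embed_batch_and_quantize + insert_batch
                persist_new_labels()   # store newly resolved QID labels
                save_progress()        # checkpoint entity index for --resume
        else:
            org_records.append(record)
            if len(org_records) >= batch_size:
                flush_org_batch()
                persist_new_labels()
                save_progress()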
@@ -1134,8 +1994,8 @@ def db_import_companies_house(

         if len(records) >= batch_size:
             names = [r.name for r in records]
-            embeddings = embedder.
-            database.insert_batch(records, embeddings)
+            embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(names)
+            database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
             count += len(records)
             click.echo(f"Imported {count} records...", err=True)
             records = []
@@ -1143,8 +2003,8 @@ def db_import_companies_house(
|
|
|
1143
2003
|
# Final batch
|
|
1144
2004
|
if records:
|
|
1145
2005
|
names = [r.name for r in records]
|
|
1146
|
-
embeddings = embedder.
|
|
1147
|
-
database.insert_batch(records, embeddings)
|
|
2006
|
+
embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(names)
|
|
2007
|
+
database.insert_batch(records, embeddings, scalar_embeddings=scalar_embeddings)
|
|
1148
2008
|
count += len(records)
|
|
1149
2009
|
|
|
1150
2010
|
click.echo(f"\nImported {count} Companies House records successfully.", err=True)
|
|
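The import now calls embed_batch_and_quantize() and stores both float32 and int8 vectors in one pass. That method's body is not part of this file; judging from the call sites here and from the np.clip/np.round quantization used by the backfill-scalar command later in this diff, a minimal sketch could look like the following (the underlying embed_batch() call it delegates to is an assumption):

    import numpy as np

    def embed_batch_and_quantize(self, names, batch_size=64):
        """Sketch only: return (float32, int8) embeddings for a batch of names."""
        # batch_size is accepted for parity with the CLI call sites
        # assumed underlying float32 encoder call; the real method name may differ
        fp32 = np.asarray(self.embed_batch(names), dtype=np.float32)
        # symmetric scalar quantization, mirroring the backfill-scalar command below
        int8 = np.clip(np.round(fp32 * 127), -127, 127).astype(np.int8)
        return fp32, int8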
@@ -1163,6 +2023,7 @@ def db_status(db_path: Optional[str]):
         corp-extractor db status --db /path/to/entities.db
     """
     from .database import OrganizationDatabase
+    from .database.store import get_person_database

     try:
         database = OrganizationDatabase(db_path=db_path)
@@ -1180,17 +2041,119 @@ def db_status(db_path: Optional[str]):
             click.echo(f"\n⚠️ Missing embeddings: {missing_embeddings:,}")
             click.echo(" Run 'corp-extractor db repair-embeddings' to fix")

+        # Show embedding counts (float32 and scalar)
+        org_fp32 = database.get_float32_embedding_count()
+        org_int8 = database.get_scalar_embedding_count()
+        click.echo(f"\nOrganization embeddings:")
+        click.echo(f" float32: {org_fp32:,}")
+        click.echo(f" int8 (scalar): {org_int8:,}")
+        if org_fp32 > 0 and org_int8 < org_fp32:
+            click.echo(f" ⚠️ {org_fp32 - org_int8:,} missing scalar embeddings")
+            click.echo(" Run 'corp-extractor db backfill-scalar' to generate")
+
+        # Person embeddings
+        person_db = get_person_database(db_path=db_path)
+        person_fp32 = person_db.get_float32_embedding_count()
+        person_int8 = person_db.get_scalar_embedding_count()
+        if person_fp32 > 0:
+            click.echo(f"\nPerson embeddings:")
+            click.echo(f" float32: {person_fp32:,}")
+            click.echo(f" int8 (scalar): {person_int8:,}")
+            if person_int8 < person_fp32:
+                click.echo(f" ⚠️ {person_fp32 - person_int8:,} missing scalar embeddings")
+
         if stats.by_source:
             click.echo("\nRecords by source:")
             for source, count in stats.by_source.items():
                 click.echo(f" {source}: {count:,}")

+        # Show canonicalization stats
+        canon_stats = database.get_canon_stats()
+        if canon_stats["canonicalized_records"] > 0:
+            click.echo("\nCanonicalization:")
+            click.echo(f" Canonicalized: {canon_stats['canonicalized_records']:,} / {canon_stats['total_records']:,}")
+            click.echo(f" Canonical groups: {canon_stats['canonical_groups']:,}")
+            click.echo(f" Multi-record groups: {canon_stats['multi_record_groups']:,}")
+            click.echo(f" Records in multi-groups: {canon_stats['records_in_multi_groups']:,}")
+        else:
+            click.echo("\nCanonicalization: Not run yet")
+            click.echo(" Run 'corp-extractor db canonicalize' to link equivalent records")
+
         database.close()

     except Exception as e:
         raise click.ClickException(f"Failed to read database: {e}")


+@db_cmd.command("canonicalize")
+@click.option("--db", "db_path", type=click.Path(), help="Database path")
+@click.option("--batch-size", type=int, default=10000, help="Batch size for updates (default: 10000)")
+@click.option("-v", "--verbose", is_flag=True, help="Verbose output")
+def db_canonicalize(db_path: Optional[str], batch_size: int, verbose: bool):
+    """
+    Canonicalize organizations by linking equivalent records across sources.
+
+    Records are considered equivalent if they share:
+    - Same LEI (globally unique legal entity identifier)
+    - Same ticker symbol
+    - Same CIK (SEC identifier)
+    - Same normalized name (after lowercasing, removing dots)
+    - Same name with suffix expansion (Ltd -> Limited, etc.)
+
+    For each group, the highest-priority source becomes canonical:
+    gleif > sec_edgar > companies_house > wikipedia
+
+    Canonicalization enables better search re-ranking by boosting results
+    that have records from multiple authoritative sources.
+
+    \b
+    Examples:
+        corp-extractor db canonicalize
+        corp-extractor db canonicalize -v
+        corp-extractor db canonicalize --db /path/to/entities.db
+    """
+    _configure_logging(verbose)
+
+    from .database import OrganizationDatabase
+    from .database.store import get_person_database
+
+    try:
+        # Canonicalize organizations
+        database = OrganizationDatabase(db_path=db_path)
+        click.echo("Running organization canonicalization...", err=True)
+
+        result = database.canonicalize(batch_size=batch_size)
+
+        click.echo("\nOrganization Canonicalization Results")
+        click.echo("=" * 40)
+        click.echo(f"Total records processed: {result['total_records']:,}")
+        click.echo(f"Equivalence groups found: {result['groups_found']:,}")
+        click.echo(f"Multi-record groups: {result['multi_record_groups']:,}")
+        click.echo(f"Records updated: {result['records_updated']:,}")
+
+        database.close()
+
+        # Canonicalize people
+        db_path_obj = Path(db_path) if db_path else None
+        person_db = get_person_database(db_path=db_path_obj)
+        click.echo("\nRunning people canonicalization...", err=True)
+
+        people_result = person_db.canonicalize(batch_size=batch_size)
+
+        click.echo("\nPeople Canonicalization Results")
+        click.echo("=" * 40)
+        click.echo(f"Total records processed: {people_result['total_records']:,}")
+        click.echo(f"Matched by organization: {people_result['matched_by_org']:,}")
+        click.echo(f"Matched by date overlap: {people_result['matched_by_date']:,}")
+        click.echo(f"Canonical groups: {people_result['canonical_groups']:,}")
+        click.echo(f"Records in multi-record groups: {people_result['records_in_groups']:,}")
+
+        person_db.close()
+
+    except Exception as e:
+        raise click.ClickException(f"Canonicalization failed: {e}")
+
+
 @db_cmd.command("search")
 @click.argument("query")
 @click.option("--db", "db_path", type=click.Path(), help="Database path")
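On the new canonicalize command: its docstring defines equivalence purely in terms of shared identifiers plus a fixed source-priority order. A toy illustration of that rule, grouping by LEI only since it is the strongest key (the real store merges groups across all of the listed keys, and the record fields here are made up for the example):

    from collections import defaultdict

    SOURCE_PRIORITY = ["gleif", "sec_edgar", "companies_house", "wikipedia"]

    def pick_canonical(group):
        """Highest-priority source wins; unknown sources sort last."""
        def rank(rec):
            src = rec["source"]
            return SOURCE_PRIORITY.index(src) if src in SOURCE_PRIORITY else len(SOURCE_PRIORITY)
        return min(group, key=rank)

    records = [
        {"name": "Acme Ltd", "lei": "529900EXAMPLE000001", "source": "companies_house"},
        {"name": "Acme Limited", "lei": "529900EXAMPLE000001", "source": "gleif"},
    ]
    groups = defaultdict(list)
    for rec in records:
        if rec["lei"]:                      # LEI is globally unique, so it links records directly
            groups[rec["lei"]].append(rec)
    for group in groups.values():
        print(pick_canonical(group)["source"])   # -> gleif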
@@ -1247,10 +2210,9 @@ def db_search(query: str, db_path: Optional[str], top_k: int, source: Optional[s
 @click.option("--repo", type=str, default="Corp-o-Rate-Community/entity-references", help="HuggingFace repo ID")
 @click.option("--db", "db_path", type=click.Path(), help="Output path for database")
 @click.option("--full", is_flag=True, help="Download full version (larger, includes record metadata)")
-@click.option("--no-compress", is_flag=True, help="Download uncompressed version (slower)")
 @click.option("--force", is_flag=True, help="Force re-download")
 @click.option("-v", "--verbose", is_flag=True, help="Verbose output")
-def db_download(repo: str, db_path: Optional[str], full: bool,
+def db_download(repo: str, db_path: Optional[str], full: bool, force: bool, verbose: bool):
     """
     Download entity database from HuggingFace Hub.

@@ -1274,7 +2236,6 @@ def db_download(repo: str, db_path: Optional[str], full: bool, no_compress: bool
             repo_id=repo,
             filename=filename,
             force_download=force,
-            prefer_compressed=not no_compress,
         )
         click.echo(f"Database downloaded to: {path}")
     except Exception as e:
@@ -1286,27 +2247,23 @@ def db_upload(db_path: Optional[str], repo: str, message: str, no_lite: bool, no
 @click.option("--repo", type=str, default="Corp-o-Rate-Community/entity-references", help="HuggingFace repo ID")
 @click.option("--message", type=str, default="Update entity database", help="Commit message")
 @click.option("--no-lite", is_flag=True, help="Skip creating lite version (without record data)")
-@click.option("--no-compress", is_flag=True, help="Skip creating compressed versions")
 @click.option("-v", "--verbose", is_flag=True, help="Verbose output")
-def db_upload(db_path: Optional[str], repo: str, message: str, no_lite: bool,
+def db_upload(db_path: Optional[str], repo: str, message: str, no_lite: bool, verbose: bool):
     """
-    Upload entity database to HuggingFace Hub
+    Upload entity database to HuggingFace Hub.

-
-
-    By default uploads:
+    First VACUUMs the database, then creates and uploads:
     - entities.db (full database)
     - entities-lite.db (without record data, smaller)
-    - entities.db.gz (compressed full)
-    - entities-lite.db.gz (compressed lite)

+    If no path is provided, uploads from the default cache location.
     Requires HF_TOKEN environment variable to be set.

     \b
     Examples:
         corp-extractor db upload
         corp-extractor db upload /path/to/entities.db
-        corp-extractor db upload --no-lite
+        corp-extractor db upload --no-lite
         corp-extractor db upload --repo my-org/my-entity-db
     """
     _configure_logging(verbose)
@@ -1322,10 +2279,9 @@ def db_upload(db_path: Optional[str], repo: str, message: str, no_lite: bool, no
     )

     click.echo(f"Uploading {db_path} to {repo}...", err=True)
+    click.echo(" - Running VACUUM to optimize database", err=True)
     if not no_lite:
         click.echo(" - Creating lite version (without record data)", err=True)
-    if not no_compress:
-        click.echo(" - Creating compressed versions", err=True)

     try:
         results = upload_database_with_variants(
@@ -1333,7 +2289,6 @@ def db_upload(db_path: Optional[str], repo: str, message: str, no_lite: bool, no
             repo_id=repo,
             commit_message=message,
             include_lite=not no_lite,
-            include_compressed=not no_compress,
         )
         click.echo(f"\nUploaded {len(results)} file(s) successfully:")
         for filename, url in results.items():
@@ -1371,31 +2326,6 @@ def db_create_lite(db_path: str, output: Optional[str], verbose: bool):
         raise click.ClickException(f"Failed to create lite database: {e}")


-@db_cmd.command("compress")
-@click.argument("db_path", type=click.Path(exists=True))
-@click.option("-o", "--output", type=click.Path(), help="Output path (default: adds .gz suffix)")
-@click.option("-v", "--verbose", is_flag=True, help="Verbose output")
-def db_compress(db_path: str, output: Optional[str], verbose: bool):
-    """
-    Compress a database file using gzip.
-
-    \b
-    Examples:
-        corp-extractor db compress entities.db
-        corp-extractor db compress entities.db -o entities.db.gz
-    """
-    _configure_logging(verbose)
-    from .database.hub import compress_database
-
-    click.echo(f"Compressing {db_path}...", err=True)
-
-    try:
-        compressed_path = compress_database(db_path, output)
-        click.echo(f"Compressed database created: {compressed_path}")
-    except Exception as e:
-        raise click.ClickException(f"Compression failed: {e}")
-
-
 @db_cmd.command("repair-embeddings")
 @click.option("--db", "db_path", type=click.Path(), help="Database path")
 @click.option("--batch-size", type=int, default=1000, help="Batch size for embedding generation (default: 1000)")
@@ -1441,9 +2371,9 @@ def db_repair_embeddings(db_path: Optional[str], batch_size: int, source: Option
             names.append(name)

         if len(names) >= batch_size:
-            # Generate embeddings
-            embeddings = embedder.
-            database.
+            # Generate both float32 and int8 embeddings
+            embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(names)
+            database.insert_both_embeddings_batch(org_ids, embeddings, scalar_embeddings)
             count += len(names)
             click.echo(f"Repaired {count:,} / {missing_count:,} embeddings...", err=True)
             org_ids = []
@@ -1451,14 +2381,161 @@ def db_repair_embeddings(db_path: Optional[str], batch_size: int, source: Option

     # Final batch
     if names:
-        embeddings = embedder.
-        database.
+        embeddings, scalar_embeddings = embedder.embed_batch_and_quantize(names)
+        database.insert_both_embeddings_batch(org_ids, embeddings, scalar_embeddings)
         count += len(names)

     click.echo(f"\nRepaired {count:,} embeddings successfully.", err=True)
     database.close()


+@db_cmd.command("backfill-scalar")
+@click.option("--db", "db_path", type=click.Path(), help="Database path")
+@click.option("--batch-size", type=int, default=10000, help="Batch size for processing (default: 10000)")
+@click.option("--embed-batch-size", type=int, default=64, help="Batch size for embedding generation (default: 64)")
+@click.option("--skip-generate", is_flag=True, help="Skip generating missing float32 embeddings (only quantize existing)")
+@click.option("-v", "--verbose", is_flag=True, help="Verbose output")
+def db_backfill_scalar(db_path: Optional[str], batch_size: int, embed_batch_size: int, skip_generate: bool, verbose: bool):
+    """
+    Backfill scalar (int8) embeddings for the entity database.
+
+    This command handles two cases:
+    1. Records with float32 but missing scalar → quantize existing
+    2. Records missing both embeddings → generate both from scratch
+
+    Scalar embeddings provide 75% storage reduction with ~92% recall at top-100.
+
+    \b
+    Examples:
+        corp-extractor db backfill-scalar
+        corp-extractor db backfill-scalar --batch-size 5000 -v
+        corp-extractor db backfill-scalar --skip-generate # Only quantize existing
+    """
+    _configure_logging(verbose)
+    import numpy as np
+
+    from .database import OrganizationDatabase, CompanyEmbedder
+    from .database.store import get_person_database
+
+    embedder = None  # Lazy load only if needed
+
+    # Process organizations
+    org_db = OrganizationDatabase(db_path=db_path)
+
+    # Phase 1: Quantize existing float32 embeddings to scalar
+    org_quantized = 0
+    click.echo("Phase 1: Quantizing existing float32 embeddings to scalar...", err=True)
+    for batch_ids in org_db.get_missing_scalar_embedding_ids(batch_size=batch_size):
+        fp32_map = org_db.get_embeddings_by_ids(batch_ids)
+        if not fp32_map:
+            continue
+
+        ids = list(fp32_map.keys())
+        int8_embeddings = np.array([
+            np.clip(np.round(fp32_map[i] * 127), -127, 127).astype(np.int8)
+            for i in ids
+        ])
+
+        org_db.insert_scalar_embeddings_batch(ids, int8_embeddings)
+        org_quantized += len(ids)
+        click.echo(f" Quantized {org_quantized:,} organization embeddings...", err=True)
+
+    click.echo(f"Quantized {org_quantized:,} organization embeddings.", err=True)
+
+    # Phase 2: Generate embeddings for records missing both
+    org_generated = 0
+    if not skip_generate:
+        click.echo("\nPhase 2: Generating embeddings for organizations missing both...", err=True)
+
+        for batch in org_db.get_missing_all_embedding_ids(batch_size=batch_size):
+            if not batch:
+                continue
+
+            # Lazy load embedder
+            if embedder is None:
+                click.echo(" Loading embedding model...", err=True)
+                embedder = CompanyEmbedder()
+
+            # Process in smaller batches for embedding generation
+            for i in range(0, len(batch), embed_batch_size):
+                sub_batch = batch[i:i + embed_batch_size]
+                ids = [item[0] for item in sub_batch]
+                names = [item[1] for item in sub_batch]
+
+                # Generate both float32 and int8 embeddings
+                fp32_batch, int8_batch = embedder.embed_batch_and_quantize(names, batch_size=embed_batch_size)
+
+                # Insert both
+                org_db.insert_both_embeddings_batch(ids, fp32_batch, int8_batch)
+                org_generated += len(ids)
+
+                if org_generated % 10000 == 0:
+                    click.echo(f" Generated {org_generated:,} organization embeddings...", err=True)
+
+        click.echo(f"Generated {org_generated:,} organization embeddings.", err=True)
+
+    # Process people
+    person_db = get_person_database(db_path=db_path)
+
+    # Phase 1: Quantize existing float32 embeddings to scalar
+    person_quantized = 0
+    click.echo("\nPhase 1: Quantizing existing float32 person embeddings to scalar...", err=True)
+    for batch_ids in person_db.get_missing_scalar_embedding_ids(batch_size=batch_size):
+        fp32_map = person_db.get_embeddings_by_ids(batch_ids)
+        if not fp32_map:
+            continue
+
+        ids = list(fp32_map.keys())
+        int8_embeddings = np.array([
+            np.clip(np.round(fp32_map[i] * 127), -127, 127).astype(np.int8)
+            for i in ids
+        ])
+
+        person_db.insert_scalar_embeddings_batch(ids, int8_embeddings)
+        person_quantized += len(ids)
+        click.echo(f" Quantized {person_quantized:,} person embeddings...", err=True)
+
+    click.echo(f"Quantized {person_quantized:,} person embeddings.", err=True)
+
+    # Phase 2: Generate embeddings for records missing both
+    person_generated = 0
+    if not skip_generate:
+        click.echo("\nPhase 2: Generating embeddings for people missing both...", err=True)
+
+        for batch in person_db.get_missing_all_embedding_ids(batch_size=batch_size):
+            if not batch:
+                continue
+
+            # Lazy load embedder
+            if embedder is None:
+                click.echo(" Loading embedding model...", err=True)
+                embedder = CompanyEmbedder()
+
+            # Process in smaller batches for embedding generation
+            for i in range(0, len(batch), embed_batch_size):
+                sub_batch = batch[i:i + embed_batch_size]
+                ids = [item[0] for item in sub_batch]
+                names = [item[1] for item in sub_batch]
+
+                # Generate both float32 and int8 embeddings
+                fp32_batch, int8_batch = embedder.embed_batch_and_quantize(names, batch_size=embed_batch_size)
+
+                # Insert both
+                person_db.insert_both_embeddings_batch(ids, fp32_batch, int8_batch)
+                person_generated += len(ids)
+
+                if person_generated % 10000 == 0:
+                    click.echo(f" Generated {person_generated:,} person embeddings...", err=True)
+
+        click.echo(f"Generated {person_generated:,} person embeddings.", err=True)
+
+    # Summary
+    click.echo(f"\nSummary:", err=True)
+    click.echo(f" Organizations: {org_quantized:,} quantized, {org_generated:,} generated", err=True)
+    click.echo(f" People: {person_quantized:,} quantized, {person_generated:,} generated", err=True)
+    click.echo(f" Total: {org_quantized + org_generated + person_quantized + person_generated:,} embeddings processed", err=True)
+
+
 @db_cmd.command("migrate")
 @click.argument("db_path", type=click.Path(exists=True))
 @click.option("--rename-file", is_flag=True, help="Also rename companies.db to entities.db")
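The arithmetic behind the backfill-scalar figures: int8 stores one byte per dimension versus four for float32, which is exactly the 75% reduction the docstring quotes, and the clip/round-by-127 scheme assumes unit-normalized embeddings so every component already lies in [-1, 1]. A self-contained round-trip check of that quantization (the 384-dimension size is only an example, not the package's actual embedding width):

    import numpy as np

    rng = np.random.default_rng(0)
    vec = rng.normal(size=384).astype(np.float32)
    vec /= np.linalg.norm(vec)                    # unit-normalized: components within [-1, 1]

    int8 = np.clip(np.round(vec * 127), -127, 127).astype(np.int8)   # same formula as the command above
    approx = int8.astype(np.float32) / 127.0                         # dequantize for scoring

    print(f"storage: {vec.nbytes} B -> {int8.nbytes} B")             # 1536 B -> 384 B (75% smaller)
    cos = float(np.dot(vec, approx) / (np.linalg.norm(vec) * np.linalg.norm(approx)))
    print(f"cosine(original, dequantized) = {cos:.4f}")              # close to 1.0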
@@ -1520,6 +2597,145 @@ def db_migrate(db_path: str, rename_file: bool, yes: bool, verbose: bool):
         raise click.ClickException(f"Migration failed: {e}")


+@db_cmd.command("migrate-v2")
+@click.argument("source_db", type=click.Path(exists=True))
+@click.argument("target_db", type=click.Path())
+@click.option("-v", "--verbose", is_flag=True, help="Verbose output")
+@click.option("--resume", is_flag=True, help="Resume from last completed step")
+def db_migrate_v2(source_db: str, target_db: str, verbose: bool, resume: bool):
+    """
+    Migrate database from v1 schema to v2 normalized schema.
+
+    Creates a NEW database file with the v2 normalized schema.
+    The original database is preserved unchanged.
+
+    Use --resume to continue a migration that was interrupted.
+
+    \b
+    V2 changes:
+    - TEXT enum fields replaced with INTEGER foreign keys
+    - New enum lookup tables (source_types, people_types, etc.)
+    - New roles and locations tables
+    - QIDs stored as integers (Q prefix stripped)
+    - Human-readable views for queries
+
+    \b
+    Examples:
+        corp-extractor db migrate-v2 entities.db entities-v2.db
+        corp-extractor db migrate-v2 entities.db entities-v2.db --resume
+        corp-extractor db migrate-v2 ~/.cache/corp-extractor/entities.db ./entities-v2.db -v
+    """
+    _configure_logging(verbose)
+
+    from pathlib import Path
+    from .database.migrate_v2 import migrate_database
+
+    source_path = Path(source_db)
+    target_path = Path(target_db)
+
+    if target_path.exists() and not resume:
+        raise click.ClickException(
+            f"Target database already exists: {target_path}\n"
+            "Use --resume to continue an interrupted migration."
+        )
+
+    if resume:
+        click.echo(f"Resuming migration from {source_path} to {target_path}...")
+    else:
+        click.echo(f"Migrating {source_path} to {target_path}...")
+
+    try:
+        stats = migrate_database(source_path, target_path, resume=resume)
+
+        click.echo("\nMigration complete:")
+        for key, value in stats.items():
+            click.echo(f" {key}: {value:,}")
+
+    except Exception as e:
+        raise click.ClickException(f"Migration failed: {e}")
+
+
+@db_cmd.command("search-roles")
+@click.argument("query")
+@click.option("--db", "db_path", type=click.Path(), help="Database path")
+@click.option("--limit", default=10, help="Maximum results to return")
+def db_search_roles(query: str, db_path: Optional[str], limit: int):
+    """
+    Search for roles by name.
+
+    \b
+    Examples:
+        corp-extractor db search-roles "CEO"
+        corp-extractor db search-roles "Chief Executive" --limit 5
+    """
+    from .database.store import get_roles_database
+
+    roles_db = get_roles_database(db_path)
+    results = roles_db.search(query, top_k=limit)
+
+    if not results:
+        click.echo(f"No roles found matching '{query}'")
+        return
+
+    click.echo(f"Found {len(results)} role(s) matching '{query}':")
+    for role_id, name, score in results:
+        click.echo(f" [{role_id}] {name} (score: {score:.2f})")
+
+
+@db_cmd.command("search-locations")
+@click.argument("query")
+@click.option("--db", "db_path", type=click.Path(), help="Database path")
+@click.option("--type", "location_type", type=str, help="Filter by simplified type (country, city, etc.)")
+@click.option("--limit", default=10, help="Maximum results to return")
+def db_search_locations(query: str, db_path: Optional[str], location_type: Optional[str], limit: int):
+    """
+    Search for locations by name.
+
+    \b
+    Examples:
+        corp-extractor db search-locations "California"
+        corp-extractor db search-locations "Paris" --type city
+        corp-extractor db search-locations "Germany" --type country
+    """
+    from .database.store import get_locations_database
+
+    locations_db = get_locations_database(db_path)
+    results = locations_db.search(query, top_k=limit, simplified_type=location_type)
+
+    if not results:
+        click.echo(f"No locations found matching '{query}'")
+        return
+
+    click.echo(f"Found {len(results)} location(s) matching '{query}':")
+    for loc_id, name, score in results:
+        click.echo(f" [{loc_id}] {name} (score: {score:.2f})")
+
+
+@db_cmd.command("import-locations")
+@click.option("--from-pycountry", is_flag=True, help="Import countries from pycountry")
+@click.option("--db", "db_path", type=click.Path(), help="Database path")
+@click.option("-v", "--verbose", is_flag=True, help="Verbose output")
+def db_import_locations(from_pycountry: bool, db_path: Optional[str], verbose: bool):
+    """
+    Import locations into the database.
+
+    \b
+    Examples:
+        corp-extractor db import-locations --from-pycountry
+    """
+    _configure_logging(verbose)
+
+    if not from_pycountry:
+        raise click.UsageError("Must specify --from-pycountry")
+
+    from .database.store import get_locations_database
+
+    locations_db = get_locations_database(db_path)
+    count = locations_db.import_from_pycountry()
+
+    click.echo(f"Imported {count:,} locations from pycountry")
+
+
 # =============================================================================
 # Document commands
 # =============================================================================
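One v2 schema detail worth spelling out from the migrate-v2 docstring: QIDs are stored as plain integers with the Q prefix stripped. A pair of conversion helpers along those lines (the names are illustrative; the actual migration logic lives in database/migrate_v2.py):

    def qid_to_int(qid: str) -> int:
        """'Q42' -> 42; rejects malformed identifiers."""
        if not qid or qid[0] not in ("Q", "q") or not qid[1:].isdigit():
            raise ValueError(f"not a Wikidata QID: {qid!r}")
        return int(qid[1:])

    def int_to_qid(value: int) -> str:
        """42 -> 'Q42'."""
        return f"Q{value}"

    assert qid_to_int("Q42") == 42
    assert int_to_qid(qid_to_int("Q5")) == "Q5"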