corp-extractor 0.5.0__py3-none-any.whl → 0.9.3__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Files changed (67)
  1. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/METADATA +228 -30
  2. corp_extractor-0.9.3.dist-info/RECORD +79 -0
  3. statement_extractor/__init__.py +1 -1
  4. statement_extractor/cli.py +2030 -24
  5. statement_extractor/data/statement_taxonomy.json +6949 -1159
  6. statement_extractor/database/__init__.py +52 -0
  7. statement_extractor/database/embeddings.py +186 -0
  8. statement_extractor/database/hub.py +428 -0
  9. statement_extractor/database/importers/__init__.py +32 -0
  10. statement_extractor/database/importers/companies_house.py +559 -0
  11. statement_extractor/database/importers/companies_house_officers.py +431 -0
  12. statement_extractor/database/importers/gleif.py +561 -0
  13. statement_extractor/database/importers/sec_edgar.py +392 -0
  14. statement_extractor/database/importers/sec_form4.py +512 -0
  15. statement_extractor/database/importers/wikidata.py +1120 -0
  16. statement_extractor/database/importers/wikidata_dump.py +1951 -0
  17. statement_extractor/database/importers/wikidata_people.py +1130 -0
  18. statement_extractor/database/models.py +254 -0
  19. statement_extractor/database/resolver.py +245 -0
  20. statement_extractor/database/store.py +3034 -0
  21. statement_extractor/document/__init__.py +62 -0
  22. statement_extractor/document/chunker.py +410 -0
  23. statement_extractor/document/context.py +171 -0
  24. statement_extractor/document/deduplicator.py +171 -0
  25. statement_extractor/document/html_extractor.py +246 -0
  26. statement_extractor/document/loader.py +303 -0
  27. statement_extractor/document/pipeline.py +388 -0
  28. statement_extractor/document/summarizer.py +195 -0
  29. statement_extractor/extractor.py +1 -1
  30. statement_extractor/models/__init__.py +19 -3
  31. statement_extractor/models/canonical.py +44 -1
  32. statement_extractor/models/document.py +308 -0
  33. statement_extractor/models/labels.py +47 -18
  34. statement_extractor/models/qualifiers.py +51 -3
  35. statement_extractor/models/statement.py +39 -15
  36. statement_extractor/models.py +1 -1
  37. statement_extractor/pipeline/config.py +6 -11
  38. statement_extractor/pipeline/context.py +5 -5
  39. statement_extractor/pipeline/orchestrator.py +90 -121
  40. statement_extractor/pipeline/registry.py +52 -46
  41. statement_extractor/plugins/__init__.py +20 -8
  42. statement_extractor/plugins/base.py +348 -78
  43. statement_extractor/plugins/extractors/gliner2.py +38 -28
  44. statement_extractor/plugins/labelers/taxonomy.py +18 -5
  45. statement_extractor/plugins/labelers/taxonomy_embedding.py +17 -6
  46. statement_extractor/plugins/pdf/__init__.py +10 -0
  47. statement_extractor/plugins/pdf/pypdf.py +291 -0
  48. statement_extractor/plugins/qualifiers/__init__.py +11 -0
  49. statement_extractor/plugins/qualifiers/companies_house.py +14 -3
  50. statement_extractor/plugins/qualifiers/embedding_company.py +422 -0
  51. statement_extractor/plugins/qualifiers/gleif.py +14 -3
  52. statement_extractor/plugins/qualifiers/person.py +588 -14
  53. statement_extractor/plugins/qualifiers/sec_edgar.py +14 -3
  54. statement_extractor/plugins/scrapers/__init__.py +10 -0
  55. statement_extractor/plugins/scrapers/http.py +236 -0
  56. statement_extractor/plugins/splitters/t5_gemma.py +176 -75
  57. statement_extractor/plugins/taxonomy/embedding.py +193 -46
  58. statement_extractor/plugins/taxonomy/mnli.py +16 -4
  59. statement_extractor/scoring.py +8 -8
  60. corp_extractor-0.5.0.dist-info/RECORD +0 -55
  61. statement_extractor/plugins/canonicalizers/__init__.py +0 -17
  62. statement_extractor/plugins/canonicalizers/base.py +0 -9
  63. statement_extractor/plugins/canonicalizers/location.py +0 -219
  64. statement_extractor/plugins/canonicalizers/organization.py +0 -230
  65. statement_extractor/plugins/canonicalizers/person.py +0 -242
  66. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/WHEEL +0 -0
  67. {corp_extractor-0.5.0.dist-info → corp_extractor-0.9.3.dist-info}/entry_points.txt +0 -0
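For readers who want to reproduce or audit a comparison like this locally, wheels are plain zip archives, so member lists and file contents can be diffed with the standard library alone. A rough, illustrative sketch (the local file names below are assumptions, not part of this report):

    import difflib
    import zipfile

    OLD = "corp_extractor-0.5.0-py3-none-any.whl"   # hypothetical local paths
    NEW = "corp_extractor-0.9.3-py3-none-any.whl"

    def wheel_text(path: str, member: str) -> list[str]:
        # Wheels are zip files; read one member as text lines
        with zipfile.ZipFile(path) as zf:
            return zf.read(member).decode("utf-8", errors="replace").splitlines(keepends=True)

    with zipfile.ZipFile(OLD) as a, zipfile.ZipFile(NEW) as b:
        old_names, new_names = set(a.namelist()), set(b.namelist())

    print("added:", sorted(new_names - old_names))
    print("removed:", sorted(old_names - new_names))

    # Unified diff for a file present in both versions
    member = "statement_extractor/cli.py"
    diff = difflib.unified_diff(wheel_text(OLD, member), wheel_text(NEW, member),
                                fromfile=f"{OLD}/{member}", tofile=f"{NEW}/{member}")
    print("".join(diff))

The largest change between these two versions is in statement_extractor/cli.py, shown below.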
statement_extractor/cli.py
@@ -11,6 +11,7 @@ Usage:
 import json
 import logging
 import sys
+from pathlib import Path
 from typing import Optional

 import click
@@ -42,16 +43,27 @@ def _configure_logging(verbose: bool) -> None:
         "statement_extractor.plugins.extractors.gliner2",
         "statement_extractor.plugins.splitters",
         "statement_extractor.plugins.labelers",
+        "statement_extractor.plugins.scrapers",
+        "statement_extractor.plugins.scrapers.http",
+        "statement_extractor.plugins.pdf",
+        "statement_extractor.plugins.pdf.pypdf",
+        "statement_extractor.document",
+        "statement_extractor.document.loader",
+        "statement_extractor.document.html_extractor",
+        "statement_extractor.document.pipeline",
+        "statement_extractor.document.chunker",
     ]:
         logging.getLogger(logger_name).setLevel(level)

     # Suppress noisy third-party loggers
     for noisy_logger in [
+        "httpcore",
         "httpcore.http11",
         "httpcore.connection",
         "httpx",
         "urllib3",
         "huggingface_hub",
+        "asyncio",
     ]:
         logging.getLogger(noisy_logger).setLevel(logging.WARNING)

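The hunk above extends the CLI's logging setup: the new scraper, PDF, and document modules join the package loggers that follow the verbosity level, while the root "httpcore" and "asyncio" loggers join the list pinned to WARNING. A minimal sketch of the same stdlib pattern (the level logic here is an assumption, not copied from the package):

    import logging

    def configure_logging(verbose: bool) -> None:
        level = logging.DEBUG if verbose else logging.INFO
        for name in ["statement_extractor.document", "statement_extractor.plugins.pdf"]:
            logging.getLogger(name).setLevel(level)
        # Third-party libraries stay quiet regardless of verbosity
        for noisy in ["httpcore", "httpx", "asyncio"]:
            logging.getLogger(noisy).setLevel(logging.WARNING)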
@@ -74,14 +86,17 @@ def main():
     \b
     Commands:
         split       Extract sub-statements from text (simple, fast)
-        pipeline    Run the full 5-stage extraction pipeline
+        pipeline    Run the full 6-stage extraction pipeline
+        document    Process documents with chunking and citations
         plugins     List or inspect available plugins
+        db          Manage entity/organization embedding database

     \b
     Examples:
         corp-extractor split "Apple announced a new iPhone."
         corp-extractor split -f article.txt --json
         corp-extractor pipeline "Apple CEO Tim Cook announced..." --stages 1-3
+        corp-extractor document process report.txt --title "Annual Report"
         corp-extractor plugins list
     """
     pass
@@ -354,7 +369,7 @@ def pipeline_cmd(
     if enabled_plugins:
         enabled_plugin_set = {p.strip() for p in enabled_plugins.split(",") if p.strip()}

-    disabled_plugin_set = set()
+    disabled_plugin_set = None
     if disable_plugins:
         disabled_plugin_set = {p.strip() for p in disable_plugins.split(",") if p.strip()}

@@ -365,13 +380,15 @@ def pipeline_cmd(
         if not quiet:
             click.echo("Default predicates disabled - using entity extraction only", err=True)

-    # Create config
-    config = PipelineConfig(
-        enabled_stages=enabled_stages,
-        enabled_plugins=enabled_plugin_set,
-        disabled_plugins=disabled_plugin_set,
-        extractor_options=extractor_options,
-    )
+    # Create config - only pass disabled_plugins if user explicitly specified, otherwise use defaults
+    config_kwargs: dict = {
+        "enabled_stages": enabled_stages,
+        "enabled_plugins": enabled_plugin_set,
+        "extractor_options": extractor_options,
+    }
+    if disabled_plugin_set is not None:
+        config_kwargs["disabled_plugins"] = disabled_plugin_set
+    config = PipelineConfig(**config_kwargs)

     # Run pipeline
     try:
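The change above is about preserving defaults: disabled_plugins is now only passed to PipelineConfig when the user explicitly supplied a value, so the config class's own default is no longer clobbered by an empty set. A rough sketch of the pattern, using a stand-in config class rather than the real PipelineConfig:

    from dataclasses import dataclass, field
    from typing import Optional, Set

    @dataclass
    class FakeConfig:
        # Hypothetical default: some plugins are disabled out of the box
        disabled_plugins: Set[str] = field(default_factory=lambda: {"slow_plugin"})

    def build_config(cli_value: Optional[str]) -> FakeConfig:
        kwargs = {}
        if cli_value is not None:  # only override when the flag was actually given
            kwargs["disabled_plugins"] = {p.strip() for p in cli_value.split(",") if p.strip()}
        return FakeConfig(**kwargs)

    print(build_config(None).disabled_plugins)   # {'slow_plugin'}  (default kept)
    print(build_config("a,b").disabled_plugins)  # {'a', 'b'}       (explicit override)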
@@ -422,7 +439,7 @@ def _print_pipeline_json(ctx):
     """Print pipeline results as JSON."""
     output = {
         "statement_count": ctx.statement_count,
-        "raw_triples": [t.model_dump() for t in ctx.raw_triples],
+        "split_sentences": [s.model_dump() for s in ctx.split_sentences],
         "statements": [s.model_dump() for s in ctx.statements],
         "labeled_statements": [stmt.as_dict() for stmt in ctx.labeled_statements],
         "timings": ctx.stage_timings,
@@ -455,9 +472,10 @@ def _print_pipeline_triples(ctx):
     elif ctx.statements:
         for stmt in ctx.statements:
             click.echo(f"{stmt.subject.text}\t{stmt.predicate}\t{stmt.object.text}")
-    elif ctx.raw_triples:
-        for triple in ctx.raw_triples:
-            click.echo(f"{triple.subject_text}\t{triple.predicate_text}\t{triple.object_text}")
+    elif ctx.split_sentences:
+        # Stage 1 only output - just show the split sentences (no triples yet)
+        for sentence in ctx.split_sentences:
+            click.echo(sentence.text)


 def _print_pipeline_table(ctx, verbose: bool):
@@ -511,20 +529,16 @@ def _print_pipeline_table(ctx, verbose: bool):

         click.echo("-" * 80)

-    elif ctx.raw_triples:
-        click.echo(f"\nExtracted {len(ctx.raw_triples)} raw triple(s):\n")
+    elif ctx.split_sentences:
+        click.echo(f"\nSplit into {len(ctx.split_sentences)} atomic sentence(s):\n")
         click.echo("-" * 80)

-        for i, triple in enumerate(ctx.raw_triples, 1):
-            click.echo(f"{i}. {triple.subject_text}")
-            click.echo(f" --[{triple.predicate_text}]-->")
-            click.echo(f" {triple.object_text}")
+        for i, sentence in enumerate(ctx.split_sentences, 1):
+            text_preview = sentence.text[:100] + "..." if len(sentence.text) > 100 else sentence.text
+            click.echo(f"{i}. {text_preview}")

             if verbose:
-                click.echo(f" Confidence: {triple.confidence:.2f}")
-                if triple.source_sentence:
-                    source = triple.source_sentence[:60] + "..." if len(triple.source_sentence) > 60 else triple.source_sentence
-                    click.echo(f" Source: \"{source}\"")
+                click.echo(f" Confidence: {sentence.confidence:.2f}")

         click.echo("-" * 80)

@@ -629,12 +643,2004 @@ def _load_all_plugins():
     """Load all plugins by importing their modules."""
     # Import all plugin modules to trigger registration
     try:
-        from .plugins import splitters, extractors, qualifiers, canonicalizers, labelers, taxonomy
+        from .plugins import splitters, extractors, qualifiers, labelers, taxonomy
         # The @PluginRegistry decorators will register plugins on import
+        _ = splitters, extractors, qualifiers, labelers, taxonomy  # Silence unused warnings
     except ImportError as e:
         logging.debug(f"Some plugins failed to load: {e}")


+# =============================================================================
+# Database commands
+# =============================================================================
+
+
+@main.group("db")
+def db_cmd():
+    """
+    Manage entity/organization embedding database.
+
+    \b
+    Commands:
+        import-gleif            Import GLEIF LEI data (~3M records)
+        import-sec              Import SEC Edgar bulk data (~100K+ filers)
+        import-sec-officers     Import SEC Form 4 officers/directors
+        import-ch-officers      Import UK Companies House officers (Prod195)
+        import-companies-house  Import UK Companies House (~5M records)
+        import-wikidata         Import Wikidata organizations (SPARQL, may timeout)
+        import-people           Import Wikidata notable people (SPARQL, may timeout)
+        import-wikidata-dump    Import from Wikidata JSON dump (recommended)
+        canonicalize            Link equivalent records across sources
+        status                  Show database status
+        search                  Search for an organization
+        search-people           Search for a person
+        download                Download database from HuggingFace
+        upload                  Upload database with lite variant
+        create-lite             Create lite version (no record data)
+
+    \b
+    Examples:
+        corp-extractor db import-sec --download
+        corp-extractor db import-sec-officers --start-year 2023 --limit 10000
+        corp-extractor db import-gleif --download --limit 100000
+        corp-extractor db import-wikidata-dump --download --limit 50000
+        corp-extractor db canonicalize
+        corp-extractor db status
+        corp-extractor db search "Apple Inc"
+        corp-extractor db search-people "Tim Cook"
+        corp-extractor db upload entities.db
+    """
+    pass
+
+
+@db_cmd.command("gleif-info")
+def db_gleif_info():
+    """
+    Show information about the latest available GLEIF data file.
+
+    \b
+    Examples:
+        corp-extractor db gleif-info
+    """
+    from .database.importers import GleifImporter
+
+    importer = GleifImporter()
+
+    try:
+        info = importer.get_latest_file_info()
+        record_count = info.get('record_count')
+
+        click.echo("\nLatest GLEIF Data File")
+        click.echo("=" * 40)
+        click.echo(f"File ID: {info['id']}")
+        click.echo(f"Publish Date: {info['publish_date']}")
+        click.echo(f"Record Count: {record_count:,}" if record_count else "Record Count: unknown")
+
+        delta = info.get("delta_from_last_file", {})
+        if delta:
+            click.echo(f"\nChanges from previous file:")
+            if delta.get('new'):
+                click.echo(f" New: {delta.get('new'):,}")
+            if delta.get('updated'):
+                click.echo(f" Updated: {delta.get('updated'):,}")
+            if delta.get('retired'):
+                click.echo(f" Retired: {delta.get('retired'):,}")
+
+    except Exception as e:
+        raise click.ClickException(f"Failed to get GLEIF info: {e}")
+
+
+@db_cmd.command("import-gleif")
+@click.argument("file_path", type=click.Path(exists=True), required=False)
+@click.option("--download", is_flag=True, help="Download latest GLEIF file before importing")
+@click.option("--force", is_flag=True, help="Force re-download even if cached")
+@click.option("--db", "db_path", type=click.Path(), help="Database path (default: ~/.cache/corp-extractor/entities.db)")
+@click.option("--limit", type=int, help="Limit number of records to import")
+@click.option("--batch-size", type=int, default=50000, help="Batch size for commits (default: 50000)")
+@click.option("-v", "--verbose", is_flag=True, help="Verbose output")
+def db_import_gleif(file_path: Optional[str], download: bool, force: bool, db_path: Optional[str], limit: Optional[int], batch_size: int, verbose: bool):
+    """
+    Import GLEIF LEI data into the entity database.
+
+    If no file path is provided and --download is set, downloads the latest
+    GLEIF data file automatically. Downloaded files are cached and reused
+    unless --force is specified.
+
+    \b
+    Examples:
+        corp-extractor db import-gleif /path/to/lei-records.xml
+        corp-extractor db import-gleif --download
+        corp-extractor db import-gleif --download --limit 10000
+        corp-extractor db import-gleif --download --force  # Re-download
+    """
+    _configure_logging(verbose)
+
+    from .database import OrganizationDatabase, CompanyEmbedder
+    from .database.importers import GleifImporter
+
+    importer = GleifImporter()
+
+    # Handle file path
+    if file_path is None:
+        if not download:
+            raise click.UsageError("Either provide a file path or use --download to fetch the latest GLEIF data")
+        click.echo("Downloading latest GLEIF data...", err=True)
+        file_path = str(importer.download_latest(force=force))
+    elif download:
+        click.echo("Downloading latest GLEIF data (ignoring provided file path)...", err=True)
+        file_path = str(importer.download_latest(force=force))
+
+    click.echo(f"Importing GLEIF data from {file_path}...", err=True)
+
+    # Initialize components
+    embedder = CompanyEmbedder()
+    database = OrganizationDatabase(db_path=db_path, embedding_dim=embedder.embedding_dim)
+
+    # Import records in batches
+    records = []
+    count = 0
+
+    for record in importer.import_from_file(file_path, limit=limit):
+        records.append(record)
+
+        if len(records) >= batch_size:
+            # Embed and insert batch
+            names = [r.name for r in records]
+            embeddings = embedder.embed_batch(names)
+            database.insert_batch(records, embeddings)
+            count += len(records)
+            click.echo(f"Imported {count} records...", err=True)
+            records = []
+
+    # Final batch
+    if records:
+        names = [r.name for r in records]
+        embeddings = embedder.embed_batch(names)
+        database.insert_batch(records, embeddings)
+        count += len(records)
+
+    click.echo(f"\nImported {count} GLEIF records successfully.", err=True)
+    database.close()
+
+
+@db_cmd.command("import-sec")
+@click.option("--download", is_flag=True, help="Download bulk submissions.zip (~500MB, ~100K+ filers)")
+@click.option("--file", "file_path", type=click.Path(exists=True), help="Local file (submissions.zip or company_tickers.json)")
+@click.option("--db", "db_path", type=click.Path(), help="Database path")
+@click.option("--limit", type=int, help="Limit number of records")
+@click.option("--batch-size", type=int, default=10000, help="Batch size (default: 10000)")
+@click.option("-v", "--verbose", is_flag=True, help="Verbose output")
+def db_import_sec(download: bool, file_path: Optional[str], db_path: Optional[str], limit: Optional[int], batch_size: int, verbose: bool):
+    """
+    Import SEC Edgar data into the entity database.
+
+    By default, downloads the bulk submissions.zip file which contains
+    ALL SEC filers (~100K+), not just companies with ticker symbols (~10K).
+
+    \b
+    Examples:
+        corp-extractor db import-sec --download
+        corp-extractor db import-sec --download --limit 50000
+        corp-extractor db import-sec --file /path/to/submissions.zip
+        corp-extractor db import-sec --file /path/to/company_tickers.json  # legacy
+    """
+    _configure_logging(verbose)
+
+    from .database import OrganizationDatabase, CompanyEmbedder
+    from .database.importers import SecEdgarImporter
+
+    if not download and not file_path:
+        raise click.UsageError("Either --download or --file is required")
+
+    # Initialize components
+    embedder = CompanyEmbedder()
+    database = OrganizationDatabase(db_path=db_path, embedding_dim=embedder.embedding_dim)
+    importer = SecEdgarImporter()
+
+    # Get records
+    if file_path:
+        click.echo(f"Importing SEC Edgar data from {file_path}...", err=True)
+        record_iter = importer.import_from_file(file_path, limit=limit)
+    else:
+        click.echo("Downloading SEC submissions.zip (~500MB)...", err=True)
+        record_iter = importer.import_from_url(limit=limit)
+
+    # Import records in batches
+    records = []
+    count = 0
+
+    for record in record_iter:
+        records.append(record)
+
+        if len(records) >= batch_size:
+            names = [r.name for r in records]
+            embeddings = embedder.embed_batch(names)
+            database.insert_batch(records, embeddings)
+            count += len(records)
+            click.echo(f"Imported {count} records...", err=True)
+            records = []
+
+    # Final batch
+    if records:
+        names = [r.name for r in records]
+        embeddings = embedder.embed_batch(names)
+        database.insert_batch(records, embeddings)
+        count += len(records)
+
+    click.echo(f"\nImported {count} SEC Edgar records successfully.", err=True)
+    database.close()
+
+
+@db_cmd.command("import-sec-officers")
+@click.option("--db", "db_path", type=click.Path(), help="Database path")
+@click.option("--start-year", type=int, default=2020, help="Start year (default: 2020)")
+@click.option("--end-year", type=int, help="End year (default: current year)")
+@click.option("--limit", type=int, help="Limit number of records")
+@click.option("--batch-size", type=int, default=1000, help="Batch size for commits (default: 1000)")
+@click.option("--resume", is_flag=True, help="Resume from saved progress")
+@click.option("--skip-existing", is_flag=True, help="Skip records that already exist")
+@click.option("-v", "--verbose", is_flag=True, help="Verbose output")
+def db_import_sec_officers(db_path: Optional[str], start_year: int, end_year: Optional[int], limit: Optional[int], batch_size: int, resume: bool, skip_existing: bool, verbose: bool):
+    """
+    Import SEC Form 4 insider data into the people database.
+
+    Downloads Form 4 filings from SEC EDGAR and extracts officers, directors,
+    and significant investors (10%+ owners) from each company.
+
+    Form 4 filings are submitted when insiders buy or sell company stock.
+    They contain the person's name, role (officer/director), and company.
+
+    Rate limited to 5 requests/second to comply with SEC guidelines.
+
+    \b
+    Examples:
+        corp-extractor db import-sec-officers --limit 1000
+        corp-extractor db import-sec-officers --start-year 2023
+        corp-extractor db import-sec-officers --resume
+        corp-extractor db import-sec-officers --skip-existing -v
+    """
+    _configure_logging(verbose)
+
+    from .database.store import get_person_database, get_database, DEFAULT_DB_PATH
+    from .database.embeddings import CompanyEmbedder
+    from .database.importers.sec_form4 import SecForm4Importer
+
+    # Default database path
+    if db_path is None:
+        db_path_obj = DEFAULT_DB_PATH
+    else:
+        db_path_obj = Path(db_path)
+
+    click.echo(f"Importing SEC Form 4 officers/directors to {db_path_obj}...", err=True)
+    click.echo(f"Year range: {start_year} - {end_year or 'current'}", err=True)
+    if resume:
+        click.echo("Resuming from saved progress...", err=True)
+
+    # Initialize components
+    database = get_person_database(db_path=db_path_obj)
+    org_database = get_database(db_path=db_path_obj)
+    embedder = CompanyEmbedder()
+    importer = SecForm4Importer()
+
+    # Import records in batches
+    records = []
+    count = 0
+    skipped_existing = 0
+
+    def progress_callback(year: int, quarter: int, filing_idx: int, accession: str, total: int) -> None:
+        if verbose and filing_idx % 100 == 0:
+            click.echo(f" {year} Q{quarter}: {filing_idx} filings, {total} records", err=True)
+
+    for record in importer.import_range(
+        start_year=start_year,
+        end_year=end_year,
+        limit=limit,
+        resume=resume,
+        progress_callback=progress_callback,
+    ):
+        # Skip existing records if flag is set
+        if skip_existing:
+            existing = database.get_by_source_id(record.source, record.source_id)
+            if existing is not None:
+                skipped_existing += 1
+                continue
+
+        # Look up org ID by CIK if available
+        issuer_cik = record.record.get("issuer_cik", "")
+        if issuer_cik:
+            org_id = org_database.get_id_by_source_id("sec_edgar", issuer_cik.zfill(10))
+            if org_id is not None:
+                record.known_for_org_id = org_id
+
+        records.append(record)
+
+        if len(records) >= batch_size:
+            embedding_texts = [r.get_embedding_text() for r in records]
+            embeddings = embedder.embed_batch(embedding_texts)
+            database.insert_batch(records, embeddings)
+            count += len(records)
+            click.echo(f"Imported {count} records...", err=True)
+            records = []
+
+    # Final batch
+    if records:
+        embedding_texts = [r.get_embedding_text() for r in records]
+        embeddings = embedder.embed_batch(embedding_texts)
+        database.insert_batch(records, embeddings)
+        count += len(records)
+
+    if skip_existing and skipped_existing > 0:
+        click.echo(f"\nImported {count} SEC officers/directors (skipped {skipped_existing} existing).", err=True)
+    else:
+        click.echo(f"\nImported {count} SEC officers/directors successfully.", err=True)
+
+    org_database.close()
+    database.close()
+
+
+@db_cmd.command("import-ch-officers")
+@click.option("--file", "file_path", type=click.Path(exists=True), required=True, help="Path to CH officers zip file (Prod195)")
+@click.option("--db", "db_path", type=click.Path(), help="Database path")
+@click.option("--limit", type=int, help="Limit number of records")
+@click.option("--batch-size", type=int, default=1000, help="Batch size for commits (default: 1000)")
+@click.option("--resume", is_flag=True, help="Resume from saved progress")
+@click.option("--include-resigned", is_flag=True, help="Include resigned officers (default: current only)")
+@click.option("--skip-existing", is_flag=True, help="Skip records that already exist")
+@click.option("-v", "--verbose", is_flag=True, help="Verbose output")
+def db_import_ch_officers(file_path: str, db_path: Optional[str], limit: Optional[int], batch_size: int, resume: bool, include_resigned: bool, skip_existing: bool, verbose: bool):
+    """
+    Import Companies House officers data into the people database.
+
+    Requires the Prod195 bulk officers zip file from Companies House.
+    Request access via BulkProducts@companieshouse.gov.uk.
+
+    \b
+    Examples:
+        corp-extractor db import-ch-officers --file officers.zip --limit 10000
+        corp-extractor db import-ch-officers --file officers.zip --resume
+        corp-extractor db import-ch-officers --file officers.zip --include-resigned
+    """
+    _configure_logging(verbose)
+
+    from .database.store import get_person_database, get_database, DEFAULT_DB_PATH
+    from .database.embeddings import CompanyEmbedder
+    from .database.importers.companies_house_officers import CompaniesHouseOfficersImporter
+
+    # Default database path
+    if db_path is None:
+        db_path_obj = DEFAULT_DB_PATH
+    else:
+        db_path_obj = Path(db_path)
+
+    click.echo(f"Importing Companies House officers to {db_path_obj}...", err=True)
+    if resume:
+        click.echo("Resuming from saved progress...", err=True)
+
+    # Initialize components
+    database = get_person_database(db_path=db_path_obj)
+    org_database = get_database(db_path=db_path_obj)
+    embedder = CompanyEmbedder()
+    importer = CompaniesHouseOfficersImporter()
+
+    # Import records in batches
+    records = []
+    count = 0
+    skipped_existing = 0
+
+    def progress_callback(file_idx: int, line_num: int, total: int) -> None:
+        if verbose:
+            click.echo(f" File {file_idx}: line {line_num}, {total} records", err=True)
+
+    for record in importer.import_from_zip(
+        file_path,
+        limit=limit,
+        resume=resume,
+        current_only=not include_resigned,
+        progress_callback=progress_callback,
+    ):
+        # Skip existing records if flag is set
+        if skip_existing:
+            existing = database.get_by_source_id(record.source, record.source_id)
+            if existing is not None:
+                skipped_existing += 1
+                continue
+
+        # Look up org ID by company number if available
+        company_number = record.record.get("company_number", "")
+        if company_number:
+            org_id = org_database.get_id_by_source_id("companies_house", company_number)
+            if org_id is not None:
+                record.known_for_org_id = org_id
+
+        records.append(record)
+
+        if len(records) >= batch_size:
+            embedding_texts = [r.get_embedding_text() for r in records]
+            embeddings = embedder.embed_batch(embedding_texts)
+            database.insert_batch(records, embeddings)
+            count += len(records)
+            click.echo(f"Imported {count} records...", err=True)
+            records = []
+
+    # Final batch
+    if records:
+        embedding_texts = [r.get_embedding_text() for r in records]
+        embeddings = embedder.embed_batch(embedding_texts)
+        database.insert_batch(records, embeddings)
+        count += len(records)
+
+    if skip_existing and skipped_existing > 0:
+        click.echo(f"\nImported {count} CH officers (skipped {skipped_existing} existing).", err=True)
+    else:
+        click.echo(f"\nImported {count} CH officers successfully.", err=True)
+
+    org_database.close()
+    database.close()
+
+
+@db_cmd.command("import-wikidata")
+@click.option("--db", "db_path", type=click.Path(), help="Database path")
+@click.option("--limit", type=int, help="Limit number of records")
+@click.option("--batch-size", type=int, default=1000, help="Batch size for commits (default: 1000)")
+@click.option("--type", "query_type", type=click.Choice(["lei", "ticker", "public", "business", "organization", "nonprofit", "government"]), default="lei",
+              help="Query type to use for fetching data")
+@click.option("--all", "import_all", is_flag=True, help="Run all query types sequentially")
+@click.option("-v", "--verbose", is_flag=True, help="Verbose output")
+def db_import_wikidata(db_path: Optional[str], limit: Optional[int], batch_size: int, query_type: str, import_all: bool, verbose: bool):
+    """
+    Import organization data from Wikidata via SPARQL.
+
+    Uses simplified SPARQL queries that avoid timeouts on Wikidata's endpoint.
+    Query types target different organization categories.
+
+    \b
+    Query types:
+        lei           Companies with LEI codes (fastest, most reliable)
+        ticker        Companies listed on stock exchanges
+        public        Direct instances of "public company" (Q891723)
+        business      Direct instances of "business enterprise" (Q4830453)
+        organization  All organizations (Q43229) - NGOs, associations, etc.
+        nonprofit     Non-profit organizations (Q163740)
+        government    Government agencies (Q327333)
+
+    \b
+    Examples:
+        corp-extractor db import-wikidata --limit 10
+        corp-extractor db import-wikidata --type organization --limit 1000
+        corp-extractor db import-wikidata --type nonprofit --limit 5000
+        corp-extractor db import-wikidata --all --limit 10000
+    """
+    _configure_logging(verbose)
+
+    from .database import OrganizationDatabase, CompanyEmbedder
+    from .database.importers import WikidataImporter
+
+    click.echo(f"Importing Wikidata organization data via SPARQL (type={query_type}, all={import_all})...", err=True)
+
+    # Initialize components
+    embedder = CompanyEmbedder()
+    database = OrganizationDatabase(db_path=db_path, embedding_dim=embedder.embedding_dim)
+    importer = WikidataImporter(batch_size=500)  # Smaller SPARQL batch size for reliability
+
+    # Import records in batches
+    records = []
+    count = 0
+
+    for record in importer.import_from_sparql(limit=limit, query_type=query_type, import_all=import_all):
+        records.append(record)
+
+        if len(records) >= batch_size:
+            names = [r.name for r in records]
+            embeddings = embedder.embed_batch(names)
+            database.insert_batch(records, embeddings)
+            count += len(records)
+            click.echo(f"Imported {count} records...", err=True)
+            records = []
+
+    # Final batch
+    if records:
+        names = [r.name for r in records]
+        embeddings = embedder.embed_batch(names)
+        database.insert_batch(records, embeddings)
+        count += len(records)
+
+    click.echo(f"\nImported {count} Wikidata records successfully.", err=True)
+    database.close()
+
+
+@db_cmd.command("import-people")
+@click.option("--db", "db_path", type=click.Path(), help="Database path")
+@click.option("--limit", type=int, help="Limit number of records")
+@click.option("--batch-size", type=int, default=1000, help="Batch size for commits (default: 1000)")
+@click.option("--type", "query_type", type=click.Choice([
+    "executive", "politician", "athlete", "artist",
+    "academic", "scientist", "journalist", "entrepreneur", "activist"
+]), default="executive", help="Person type to import")
+@click.option("--all", "import_all", is_flag=True, help="Run all person type queries sequentially")
+@click.option("--enrich", is_flag=True, help="Query individual people to get role/org data (slower, resumable)")
+@click.option("--enrich-only", is_flag=True, help="Only enrich existing people (skip bulk import)")
+@click.option("--enrich-dates", is_flag=True, help="Query individual people to get start/end dates (slower)")
+@click.option("--skip-existing", is_flag=True, help="Skip records that already exist (default: update them)")
+@click.option("-v", "--verbose", is_flag=True, help="Verbose output")
+def db_import_people(db_path: Optional[str], limit: Optional[int], batch_size: int, query_type: str, import_all: bool, enrich: bool, enrich_only: bool, enrich_dates: bool, skip_existing: bool, verbose: bool):
+    """
+    Import notable people data from Wikidata via SPARQL.
+
+    Uses a two-phase approach for reliability:
+    1. Bulk import: Fast fetch of QID, name, country (no timeouts)
+    2. Enrich (optional): Per-person queries for role/org/dates
+
+    Imports people with English Wikipedia articles (ensures notability).
+
+    \b
+    Examples:
+        corp-extractor db import-people --type executive --limit 5000
+        corp-extractor db import-people --all --limit 10000
+        corp-extractor db import-people --type executive --enrich
+        corp-extractor db import-people --enrich-only --limit 100
+        corp-extractor db import-people --type politician -v
+    """
+    _configure_logging(verbose)
+
+    from .database.store import get_person_database, get_database, DEFAULT_DB_PATH
+    from .database.embeddings import CompanyEmbedder
+    from .database.importers.wikidata_people import WikidataPeopleImporter
+
+    # Default database path
+    if db_path is None:
+        db_path_obj = DEFAULT_DB_PATH
+    else:
+        db_path_obj = Path(db_path)
+
+    click.echo(f"Importing Wikidata people to {db_path_obj}...", err=True)
+
+    # Initialize components
+    database = get_person_database(db_path=db_path_obj)
+    org_database = get_database(db_path=db_path_obj)
+    embedder = CompanyEmbedder()
+    importer = WikidataPeopleImporter(batch_size=batch_size)
+
+    count = 0
+
+    # Phase 1: Bulk import (fast, minimal data) - skip if --enrich-only
+    if not enrich_only:
+        records = []
+        skipped_existing = 0
+
+        click.echo("Phase 1: Bulk import (QID, name, country)...", err=True)
+
+        for record in importer.import_from_sparql(limit=limit, query_type=query_type, import_all=import_all):
+            # Skip existing records if flag is set
+            if skip_existing:
+                existing = database.get_by_source_id(record.source, record.source_id)
+                if existing is not None:
+                    skipped_existing += 1
+                    continue
+
+            records.append(record)
+
+            if len(records) >= batch_size:
+                # Generate embeddings (just name for now, will re-embed after enrichment)
+                embedding_texts = [r.get_embedding_text() for r in records]
+                embeddings = embedder.embed_batch(embedding_texts)
+                database.insert_batch(records, embeddings)
+                count += len(records)
+
+                click.echo(f" Imported {count} people...", err=True)
+                records = []
+
+        # Final batch
+        if records:
+            embedding_texts = [r.get_embedding_text() for r in records]
+            embeddings = embedder.embed_batch(embedding_texts)
+            database.insert_batch(records, embeddings)
+            count += len(records)
+
+        if skip_existing and skipped_existing > 0:
+            click.echo(f"\nPhase 1 complete: {count} people imported (skipped {skipped_existing} existing).", err=True)
+        else:
+            click.echo(f"\nPhase 1 complete: {count} people imported.", err=True)
+    else:
+        click.echo("Skipping Phase 1 (bulk import) - using existing database records.", err=True)
+        # Enable enrich if enrich_only is set
+        enrich = True
+
+    # Phase 2: Enrich with role/org/dates (optional, slower but resumable)
+    if enrich:
+        click.echo("\nPhase 2: Enriching with role/org/dates (parallel queries)...", err=True)
+        # Get all people without role/org
+        people_to_enrich = []
+        enriched_count = 0
+        for record in database.iter_records():
+            if not record.known_for_role and not record.known_for_org:
+                people_to_enrich.append(record)
+                enriched_count += 1
+                # Apply limit if --enrich-only
+                if enrich_only and limit and enriched_count >= limit:
+                    break
+
+        if people_to_enrich:
+            click.echo(f"Found {len(people_to_enrich)} people to enrich...", err=True)
+            importer.enrich_people_role_org_batch(people_to_enrich, delay_seconds=0.1, max_workers=5)
+
+            # Persist the enriched data and re-generate embeddings
+            updated = 0
+            org_count = 0
+            date_count = 0
+            for person in people_to_enrich:
+                if person.known_for_role or person.known_for_org:
+                    # Look up org ID if we have org_qid
+                    org_qid = person.record.get("org_qid", "")
+                    if org_qid:
+                        org_id = org_database.get_id_by_source_id("wikipedia", org_qid)
+                        if org_id is not None:
+                            person.known_for_org_id = org_id
+
+                    # Update the record with new role/org/dates and re-embed
+                    new_embedding_text = person.get_embedding_text()
+                    new_embedding = embedder.embed(new_embedding_text)
+                    if database.update_role_org(
+                        person.source, person.source_id,
+                        person.known_for_role, person.known_for_org,
+                        person.known_for_org_id, new_embedding,
+                        person.from_date, person.to_date,
+                    ):
+                        updated += 1
+                        if person.known_for_org:
+                            org_count += 1
+                        if person.from_date or person.to_date:
+                            date_count += 1
+                        if verbose:
+                            date_str = ""
+                            if person.from_date or person.to_date:
+                                date_str = f" ({person.from_date or '?'} - {person.to_date or '?'})"
+                            click.echo(f" {person.name}: {person.known_for_role} at {person.known_for_org}{date_str}", err=True)
+
+            click.echo(f"Updated {updated} people ({org_count} with orgs, {date_count} with dates).", err=True)
+
+    # Phase 3: Enrich with dates (optional, even slower)
+    if enrich_dates:
+        click.echo("\nPhase 3: Enriching with dates...", err=True)
+        # Get all people without dates but with role (dates are associated with positions)
+        people_to_enrich = []
+        for record in database.iter_records():
+            if not record.from_date and not record.to_date and record.known_for_role:
+                people_to_enrich.append(record)
+
+        if people_to_enrich:
+            click.echo(f"Found {len(people_to_enrich)} people to enrich with dates...", err=True)
+            enriched = importer.enrich_people_batch(people_to_enrich, delay_seconds=0.3)
+
+            # Persist the enriched dates
+            updated = 0
+            for person in people_to_enrich:
+                if person.from_date or person.to_date:
+                    if database.update_dates(person.source, person.source_id, person.from_date, person.to_date):
+                        updated += 1
+                        if verbose:
+                            click.echo(f" {person.name}: {person.from_date or '?'} - {person.to_date or '?'}", err=True)
+
+            click.echo(f"Updated {updated} people with dates.", err=True)
+
+    org_database.close()
+    database.close()
+
+
+ @db_cmd.command("import-wikidata-dump")
1329
+ @click.option("--dump", "dump_path", type=click.Path(exists=True), help="Path to Wikidata JSON dump file (.bz2 or .gz)")
1330
+ @click.option("--download", is_flag=True, help="Download latest dump first (~100GB)")
1331
+ @click.option("--force", is_flag=True, help="Force re-download even if cached")
1332
+ @click.option("--no-aria2", is_flag=True, help="Don't use aria2c even if available (slower)")
1333
+ @click.option("--db", "db_path", type=click.Path(), help="Database path")
1334
+ @click.option("--people/--no-people", default=True, help="Import people (default: yes)")
1335
+ @click.option("--orgs/--no-orgs", default=True, help="Import organizations (default: yes)")
1336
+ @click.option("--require-enwiki", is_flag=True, help="Only import orgs with English Wikipedia articles")
1337
+ @click.option("--resume", is_flag=True, help="Resume from last position in dump file (tracks entity index)")
1338
+ @click.option("--skip-updates", is_flag=True, help="Skip Q codes already in database (no updates)")
1339
+ @click.option("--limit", type=int, help="Max records per type (people and/or orgs)")
1340
+ @click.option("--batch-size", type=int, default=10000, help="Batch size for commits (default: 10000)")
1341
+ @click.option("-v", "--verbose", is_flag=True, help="Verbose output")
1342
+ def db_import_wikidata_dump(
1343
+ dump_path: Optional[str],
1344
+ download: bool,
1345
+ force: bool,
1346
+ no_aria2: bool,
1347
+ db_path: Optional[str],
1348
+ people: bool,
1349
+ orgs: bool,
1350
+ require_enwiki: bool,
1351
+ resume: bool,
1352
+ skip_updates: bool,
1353
+ limit: Optional[int],
1354
+ batch_size: int,
1355
+ verbose: bool,
1356
+ ):
1357
+ """
1358
+ Import people and organizations from Wikidata JSON dump.
1359
+
1360
+ This uses the full Wikidata JSON dump (~100GB compressed) to import
1361
+ all humans and organizations with English Wikipedia articles. This
1362
+ avoids SPARQL query timeouts that occur with large result sets.
1363
+
1364
+ The dump is streamed line-by-line to minimize memory usage.
1365
+
1366
+ \b
1367
+ Features:
1368
+ - No timeouts (processes locally)
1369
+ - Complete coverage (all notable people/orgs)
1370
+ - Resumable with --resume (tracks position in dump file)
1371
+ - Skip existing with --skip-updates (loads existing Q codes)
1372
+ - People like Andy Burnham are captured via occupation (P106)
1373
+
1374
+ \b
1375
+ Resume options:
1376
+ - --resume: Resume from where the dump processing left off (tracks entity index).
1377
+ Progress is saved after each batch. Use this if import was interrupted.
1378
+ - --skip-updates: Skip Q codes already in database (no updates to existing records).
1379
+ Use this to add new records without re-processing existing ones.
1380
+
1381
+ \b
1382
+ Examples:
1383
+ corp-extractor db import-wikidata-dump --dump /path/to/dump.json.bz2 --limit 10000
1384
+ corp-extractor db import-wikidata-dump --download --people --no-orgs --limit 50000
1385
+ corp-extractor db import-wikidata-dump --dump dump.json.bz2 --orgs --no-people
1386
+ corp-extractor db import-wikidata-dump --dump dump.json.bz2 --resume # Resume interrupted import
1387
+ corp-extractor db import-wikidata-dump --dump dump.json.bz2 --skip-updates # Skip existing Q codes
1388
+ """
1389
+ _configure_logging(verbose)
1390
+
1391
+ from .database.store import get_person_database, get_database, DEFAULT_DB_PATH
1392
+ from .database.embeddings import CompanyEmbedder
1393
+ from .database.importers.wikidata_dump import WikidataDumpImporter, DumpProgress
1394
+
1395
+ if not dump_path and not download:
1396
+ raise click.UsageError("Either --dump path or --download is required")
1397
+
1398
+ if not people and not orgs:
1399
+ raise click.UsageError("Must import at least one of --people or --orgs")
1400
+
1401
+ # Default database path
1402
+ if db_path is None:
1403
+ db_path_obj = DEFAULT_DB_PATH
1404
+ else:
1405
+ db_path_obj = Path(db_path)
1406
+
1407
+ click.echo(f"Importing Wikidata dump to {db_path_obj}...", err=True)
1408
+
1409
+ # Initialize importer
1410
+ importer = WikidataDumpImporter(dump_path=dump_path)
1411
+
1412
+ # Download if requested
1413
+ if download:
1414
+ import shutil
1415
+ dump_target = importer.get_dump_path()
1416
+ click.echo(f"Downloading Wikidata dump (~100GB) to:", err=True)
1417
+ click.echo(f" {dump_target}", err=True)
1418
+
1419
+ # Check for aria2c
1420
+ has_aria2 = shutil.which("aria2c") is not None
1421
+ use_aria2 = has_aria2 and not no_aria2
1422
+
1423
+ if use_aria2:
1424
+ click.echo(" Using aria2c for fast parallel download (16 connections)", err=True)
1425
+ dump_file = importer.download_dump(force=force, use_aria2=True)
1426
+ click.echo(f"\nUsing dump: {dump_file}", err=True)
1427
+ else:
1428
+ if not has_aria2:
1429
+ click.echo("", err=True)
1430
+ click.echo(" TIP: Install aria2c for 10-20x faster downloads:", err=True)
1431
+ click.echo(" brew install aria2 (macOS)", err=True)
1432
+ click.echo(" apt install aria2 (Ubuntu/Debian)", err=True)
1433
+ click.echo("", err=True)
1434
+
1435
+ # Use urllib to get content length first
1436
+ import urllib.request
1437
+ req = urllib.request.Request(
1438
+ "https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2",
1439
+ headers={"User-Agent": "corp-extractor/1.0"},
1440
+ method="HEAD"
1441
+ )
1442
+ with urllib.request.urlopen(req) as response:
1443
+ total_size = int(response.headers.get("content-length", 0))
1444
+
1445
+ if total_size:
1446
+ total_gb = total_size / (1024 ** 3)
1447
+ click.echo(f" Size: {total_gb:.1f} GB", err=True)
1448
+
1449
+ # Download with progress bar
1450
+ progress_bar = None
1451
+
1452
+ def update_progress(downloaded: int, total: int) -> None:
1453
+ nonlocal progress_bar
1454
+ if progress_bar is None and total > 0:
1455
+ progress_bar = click.progressbar(
1456
+ length=total,
1457
+ label="Downloading",
1458
+ show_percent=True,
1459
+ show_pos=True,
1460
+ item_show_func=lambda x: f"{(x or 0) / (1024**3):.1f} GB" if x else "",
1461
+ )
1462
+ progress_bar.__enter__()
1463
+ if progress_bar:
1464
+ # Update to absolute position
1465
+ progress_bar.update(downloaded - progress_bar.pos)
1466
+
1467
+ try:
1468
+ dump_file = importer.download_dump(force=force, use_aria2=False, progress_callback=update_progress)
1469
+ finally:
1470
+ if progress_bar:
1471
+ progress_bar.__exit__(None, None, None)
1472
+
1473
+ click.echo(f"\nUsing dump: {dump_file}", err=True)
1474
+ elif dump_path:
1475
+ click.echo(f"Using dump: {dump_path}", err=True)
1476
+
1477
+ # Initialize embedder (loads model, may take time on first run)
1478
+ click.echo("Loading embedding model...", err=True)
1479
+ sys.stderr.flush()
1480
+ embedder = CompanyEmbedder()
1481
+ click.echo("Embedding model loaded.", err=True)
1482
+ sys.stderr.flush()
1483
+
1484
+ # Load existing QID labels from database and seed the importer's cache
1485
+ database = get_person_database(db_path=db_path_obj)
1486
+ existing_labels = database.get_all_qid_labels()
1487
+ if existing_labels:
1488
+ click.echo(f"Loaded {len(existing_labels):,} existing QID labels from DB", err=True)
1489
+ importer.set_label_cache(existing_labels)
1490
+ known_qids_at_start = set(existing_labels.keys())
1491
+
1492
+ # Load existing source_ids for skip_updates mode
1493
+ existing_people_ids: set[str] = set()
1494
+ existing_org_ids: set[str] = set()
1495
+ if skip_updates:
1496
+ click.echo("Loading existing records for --skip-updates...", err=True)
1497
+ if people:
1498
+ existing_people_ids = database.get_all_source_ids(source="wikidata")
1499
+ click.echo(f" Found {len(existing_people_ids):,} existing people Q codes", err=True)
1500
+ if orgs:
1501
+ org_database = get_database(db_path=db_path_obj)
1502
+ existing_org_ids = org_database.get_all_source_ids(source="wikipedia")
1503
+ click.echo(f" Found {len(existing_org_ids):,} existing org Q codes", err=True)
1504
+
1505
+ # Load progress for resume mode (position-based resume)
1506
+ progress: Optional[DumpProgress] = None
1507
+ start_index = 0
1508
+ if resume:
1509
+ progress = DumpProgress.load()
1510
+ if progress:
1511
+ # Verify the progress is for the same dump file
1512
+ actual_dump_path = importer._dump_path or Path(dump_path) if dump_path else importer.get_dump_path()
1513
+ if progress.matches_dump(actual_dump_path):
1514
+ start_index = progress.entity_index
1515
+ click.echo(f"Resuming from entity index {start_index:,}", err=True)
1516
+ click.echo(f" Last entity: {progress.last_entity_id}", err=True)
1517
+ click.echo(f" Last updated: {progress.last_updated}", err=True)
1518
+ else:
1519
+ click.echo("Warning: Progress file is for a different dump, starting from beginning", err=True)
1520
+ progress = None
1521
+ else:
1522
+ click.echo("No progress file found, starting from beginning", err=True)
1523
+
1524
+ # Initialize progress tracking
1525
+ if progress is None:
1526
+ actual_dump_path = importer._dump_path or Path(dump_path) if dump_path else importer.get_dump_path()
1527
+ progress = DumpProgress(
1528
+ dump_path=str(actual_dump_path),
1529
+ dump_size=actual_dump_path.stat().st_size if actual_dump_path.exists() else 0,
1530
+ )
1531
+
1532
+ # Helper to persist new labels after each batch
1533
+ def persist_new_labels() -> int:
1534
+ new_labels = importer.get_new_labels_since(known_qids_at_start)
1535
+ if new_labels:
1536
+ database.insert_qid_labels(new_labels)
1537
+ known_qids_at_start.update(new_labels.keys())
1538
+ return len(new_labels)
1539
+ return 0
1540
+
1541
+ # Combined import - single pass through the dump for both people and orgs
1542
+ click.echo("\n=== Combined Import (single dump pass) ===", err=True)
1543
+ sys.stderr.flush() # Ensure output is visible immediately
1544
+ if people:
1545
+ click.echo(f" People: {'up to ' + str(limit) + ' records' if limit else 'unlimited'}", err=True)
1546
+ if skip_updates and existing_people_ids:
1547
+ click.echo(f" Skip updates: {len(existing_people_ids):,} existing Q codes", err=True)
1548
+ if orgs:
1549
+ click.echo(f" Orgs: {'up to ' + str(limit) + ' records' if limit else 'unlimited'}", err=True)
1550
+ if require_enwiki:
1551
+ click.echo(" Filter: only orgs with English Wikipedia articles", err=True)
1552
+ if skip_updates and existing_org_ids:
1553
+ click.echo(f" Skip updates: {len(existing_org_ids):,} existing Q codes", err=True)
1554
+ if start_index > 0:
1555
+ click.echo(f" Resuming from entity index {start_index:,}", err=True)
1556
+
1557
+ # Initialize databases
1558
+ person_database = get_person_database(db_path=db_path_obj)
1559
+ org_database = get_database(db_path=db_path_obj) if orgs else None
1560
+
1561
+ # Batches for each type
1562
+ people_records: list = []
1563
+ org_records: list = []
1564
+ people_count = 0
1565
+ orgs_count = 0
1566
+ last_entity_index = start_index
1567
+ last_entity_id = ""
1568
+
1569
+ def combined_progress_callback(entity_index: int, entity_id: str, ppl_count: int, org_count: int) -> None:
1570
+ nonlocal last_entity_index, last_entity_id
1571
+ last_entity_index = entity_index
1572
+ last_entity_id = entity_id
1573
+
1574
+ def save_progress() -> None:
1575
+ if progress:
1576
+ progress.entity_index = last_entity_index
1577
+ progress.last_entity_id = last_entity_id
1578
+ progress.people_yielded = people_count
1579
+ progress.orgs_yielded = orgs_count
1580
+ progress.save()
1581
+
1582
+ def flush_people_batch() -> None:
1583
+ nonlocal people_records, people_count
1584
+ if people_records:
1585
+ embedding_texts = [r.get_embedding_text() for r in people_records]
1586
+ embeddings = embedder.embed_batch(embedding_texts)
1587
+ person_database.insert_batch(people_records, embeddings)
1588
+ people_count += len(people_records)
1589
+ people_records = []
1590
+
1591
+ def flush_org_batch() -> None:
1592
+ nonlocal org_records, orgs_count
1593
+ if org_records and org_database:
1594
+ names = [r.name for r in org_records]
1595
+ embeddings = embedder.embed_batch(names)
1596
+ org_database.insert_batch(org_records, embeddings)
1597
+ orgs_count += len(org_records)
1598
+ org_records = []
1599
+
1600
+ # Calculate total for progress bar (if limits set for both)
1601
+ total_limit = None
1602
+ if limit and people and orgs:
1603
+ total_limit = limit * 2 # Rough estimate
1604
+ elif limit:
1605
+ total_limit = limit
1606
+
1607
+ click.echo("Starting dump iteration...", err=True)
1608
+ sys.stderr.flush()
1609
+
1610
+ records_seen = 0
1611
+ try:
1612
+ if total_limit:
1613
+ # Use progress bar when we have limits
1614
+ with click.progressbar(
1615
+ length=total_limit,
1616
+ label="Processing dump",
1617
+ show_percent=True,
1618
+ show_pos=True,
1619
+ ) as pbar:
1620
+ for record_type, record in importer.import_all(
1621
+ people_limit=limit if people else 0,
1622
+ orgs_limit=limit if orgs else 0,
1623
+ import_people=people,
1624
+ import_orgs=orgs,
1625
+ require_enwiki=require_enwiki,
1626
+ skip_people_ids=existing_people_ids if skip_updates else None,
1627
+ skip_org_ids=existing_org_ids if skip_updates else None,
1628
+ start_index=start_index,
1629
+ progress_callback=combined_progress_callback,
1630
+ ):
1631
+ records_seen += 1
1632
+ pbar.update(1)
1633
+
1634
+ if record_type == "person":
1635
+ people_records.append(record)
1636
+ if len(people_records) >= batch_size:
1637
+ flush_people_batch()
1638
+ persist_new_labels()
1639
+ save_progress()
1640
+ else: # org
1641
+ org_records.append(record)
1642
+ if len(org_records) >= batch_size:
1643
+ flush_org_batch()
1644
+ persist_new_labels()
1645
+ save_progress()
1646
+ else:
1647
+ # No limit - show counter updates
1648
+ for record_type, record in importer.import_all(
1649
+ people_limit=None,
1650
+ orgs_limit=None,
1651
+ import_people=people,
1652
+ import_orgs=orgs,
1653
+ require_enwiki=require_enwiki,
1654
+ skip_people_ids=existing_people_ids if skip_updates else None,
1655
+ skip_org_ids=existing_org_ids if skip_updates else None,
1656
+ start_index=start_index,
1657
+ progress_callback=combined_progress_callback,
1658
+ ):
1659
+ records_seen += 1
1660
+ # Show first record immediately as proof of life
1661
+ if records_seen == 1:
1662
+ click.echo(f" First record found: {record.name}", err=True)
1663
+ sys.stderr.flush()
1664
+
1665
+ if record_type == "person":
1666
+ people_records.append(record)
1667
+ if len(people_records) >= batch_size:
1668
+ flush_people_batch()
1669
+ persist_new_labels()
1670
+ save_progress()
1671
+ click.echo(f"\r Progress: {people_count:,} people, {orgs_count:,} orgs...", nl=False, err=True)
1672
+ sys.stderr.flush()
1673
+ else: # org
1674
+ org_records.append(record)
1675
+ if len(org_records) >= batch_size:
1676
+ flush_org_batch()
1677
+ persist_new_labels()
1678
+ save_progress()
1679
+ click.echo(f"\r Progress: {people_count:,} people, {orgs_count:,} orgs...", nl=False, err=True)
1680
+ sys.stderr.flush()
1681
+
1682
+ click.echo("", err=True) # Newline after counter
1683
+
1684
+ # Final batches
1685
+ flush_people_batch()
1686
+ flush_org_batch()
1687
+ persist_new_labels()
1688
+ save_progress()
1689
+
1690
+ finally:
1691
+ # Ensure we save progress even on interrupt
1692
+ save_progress()
1693
+
1694
+ click.echo(f"Import complete: {people_count:,} people, {orgs_count:,} orgs", err=True)
1695
+
1696
+ # Keep references for final label resolution
1697
+ database = person_database
1698
+ if org_database:
1699
+ org_database.close()
1700
+
1701
+ # Final label resolution pass for any remaining unresolved QIDs
1702
+ click.echo("\n=== Final QID Label Resolution ===", err=True)
1703
+
1704
+ # Get the full label cache (includes labels from DB + new ones from import)
1705
+ all_labels = importer.get_label_cache()
1706
+ click.echo(f" Total labels in cache: {len(all_labels):,}", err=True)
1707
+
1708
+ # Check for any remaining unresolved QIDs in the database
1709
+ people_unresolved = database.get_unresolved_qids()
1710
+ click.echo(f" Unresolved QIDs in people: {len(people_unresolved):,}", err=True)
1711
+
1712
+ org_unresolved: set[str] = set()
1713
+ if orgs:
1714
+ org_database = get_database(db_path=db_path_obj)
1715
+ org_unresolved = org_database.get_unresolved_qids()
1716
+ click.echo(f" Unresolved QIDs in orgs: {len(org_unresolved):,}", err=True)
1717
+
1718
+ all_unresolved = people_unresolved | org_unresolved
1719
+ need_sparql = all_unresolved - set(all_labels.keys())
1720
+
1721
+ if need_sparql:
1722
+ click.echo(f" Resolving {len(need_sparql):,} remaining QIDs via SPARQL...", err=True)
1723
+ sparql_resolved = importer.resolve_qids_via_sparql(need_sparql)
1724
+ all_labels.update(sparql_resolved)
1725
+ # Persist newly resolved labels
1726
+ if sparql_resolved:
1727
+ database.insert_qid_labels(sparql_resolved)
1728
+ click.echo(f" SPARQL resolved and stored: {len(sparql_resolved):,}", err=True)
1729
+
1730
+ # Update records with any newly resolved labels
1731
+ if all_labels:
1732
+ updates, deletes = database.resolve_qid_labels(all_labels)
1733
+ if updates or deletes:
1734
+ click.echo(f" People: {updates:,} updated, {deletes:,} duplicates deleted", err=True)
1735
+
1736
+ if orgs:
1737
+ org_database = get_database(db_path=db_path_obj)
1738
+ org_updates = org_database.resolve_qid_labels(all_labels)
1739
+ if org_updates:
1740
+ click.echo(f" Updated orgs: {org_updates:,} regions", err=True)
1741
+ org_database.close()
1742
+
1743
+ # Final stats
1744
+ final_label_count = database.get_qid_labels_count()
1745
+ click.echo(f" Total labels in DB: {final_label_count:,}", err=True)
1746
+ database.close()
1747
+
1748
+ click.echo("\nWikidata dump import complete!", err=True)
1749
+
1750
+
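# Editor's sketch: the import loop above resumes via `start_index` and the skip-ID
# sets, and periodically calls `save_progress()`. The real progress format used by
# the importer is not shown in this diff, so the file location and field names
# below are assumptions, included only to illustrate how an interrupted dump
# import can pick up where it left off.
import json
from pathlib import Path

PROGRESS_FILE = Path("~/.cache/corp-extractor/wikidata-import-progress.json").expanduser()

def save_progress_sketch(start_index: int, people_count: int, orgs_count: int) -> None:
    """Persist enough state to resume an interrupted dump import."""
    PROGRESS_FILE.parent.mkdir(parents=True, exist_ok=True)
    PROGRESS_FILE.write_text(json.dumps({
        "start_index": start_index,
        "people_count": people_count,
        "orgs_count": orgs_count,
    }))

def load_progress_sketch() -> dict:
    """Return saved progress, or zeros for a fresh import."""
    if PROGRESS_FILE.exists():
        return json.loads(PROGRESS_FILE.read_text())
    return {"start_index": 0, "people_count": 0, "orgs_count": 0}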
1751
+ @db_cmd.command("search-people")
1752
+ @click.argument("query")
1753
+ @click.option("--db", "db_path", type=click.Path(), help="Database path")
1754
+ @click.option("--top-k", type=int, default=10, help="Number of results")
1755
+ @click.option("-v", "--verbose", is_flag=True, help="Verbose output")
1756
+ def db_search_people(query: str, db_path: Optional[str], top_k: int, verbose: bool):
1757
+ """
1758
+ Search for a person in the database.
1759
+
1760
+ \b
1761
+ Examples:
1762
+ corp-extractor db search-people "Tim Cook"
1763
+ corp-extractor db search-people "Elon Musk" --top-k 5
1764
+ """
1765
+ _configure_logging(verbose)
1766
+
1767
+ from .database.store import get_person_database, DEFAULT_DB_PATH
1768
+ from .database.embeddings import CompanyEmbedder
1769
+
1770
+ # Default database path
1771
+ if db_path is None:
1772
+ db_path_obj = DEFAULT_DB_PATH
1773
+ else:
1774
+ db_path_obj = Path(db_path)
1775
+
1776
+ click.echo(f"Searching for '{query}' in {db_path_obj}...", err=True)
1777
+
1778
+ # Initialize components
1779
+ database = get_person_database(db_path=db_path_obj)
1780
+ embedder = CompanyEmbedder()
1781
+
1782
+ # Embed query and search
1783
+ query_embedding = embedder.embed(query)
1784
+ results = database.search(query_embedding, top_k=top_k, query_text=query)
1785
+
1786
+ if not results:
1787
+ click.echo("No results found.", err=True)
1788
+ return
1789
+
1790
+ click.echo(f"\nFound {len(results)} results:\n")
1791
+ for i, (record, similarity) in enumerate(results, 1):
1792
+ role_str = f" ({record.known_for_role})" if record.known_for_role else ""
1793
+ org_str = f" at {record.known_for_org}" if record.known_for_org else ""
1794
+ country_str = f" [{record.country}]" if record.country else ""
1795
+ click.echo(f" {i}. {record.name}{role_str}{org_str}{country_str}")
1796
+ click.echo(f" Source: wikidata:{record.source_id}, Type: {record.person_type.value}, Score: {similarity:.3f}")
1797
+ click.echo()
1798
+
1799
+ database.close()
1800
+
1801
+
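# Editor's sketch: `database.search` is not shown in this diff. A typical embedding
# search like the one `db search-people` wraps ranks stored vectors by cosine
# similarity against the query embedding; the matrix/row layout here is an
# illustrative assumption, not the package's actual storage format.
import numpy as np

def cosine_top_k(query_vec: np.ndarray, matrix: np.ndarray, top_k: int = 10) -> list[tuple[int, float]]:
    """Return (row_index, similarity) pairs for the top_k most similar rows."""
    q = query_vec / (np.linalg.norm(query_vec) + 1e-12)
    m = matrix / (np.linalg.norm(matrix, axis=1, keepdims=True) + 1e-12)
    sims = m @ q
    idx = np.argsort(-sims)[:top_k]
    return [(int(i), float(sims[i])) for i in idx]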
1802
+ @db_cmd.command("import-companies-house")
1803
+ @click.option("--download", is_flag=True, help="Download bulk data file (free, no API key needed)")
1804
+ @click.option("--force", is_flag=True, help="Force re-download even if cached")
1805
+ @click.option("--file", "file_path", type=click.Path(exists=True), help="Local Companies House CSV/JSON file")
1806
+ @click.option("--search", "search_terms", type=str, help="Comma-separated search terms (requires API key)")
1807
+ @click.option("--db", "db_path", type=click.Path(), help="Database path")
1808
+ @click.option("--limit", type=int, help="Limit number of records")
1809
+ @click.option("--batch-size", type=int, default=50000, help="Batch size for commits (default: 50000)")
1810
+ @click.option("-v", "--verbose", is_flag=True, help="Verbose output")
1811
+ def db_import_companies_house(
1812
+ download: bool,
1813
+ force: bool,
1814
+ file_path: Optional[str],
1815
+ search_terms: Optional[str],
1816
+ db_path: Optional[str],
1817
+ limit: Optional[int],
1818
+ batch_size: int,
1819
+ verbose: bool,
1820
+ ):
1821
+ """
1822
+ Import UK Companies House data into the entity database.
1823
+
1824
+ \b
1825
+ Options:
1826
+ --download Download free bulk data (all UK companies, ~5M records)
1827
+ --file Import from local CSV/JSON file
1828
+ --search Search via API (requires COMPANIES_HOUSE_API_KEY)
1829
+
1830
+ \b
1831
+ Examples:
1832
+ corp-extractor db import-companies-house --download
1833
+ corp-extractor db import-companies-house --download --limit 100000
1834
+ corp-extractor db import-companies-house --file /path/to/companies.csv
1835
+ corp-extractor db import-companies-house --search "bank,insurance"
1836
+ """
1837
+ _configure_logging(verbose)
1838
+
1839
+ from .database import OrganizationDatabase, CompanyEmbedder
1840
+ from .database.importers import CompaniesHouseImporter
1841
+
1842
+ if not file_path and not search_terms and not download:
1843
+ raise click.UsageError("Either --download, --file, or --search is required")
1844
+
1845
+ click.echo("Importing Companies House data...", err=True)
1846
+
1847
+ # Initialize components
1848
+ embedder = CompanyEmbedder()
1849
+ database = OrganizationDatabase(db_path=db_path, embedding_dim=embedder.embedding_dim)
1850
+ importer = CompaniesHouseImporter()
1851
+
1852
+ # Get records
1853
+ if download:
1854
+ # Download bulk data file
1855
+ csv_path = importer.download_bulk_data(force=force)
1856
+ click.echo(f"Using bulk data file: {csv_path}", err=True)
1857
+ record_iter = importer.import_from_file(csv_path, limit=limit)
1858
+ elif file_path:
1859
+ record_iter = importer.import_from_file(file_path, limit=limit)
1860
+ else:
1861
+ terms = [t.strip() for t in search_terms.split(",") if t.strip()]
1862
+ click.echo(f"Searching for: {terms}", err=True)
1863
+ record_iter = importer.import_from_search(
1864
+ search_terms=terms,
1865
+ limit_per_term=limit or 100,
1866
+ total_limit=limit,
1867
+ )
1868
+
1869
+ # Import records in batches
1870
+ records = []
1871
+ count = 0
1872
+
1873
+ for record in record_iter:
1874
+ records.append(record)
1875
+
1876
+ if len(records) >= batch_size:
1877
+ names = [r.name for r in records]
1878
+ embeddings = embedder.embed_batch(names)
1879
+ database.insert_batch(records, embeddings)
1880
+ count += len(records)
1881
+ click.echo(f"Imported {count} records...", err=True)
1882
+ records = []
1883
+
1884
+ # Final batch
1885
+ if records:
1886
+ names = [r.name for r in records]
1887
+ embeddings = embedder.embed_batch(names)
1888
+ database.insert_batch(records, embeddings)
1889
+ count += len(records)
1890
+
1891
+ click.echo(f"\nImported {count} Companies House records successfully.", err=True)
1892
+ database.close()
1893
+
1894
+
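# Editor's sketch: `CompaniesHouseImporter.import_from_file` presumably streams the
# bulk CSV rather than loading it into memory, which is what makes ~5M records and
# the batched inserts above practical. The bulk file's column names are not shown
# in this diff, so the header names used here are assumptions.
import csv
from collections.abc import Iterator

def stream_company_names(csv_path: str, limit: int | None = None) -> Iterator[tuple[str, str]]:
    """Yield (company_number, company_name) pairs from a Companies House-style CSV."""
    with open(csv_path, newline="", encoding="utf-8") as f:
        for i, row in enumerate(csv.DictReader(f)):
            if limit is not None and i >= limit:
                break
            yield row.get("CompanyNumber", ""), row.get("CompanyName", "")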
1895
+ @db_cmd.command("status")
1896
+ @click.option("--db", "db_path", type=click.Path(), help="Database path")
1897
+ def db_status(db_path: Optional[str]):
1898
+ """
1899
+ Show database status and statistics.
1900
+
1901
+ \b
1902
+ Examples:
1903
+ corp-extractor db status
1904
+ corp-extractor db status --db /path/to/entities.db
1905
+ """
1906
+ from .database import OrganizationDatabase
1907
+
1908
+ try:
1909
+ database = OrganizationDatabase(db_path=db_path)
1910
+ stats = database.get_stats()
1911
+
1912
+ click.echo("\nEntity Database Status")
1913
+ click.echo("=" * 40)
1914
+ click.echo(f"Total records: {stats.total_records:,}")
1915
+ click.echo(f"Embedding dimension: {stats.embedding_dimension}")
1916
+ click.echo(f"Database size: {stats.database_size_bytes / 1024 / 1024:.2f} MB")
1917
+
1918
+ # Check for missing embeddings
1919
+ missing_embeddings = database.get_missing_embedding_count()
1920
+ if missing_embeddings > 0:
1921
+ click.echo(f"\n⚠️ Missing embeddings: {missing_embeddings:,}")
1922
+ click.echo(" Run 'corp-extractor db repair-embeddings' to fix")
1923
+
1924
+ if stats.by_source:
1925
+ click.echo("\nRecords by source:")
1926
+ for source, count in stats.by_source.items():
1927
+ click.echo(f" {source}: {count:,}")
1928
+
1929
+ # Show canonicalization stats
1930
+ canon_stats = database.get_canon_stats()
1931
+ if canon_stats["canonicalized_records"] > 0:
1932
+ click.echo("\nCanonicalization:")
1933
+ click.echo(f" Canonicalized: {canon_stats['canonicalized_records']:,} / {canon_stats['total_records']:,}")
1934
+ click.echo(f" Canonical groups: {canon_stats['canonical_groups']:,}")
1935
+ click.echo(f" Multi-record groups: {canon_stats['multi_record_groups']:,}")
1936
+ click.echo(f" Records in multi-groups: {canon_stats['records_in_multi_groups']:,}")
1937
+ else:
1938
+ click.echo("\nCanonicalization: Not run yet")
1939
+ click.echo(" Run 'corp-extractor db canonicalize' to link equivalent records")
1940
+
1941
+ database.close()
1942
+
1943
+ except Exception as e:
1944
+ raise click.ClickException(f"Failed to read database: {e}")
1945
+
1946
+
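# Editor's sketch: the "Records by source" breakdown printed by `db status` is a
# simple GROUP BY over the organizations table. The table and column names are
# inferred from the migrate command's description ('organizations', 'source');
# treat them as assumptions about the schema rather than documented API.
import sqlite3
from contextlib import closing

def records_by_source(db_path: str) -> dict[str, int]:
    """Count organization records per source (gleif, sec_edgar, ...)."""
    with closing(sqlite3.connect(db_path)) as conn:
        rows = conn.execute(
            "SELECT source, COUNT(*) FROM organizations GROUP BY source ORDER BY COUNT(*) DESC"
        ).fetchall()
    return {source: count for source, count in rows}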
1947
+ @db_cmd.command("canonicalize")
1948
+ @click.option("--db", "db_path", type=click.Path(), help="Database path")
1949
+ @click.option("--batch-size", type=int, default=10000, help="Batch size for updates (default: 10000)")
1950
+ @click.option("-v", "--verbose", is_flag=True, help="Verbose output")
1951
+ def db_canonicalize(db_path: Optional[str], batch_size: int, verbose: bool):
1952
+ """
1953
+ Canonicalize organizations by linking equivalent records across sources.
1954
+
1955
+ Records are considered equivalent if they share:
1956
+ - Same LEI (globally unique legal entity identifier)
1957
+ - Same ticker symbol
1958
+ - Same CIK (SEC identifier)
1959
+ - Same normalized name (after lowercasing, removing dots)
1960
+ - Same name with suffix expansion (Ltd -> Limited, etc.)
1961
+
1962
+ For each group, the highest-priority source becomes canonical:
1963
+ gleif > sec_edgar > companies_house > wikipedia
1964
+
1965
+ Canonicalization enables better search re-ranking by boosting results
1966
+ that have records from multiple authoritative sources.
1967
+
1968
+ \b
1969
+ Examples:
1970
+ corp-extractor db canonicalize
1971
+ corp-extractor db canonicalize -v
1972
+ corp-extractor db canonicalize --db /path/to/entities.db
1973
+ """
1974
+ _configure_logging(verbose)
1975
+
1976
+ from .database import OrganizationDatabase
1977
+ from .database.store import get_person_database
1978
+
1979
+ try:
1980
+ # Canonicalize organizations
1981
+ database = OrganizationDatabase(db_path=db_path)
1982
+ click.echo("Running organization canonicalization...", err=True)
1983
+
1984
+ result = database.canonicalize(batch_size=batch_size)
1985
+
1986
+ click.echo("\nOrganization Canonicalization Results")
1987
+ click.echo("=" * 40)
1988
+ click.echo(f"Total records processed: {result['total_records']:,}")
1989
+ click.echo(f"Equivalence groups found: {result['groups_found']:,}")
1990
+ click.echo(f"Multi-record groups: {result['multi_record_groups']:,}")
1991
+ click.echo(f"Records updated: {result['records_updated']:,}")
1992
+
1993
+ database.close()
1994
+
1995
+ # Canonicalize people
1996
+ db_path_obj = Path(db_path) if db_path else None
1997
+ person_db = get_person_database(db_path=db_path_obj)
1998
+ click.echo("\nRunning people canonicalization...", err=True)
1999
+
2000
+ people_result = person_db.canonicalize(batch_size=batch_size)
2001
+
2002
+ click.echo("\nPeople Canonicalization Results")
2003
+ click.echo("=" * 40)
2004
+ click.echo(f"Total records processed: {people_result['total_records']:,}")
2005
+ click.echo(f"Matched by organization: {people_result['matched_by_org']:,}")
2006
+ click.echo(f"Matched by date overlap: {people_result['matched_by_date']:,}")
2007
+ click.echo(f"Canonical groups: {people_result['canonical_groups']:,}")
2008
+ click.echo(f"Records in multi-record groups: {people_result['records_in_groups']:,}")
2009
+
2010
+ person_db.close()
2011
+
2012
+ except Exception as e:
2013
+ raise click.ClickException(f"Canonicalization failed: {e}")
2014
+
2015
+
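# Editor's sketch of the grouping rules described in the canonicalize docstring
# above: records sharing an LEI, ticker, CIK, or normalized name fall into one
# equivalence group, and the highest-priority source in the group becomes
# canonical. The record shape (plain dicts) is an assumption for illustration,
# not the package's data model.
SOURCE_PRIORITY = {"gleif": 0, "sec_edgar": 1, "companies_house": 2, "wikipedia": 3}
SUFFIXES = {"ltd": "limited", "corp": "corporation", "inc": "incorporated", "co": "company"}

def normalize_name(name: str) -> str:
    """Lowercase, drop dots, and expand common legal suffixes (Ltd -> Limited)."""
    words = name.lower().replace(".", "").split()
    return " ".join(SUFFIXES.get(w, w) for w in words)

def equivalence_keys(record: dict) -> list[str]:
    """Keys under which a record can match another record."""
    keys = [f"{field}:{record[field]}" for field in ("lei", "ticker", "cik") if record.get(field)]
    keys.append(f"name:{normalize_name(record['name'])}")
    return keys

def pick_canonical(group: list[dict]) -> dict:
    """Highest-priority source wins (gleif > sec_edgar > companies_house > wikipedia)."""
    return min(group, key=lambda r: SOURCE_PRIORITY.get(r["source"], 99))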
2016
+ @db_cmd.command("search")
2017
+ @click.argument("query")
2018
+ @click.option("--db", "db_path", type=click.Path(), help="Database path")
2019
+ @click.option("--top-k", type=int, default=10, help="Number of results")
2020
+ @click.option("--source", type=click.Choice(["gleif", "sec_edgar", "companies_house", "wikipedia"]), help="Filter by source")
2021
+ @click.option("-v", "--verbose", is_flag=True, help="Verbose output")
2022
+ def db_search(query: str, db_path: Optional[str], top_k: int, source: Optional[str], verbose: bool):
2023
+ """
2024
+ Search for an organization in the database.
2025
+
2026
+ \b
2027
+ Examples:
2028
+ corp-extractor db search "Apple Inc"
2029
+ corp-extractor db search "Microsoft" --source sec_edgar
2030
+ """
2031
+ _configure_logging(verbose)
2032
+
2033
+ from .database import OrganizationDatabase, CompanyEmbedder
2034
+
2035
+ embedder = CompanyEmbedder()
2036
+ database = OrganizationDatabase(db_path=db_path)
2037
+
2038
+ click.echo(f"Searching for: {query}", err=True)
2039
+
2040
+ # Embed query
2041
+ query_embedding = embedder.embed(query)
2042
+
2043
+ # Search
2044
+ results = database.search(query_embedding, top_k=top_k, source_filter=source)
2045
+
2046
+ if not results:
2047
+ click.echo("No results found.")
2048
+ return
2049
+
2050
+ click.echo(f"\nTop {len(results)} matches:")
2051
+ click.echo("-" * 60)
2052
+
2053
+ for i, (record, similarity) in enumerate(results, 1):
2054
+ click.echo(f"{i}. {record.legal_name}")
2055
+ click.echo(f" Source: {record.source} | ID: {record.source_id}")
2056
+ click.echo(f" Canonical ID: {record.canonical_id}")
2057
+ click.echo(f" Similarity: {similarity:.4f}")
2058
+ if verbose and record.record:
2059
+ if record.record.get("ticker"):
2060
+ click.echo(f" Ticker: {record.record['ticker']}")
2061
+ if record.record.get("jurisdiction"):
2062
+ click.echo(f" Jurisdiction: {record.record['jurisdiction']}")
2063
+ click.echo()
2064
+
2065
+ database.close()
2066
+
2067
+
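# Editor's sketch: the canonicalize docstring notes that search results can be
# re-ranked by boosting entries whose canonical group spans several authoritative
# sources. How `database.search` actually re-ranks is not shown in this diff; the
# boost formula below is purely illustrative.
def rerank(results: list[tuple[dict, float]],
           sources_per_canonical_id: dict[str, int],
           boost: float = 0.05) -> list[tuple[dict, float]]:
    """Add a small bonus per extra source backing the same canonical entity."""
    def boosted(item: tuple[dict, float]) -> float:
        record, similarity = item
        extra_sources = max(sources_per_canonical_id.get(record["canonical_id"], 1) - 1, 0)
        return similarity + boost * extra_sources
    return sorted(results, key=boosted, reverse=True)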
2068
+ @db_cmd.command("download")
2069
+ @click.option("--repo", type=str, default="Corp-o-Rate-Community/entity-references", help="HuggingFace repo ID")
2070
+ @click.option("--db", "db_path", type=click.Path(), help="Output path for database")
2071
+ @click.option("--full", is_flag=True, help="Download full version (larger, includes record metadata)")
2072
+ @click.option("--force", is_flag=True, help="Force re-download")
2073
+ @click.option("-v", "--verbose", is_flag=True, help="Verbose output")
2074
+ def db_download(repo: str, db_path: Optional[str], full: bool, force: bool, verbose: bool):
2075
+ """
2076
+ Download entity database from HuggingFace Hub.
2077
+
2078
+ By default downloads the lite version (smaller, without record metadata).
2079
+ Use --full for the complete database with all source record data.
2080
+
2081
+ \b
2082
+ Examples:
2083
+ corp-extractor db download
2084
+ corp-extractor db download --full
2085
+ corp-extractor db download --repo my-org/my-entity-db
2086
+ """
2087
+ _configure_logging(verbose)
2088
+ from .database.hub import download_database
2089
+
2090
+ filename = "entities.db" if full else "entities-lite.db"
2091
+ click.echo(f"Downloading {'full ' if full else 'lite '}database from {repo}...", err=True)
2092
+
2093
+ try:
2094
+ path = download_database(
2095
+ repo_id=repo,
2096
+ filename=filename,
2097
+ force_download=force,
2098
+ )
2099
+ click.echo(f"Database downloaded to: {path}")
2100
+ except Exception as e:
2101
+ raise click.ClickException(f"Download failed: {e}")
2102
+
2103
+
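# Editor's sketch: `download_database` is not shown in this diff, but a download
# along these lines can be done with huggingface_hub directly. The repo ID and
# filenames mirror the defaults printed by the command above; the repo_type is an
# assumption about how the repo is hosted.
from huggingface_hub import hf_hub_download

def download_entities_db(full: bool = False, force: bool = False) -> str:
    """Fetch entities.db or entities-lite.db from the default reference repo."""
    return hf_hub_download(
        repo_id="Corp-o-Rate-Community/entity-references",
        filename="entities.db" if full else "entities-lite.db",
        repo_type="dataset",  # assumption; drop if the repo is a model repo
        force_download=force,
    )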
2104
+ @db_cmd.command("upload")
2105
+ @click.argument("db_path", type=click.Path(exists=True), required=False)
2106
+ @click.option("--repo", type=str, default="Corp-o-Rate-Community/entity-references", help="HuggingFace repo ID")
2107
+ @click.option("--message", type=str, default="Update entity database", help="Commit message")
2108
+ @click.option("--no-lite", is_flag=True, help="Skip creating lite version (without record data)")
2109
+ @click.option("-v", "--verbose", is_flag=True, help="Verbose output")
2110
+ def db_upload(db_path: Optional[str], repo: str, message: str, no_lite: bool, verbose: bool):
2111
+ """
2112
+ Upload entity database to HuggingFace Hub.
2113
+
2114
+ First VACUUMs the database, then creates and uploads:
2115
+ - entities.db (full database)
2116
+ - entities-lite.db (without record data, smaller)
2117
+
2118
+ If no path is provided, uploads from the default cache location.
2119
+ Requires HF_TOKEN environment variable to be set.
2120
+
2121
+ \b
2122
+ Examples:
2123
+ corp-extractor db upload
2124
+ corp-extractor db upload /path/to/entities.db
2125
+ corp-extractor db upload --no-lite
2126
+ corp-extractor db upload --repo my-org/my-entity-db
2127
+ """
2128
+ _configure_logging(verbose)
2129
+ from .database.hub import upload_database_with_variants, DEFAULT_CACHE_DIR, DEFAULT_DB_FULL_FILENAME
2130
+
2131
+ # Use default cache location if no path provided
2132
+ if db_path is None:
2133
+ db_path = str(DEFAULT_CACHE_DIR / DEFAULT_DB_FULL_FILENAME)
2134
+ if not Path(db_path).exists():
2135
+ raise click.ClickException(
2136
+ f"Database not found at default location: {db_path}\n"
2137
+ "Build the database first with import commands, or specify a path."
2138
+ )
2139
+
2140
+ click.echo(f"Uploading {db_path} to {repo}...", err=True)
2141
+ click.echo(" - Running VACUUM to optimize database", err=True)
2142
+ if not no_lite:
2143
+ click.echo(" - Creating lite version (without record data)", err=True)
2144
+
2145
+ try:
2146
+ results = upload_database_with_variants(
2147
+ db_path=db_path,
2148
+ repo_id=repo,
2149
+ commit_message=message,
2150
+ include_lite=not no_lite,
2151
+ )
2152
+ click.echo(f"\nUploaded {len(results)} file(s) successfully:")
2153
+         for filename in results:
2154
+ click.echo(f" - {filename}")
2155
+ except Exception as e:
2156
+ raise click.ClickException(f"Upload failed: {e}")
2157
+
2158
+
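# Editor's sketch of the two steps the upload command describes: VACUUM the SQLite
# file, then push it to the Hub. `upload_database_with_variants` itself is not
# shown in this diff; HfApi.upload_file is standard huggingface_hub API and picks
# up HF_TOKEN from the environment.
import sqlite3
from huggingface_hub import HfApi

def vacuum_and_upload(db_path: str, repo_id: str, commit_message: str) -> None:
    """Compact the database in place, then upload it as entities.db."""
    conn = sqlite3.connect(db_path)
    try:
        conn.execute("VACUUM")
    finally:
        conn.close()
    HfApi().upload_file(
        path_or_fileobj=db_path,
        path_in_repo="entities.db",
        repo_id=repo_id,
        commit_message=commit_message,
    )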
2159
+ @db_cmd.command("create-lite")
2160
+ @click.argument("db_path", type=click.Path(exists=True))
2161
+ @click.option("-o", "--output", type=click.Path(), help="Output path (default: adds -lite suffix)")
2162
+ @click.option("-v", "--verbose", is_flag=True, help="Verbose output")
2163
+ def db_create_lite(db_path: str, output: Optional[str], verbose: bool):
2164
+ """
2165
+ Create a lite version of the database without record data.
2166
+
2167
+ The lite version strips the `record` column (full source data),
2168
+ keeping only core fields and embeddings. This significantly
2169
+ reduces file size while maintaining search functionality.
2170
+
2171
+ \b
2172
+ Examples:
2173
+ corp-extractor db create-lite entities.db
2174
+ corp-extractor db create-lite entities.db -o entities-lite.db
2175
+ """
2176
+ _configure_logging(verbose)
2177
+ from .database.hub import create_lite_database
2178
+
2179
+ click.echo(f"Creating lite database from {db_path}...", err=True)
2180
+
2181
+ try:
2182
+ lite_path = create_lite_database(db_path, output)
2183
+ click.echo(f"Lite database created: {lite_path}")
2184
+ except Exception as e:
2185
+ raise click.ClickException(f"Failed to create lite database: {e}")
2186
+
2187
+
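# Editor's sketch of what create-lite describes: copy the database, drop the heavy
# `record` column, and VACUUM to reclaim space. DROP COLUMN needs SQLite >= 3.35;
# the real `create_lite_database` may work differently (e.g. by copying only the
# columns it keeps).
import shutil
import sqlite3

def create_lite_sketch(db_path: str, output_path: str) -> str:
    shutil.copyfile(db_path, output_path)
    conn = sqlite3.connect(output_path)
    conn.isolation_level = None  # autocommit, so VACUUM can run outside a transaction
    try:
        conn.execute("ALTER TABLE organizations DROP COLUMN record")
        conn.execute("VACUUM")
    finally:
        conn.close()
    return output_path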
2188
+ @db_cmd.command("repair-embeddings")
2189
+ @click.option("--db", "db_path", type=click.Path(), help="Database path")
2190
+ @click.option("--batch-size", type=int, default=1000, help="Batch size for embedding generation (default: 1000)")
2191
+ @click.option("--source", type=str, help="Only repair specific source (gleif, sec_edgar, etc.)")
2192
+ @click.option("-v", "--verbose", is_flag=True, help="Verbose output")
2193
+ def db_repair_embeddings(db_path: Optional[str], batch_size: int, source: Optional[str], verbose: bool):
2194
+ """
2195
+ Generate missing embeddings for organizations in the database.
2196
+
2197
+ This repairs databases where organizations were imported without embeddings
2198
+ being properly stored in the organization_embeddings table.
2199
+
2200
+ \b
2201
+ Examples:
2202
+ corp-extractor db repair-embeddings
2203
+ corp-extractor db repair-embeddings --source wikipedia
2204
+ corp-extractor db repair-embeddings --batch-size 500
2205
+ """
2206
+ _configure_logging(verbose)
2207
+
2208
+ from .database import OrganizationDatabase, CompanyEmbedder
2209
+
2210
+ database = OrganizationDatabase(db_path=db_path)
2211
+ embedder = CompanyEmbedder()
2212
+
2213
+ # Check how many need repair
2214
+ missing_count = database.get_missing_embedding_count()
2215
+ if missing_count == 0:
2216
+ click.echo("All organizations have embeddings. Nothing to repair.")
2217
+ database.close()
2218
+ return
2219
+
2220
+ click.echo(f"Found {missing_count:,} organizations without embeddings.", err=True)
2221
+ click.echo("Generating embeddings...", err=True)
2222
+
2223
+ # Process in batches
2224
+ org_ids = []
2225
+ names = []
2226
+ count = 0
2227
+
2228
+ for org_id, name in database.get_organizations_without_embeddings(batch_size=batch_size, source=source):
2229
+ org_ids.append(org_id)
2230
+ names.append(name)
2231
+
2232
+ if len(names) >= batch_size:
2233
+ # Generate embeddings
2234
+ embeddings = embedder.embed_batch(names)
2235
+ database.insert_embeddings_batch(org_ids, embeddings)
2236
+ count += len(names)
2237
+ click.echo(f"Repaired {count:,} / {missing_count:,} embeddings...", err=True)
2238
+ org_ids = []
2239
+ names = []
2240
+
2241
+ # Final batch
2242
+ if names:
2243
+ embeddings = embedder.embed_batch(names)
2244
+ database.insert_embeddings_batch(org_ids, embeddings)
2245
+ count += len(names)
2246
+
2247
+ click.echo(f"\nRepaired {count:,} embeddings successfully.", err=True)
2248
+ database.close()
2249
+
2250
+
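# Editor's sketch: finding organizations with no row in organization_embeddings
# (the repair target above) is a LEFT JOIN with an IS NULL filter. Table names
# come from the migrate command's description; the id/name column names are
# assumptions about the schema.
import sqlite3
from contextlib import closing

def organizations_without_embeddings(db_path: str, limit: int = 1000) -> list[tuple[int, str]]:
    with closing(sqlite3.connect(db_path)) as conn:
        return conn.execute(
            """
            SELECT o.id, o.name
            FROM organizations AS o
            LEFT JOIN organization_embeddings AS e ON e.organization_id = o.id
            WHERE e.organization_id IS NULL
            LIMIT ?
            """,
            (limit,),
        ).fetchall()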
2251
+ @db_cmd.command("migrate")
2252
+ @click.argument("db_path", type=click.Path(exists=True))
2253
+ @click.option("--rename-file", is_flag=True, help="Also rename companies.db to entities.db")
2254
+ @click.option("--yes", is_flag=True, help="Skip confirmation prompt")
2255
+ @click.option("-v", "--verbose", is_flag=True, help="Verbose output")
2256
+ def db_migrate(db_path: str, rename_file: bool, yes: bool, verbose: bool):
2257
+ """
2258
+ Migrate database from legacy schema to new schema.
2259
+
2260
+ Migrates from old naming (companies/company_embeddings tables)
2261
+ to new naming (organizations/organization_embeddings tables).
2262
+
2263
+ \b
2264
+ What this does:
2265
+ - Renames 'companies' table to 'organizations'
2266
+ - Renames 'company_embeddings' table to 'organization_embeddings'
2267
+ - Updates all indexes
2268
+
2269
+ \b
2270
+ Examples:
2271
+ corp-extractor db migrate companies.db
2272
+ corp-extractor db migrate companies.db --rename-file
2273
+ corp-extractor db migrate ~/.cache/corp-extractor/companies.db --yes
2274
+ """
2275
+ _configure_logging(verbose)
2276
+
2277
+ from pathlib import Path
2278
+ from .database import OrganizationDatabase
2279
+
2280
+ db_path_obj = Path(db_path)
2281
+
2282
+ if not yes:
2283
+ click.confirm(
2284
+ f"This will migrate {db_path} from legacy schema (companies) to new schema (organizations).\n"
2285
+ "This operation cannot be undone. Continue?",
2286
+ abort=True
2287
+ )
2288
+
2289
+ try:
2290
+ database = OrganizationDatabase(db_path=db_path)
2291
+ migrations = database.migrate_from_legacy_schema()
2292
+ database.close()
2293
+
2294
+ if migrations:
2295
+ click.echo("Migration completed:")
2296
+ for table, action in migrations.items():
2297
+ click.echo(f" {table}: {action}")
2298
+ else:
2299
+ click.echo("No migration needed. Database already uses new schema.")
2300
+
2301
+ # Optionally rename the file
2302
+ if rename_file and db_path_obj.name.startswith("companies"):
2303
+ new_name = db_path_obj.name.replace("companies", "entities")
2304
+ new_path = db_path_obj.parent / new_name
2305
+ db_path_obj.rename(new_path)
2306
+ click.echo(f"Renamed file: {db_path} -> {new_path}")
2307
+
2308
+ except Exception as e:
2309
+ raise click.ClickException(f"Migration failed: {e}")
2310
+
2311
+
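# Editor's sketch of the table renames the migrate command describes. The real
# `migrate_from_legacy_schema` also updates indexes, whose names are not shown in
# this diff, so only the table renames are sketched here.
import sqlite3

def migrate_legacy_tables(db_path: str) -> None:
    conn = sqlite3.connect(db_path)
    try:
        tables = {row[0] for row in conn.execute(
            "SELECT name FROM sqlite_master WHERE type = 'table'")}
        if "companies" in tables and "organizations" not in tables:
            conn.execute("ALTER TABLE companies RENAME TO organizations")
        if "company_embeddings" in tables and "organization_embeddings" not in tables:
            conn.execute("ALTER TABLE company_embeddings RENAME TO organization_embeddings")
        conn.commit()
    finally:
        conn.close()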
2312
+ # =============================================================================
2313
+ # Document commands
2314
+ # =============================================================================
2315
+
2316
+ @main.group("document")
2317
+ def document_cmd():
2318
+ """
2319
+ Process documents with chunking, deduplication, and citations.
2320
+
2321
+ \b
2322
+ Commands:
2323
+ process Process a document through the full pipeline
2324
+ chunk Preview chunking without extraction
2325
+
2326
+ \b
2327
+ Examples:
2328
+ corp-extractor document process article.txt
2329
+ corp-extractor document process report.pdf --no-summary
2330
+ corp-extractor document chunk article.txt --max-tokens 500
2331
+ """
2332
+ pass
2333
+
2334
+
2335
+ @document_cmd.command("process")
2336
+ @click.argument("input_source") # Can be file path or URL
2337
+ @click.option("--title", type=str, help="Document title (for citations)")
2338
+ @click.option("--author", "authors", type=str, multiple=True, help="Document author(s)")
2339
+ @click.option("--year", type=int, help="Publication year")
2340
+ @click.option("--max-tokens", type=int, default=1000, help="Target tokens per chunk (default: 1000)")
2341
+ @click.option("--overlap", type=int, default=100, help="Token overlap between chunks (default: 100)")
2342
+ @click.option("--no-summary", is_flag=True, help="Skip document summarization")
2343
+ @click.option("--no-dedup", is_flag=True, help="Skip deduplication across chunks")
2344
+ @click.option("--use-ocr", is_flag=True, help="Force OCR for PDF parsing")
2345
+ @click.option(
2346
+ "--stages",
2347
+ type=str,
2348
+ default="1-6",
2349
+ help="Pipeline stages to run (e.g., '1-3' or '1,2,5')"
2350
+ )
2351
+ @click.option(
2352
+ "-o", "--output",
2353
+ type=click.Choice(["table", "json", "triples"], case_sensitive=False),
2354
+ default="table",
2355
+ help="Output format (default: table)"
2356
+ )
2357
+ @click.option("-v", "--verbose", is_flag=True, help="Show verbose output")
2358
+ @click.option("-q", "--quiet", is_flag=True, help="Suppress progress messages")
2359
+ def document_process(
2360
+ input_source: str,
2361
+ title: Optional[str],
2362
+ authors: tuple[str, ...],
2363
+ year: Optional[int],
2364
+ max_tokens: int,
2365
+ overlap: int,
2366
+ no_summary: bool,
2367
+ no_dedup: bool,
2368
+ use_ocr: bool,
2369
+ stages: str,
2370
+ output: str,
2371
+ verbose: bool,
2372
+ quiet: bool,
2373
+ ):
2374
+ """
2375
+ Process a document or URL through the extraction pipeline with chunking.
2376
+
2377
+     Supports text files and URLs (both web pages and PDFs).
2378
+
2379
+ \b
2380
+ Examples:
2381
+ corp-extractor document process article.txt
2382
+ corp-extractor document process report.txt --title "Annual Report" --year 2024
2383
+ corp-extractor document process https://example.com/article
2384
+ corp-extractor document process https://example.com/report.pdf --use-ocr
2385
+ corp-extractor document process doc.txt --no-summary --stages 1-3
2386
+ corp-extractor document process doc.txt -o json
2387
+ """
2388
+ _configure_logging(verbose)
2389
+
2390
+ # Import document pipeline
2391
+ from .document import DocumentPipeline, DocumentPipelineConfig, Document
2392
+ from .models.document import ChunkingConfig
2393
+ from .pipeline import PipelineConfig
2394
+ _load_all_plugins()
2395
+
2396
+ # Parse stages
2397
+ enabled_stages = _parse_stages(stages)
2398
+
2399
+ # Build configs
2400
+ chunking_config = ChunkingConfig(
2401
+ target_tokens=max_tokens,
2402
+ max_tokens=max_tokens * 2,
2403
+ overlap_tokens=overlap,
2404
+ )
2405
+
2406
+ pipeline_config = PipelineConfig(
2407
+ enabled_stages=enabled_stages,
2408
+ )
2409
+
2410
+ doc_config = DocumentPipelineConfig(
2411
+ chunking=chunking_config,
2412
+ generate_summary=not no_summary,
2413
+ deduplicate_across_chunks=not no_dedup,
2414
+ pipeline_config=pipeline_config,
2415
+ )
2416
+
2417
+ # Create pipeline
2418
+ pipeline = DocumentPipeline(doc_config)
2419
+
2420
+ # Detect if input is a URL
2421
+ is_url = input_source.startswith(("http://", "https://"))
2422
+
2423
+ # Process
2424
+ try:
2425
+ if is_url:
2426
+ # Process URL
2427
+ from .document import URLLoaderConfig
2428
+
2429
+ if not quiet:
2430
+ click.echo(f"Fetching URL: {input_source}", err=True)
2431
+
2432
+ loader_config = URLLoaderConfig(use_ocr=use_ocr)
2433
+ ctx = pipeline.process_url_sync(input_source, loader_config)
2434
+
2435
+ if not quiet:
2436
+ click.echo(f"Processed: {ctx.document.metadata.title or 'Untitled'}", err=True)
2437
+
2438
+ else:
2439
+ # Process file
2440
+ from pathlib import Path
2441
+ import os
2442
+
2443
+ if not os.path.exists(input_source):
2444
+ raise click.ClickException(f"File not found: {input_source}")
2445
+
2446
+ # Read input file
2447
+ with open(input_source, "r", encoding="utf-8") as f:
2448
+ text = f.read()
2449
+
2450
+ if not text.strip():
2451
+ raise click.ClickException("Input file is empty")
2452
+
2453
+ if not quiet:
2454
+ click.echo(f"Processing document: {input_source} ({len(text)} chars)", err=True)
2455
+
2456
+ # Create document with metadata
2457
+ doc_title = title or Path(input_source).stem
2458
+ document = Document.from_text(
2459
+ text=text,
2460
+ title=doc_title,
2461
+ source_type="text",
2462
+ authors=list(authors),
2463
+ year=year,
2464
+ )
2465
+
2466
+ ctx = pipeline.process(document)
2467
+
2468
+ # Output results
2469
+ if output == "json":
2470
+ _print_document_json(ctx)
2471
+ elif output == "triples":
2472
+ _print_document_triples(ctx)
2473
+ else:
2474
+ _print_document_table(ctx, verbose)
2475
+
2476
+ # Report stats
2477
+ if not quiet:
2478
+ click.echo(f"\nChunks: {ctx.chunk_count}", err=True)
2479
+ click.echo(f"Statements: {ctx.statement_count}", err=True)
2480
+ if ctx.duplicates_removed > 0:
2481
+ click.echo(f"Duplicates removed: {ctx.duplicates_removed}", err=True)
2482
+
2483
+ if ctx.processing_errors:
2484
+ click.echo(f"\nErrors: {len(ctx.processing_errors)}", err=True)
2485
+ for error in ctx.processing_errors:
2486
+ click.echo(f" - {error}", err=True)
2487
+
2488
+ except Exception as e:
2489
+ logging.exception("Document processing error:")
2490
+ raise click.ClickException(f"Processing failed: {e}")
2491
+
2492
+
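# Editor's sketch: the --stages option above accepts forms like "1-3" or "1,2,5".
# The real _parse_stages helper is defined elsewhere in cli.py and is not shown in
# this diff; this is just one way such a spec can be expanded.
def parse_stages_sketch(spec: str) -> set[int]:
    """Expand a stage spec such as '1-3' or '1,2,5' into a set of stage numbers."""
    stages: set[int] = set()
    for part in spec.split(","):
        part = part.strip()
        if "-" in part:
            start, end = part.split("-", 1)
            stages.update(range(int(start), int(end) + 1))
        elif part:
            stages.add(int(part))
    return stages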
2493
+ @document_cmd.command("chunk")
2494
+ @click.argument("input_path", type=click.Path(exists=True))
2495
+ @click.option("--max-tokens", type=int, default=1000, help="Target tokens per chunk (default: 1000)")
2496
+ @click.option("--overlap", type=int, default=100, help="Token overlap between chunks (default: 100)")
2497
+ @click.option("-o", "--output", type=click.Choice(["table", "json"]), default="table", help="Output format")
2498
+ @click.option("-v", "--verbose", is_flag=True, help="Show verbose output")
2499
+ def document_chunk(
2500
+ input_path: str,
2501
+ max_tokens: int,
2502
+ overlap: int,
2503
+ output: str,
2504
+ verbose: bool,
2505
+ ):
2506
+ """
2507
+ Preview document chunking without running extraction.
2508
+
2509
+ Shows how a document would be split into chunks for processing.
2510
+
2511
+ \b
2512
+ Examples:
2513
+ corp-extractor document chunk article.txt
2514
+ corp-extractor document chunk article.txt --max-tokens 500
2515
+ corp-extractor document chunk article.txt -o json
2516
+ """
2517
+ _configure_logging(verbose)
2518
+
2519
+ # Read input file
2520
+ with open(input_path, "r", encoding="utf-8") as f:
2521
+ text = f.read()
2522
+
2523
+ if not text.strip():
2524
+ raise click.ClickException("Input file is empty")
2525
+
2526
+ click.echo(f"Chunking document: {input_path} ({len(text)} chars)", err=True)
2527
+
2528
+ from .document import DocumentChunker, Document
2529
+ from .models.document import ChunkingConfig
2530
+
2531
+ config = ChunkingConfig(
2532
+ target_tokens=max_tokens,
2533
+ max_tokens=max_tokens * 2,
2534
+ overlap_tokens=overlap,
2535
+ )
2536
+
2537
+ from pathlib import Path
2538
+ document = Document.from_text(text, title=Path(input_path).stem)
2539
+ chunker = DocumentChunker(config)
2540
+ chunks = chunker.chunk_document(document)
2541
+
2542
+ if output == "json":
2543
+ import json
2544
+ chunk_data = [
2545
+ {
2546
+ "index": c.chunk_index,
2547
+ "tokens": c.token_count,
2548
+ "chars": len(c.text),
2549
+ "pages": c.page_numbers,
2550
+ "overlap": c.overlap_chars,
2551
+ "preview": c.text[:100] + "..." if len(c.text) > 100 else c.text,
2552
+ }
2553
+ for c in chunks
2554
+ ]
2555
+ click.echo(json.dumps({"chunks": chunk_data, "total": len(chunks)}, indent=2))
2556
+ else:
2557
+ click.echo(f"\nCreated {len(chunks)} chunk(s):\n")
2558
+ click.echo("-" * 80)
2559
+
2560
+ for chunk in chunks:
2561
+ click.echo(f"Chunk {chunk.chunk_index + 1}:")
2562
+ click.echo(f" Tokens: {chunk.token_count}")
2563
+ click.echo(f" Characters: {len(chunk.text)}")
2564
+ if chunk.page_numbers:
2565
+ click.echo(f" Pages: {chunk.page_numbers}")
2566
+ if chunk.overlap_chars > 0:
2567
+ click.echo(f" Overlap: {chunk.overlap_chars} chars")
2568
+
2569
+ preview = chunk.text[:200].replace("\n", " ")
2570
+ if len(chunk.text) > 200:
2571
+ preview += "..."
2572
+ click.echo(f" Preview: {preview}")
2573
+ click.echo("-" * 80)
2574
+
2575
+
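# Editor's sketch of the chunking parameters previewed above: split on whitespace
# tokens into roughly target_tokens-sized pieces and carry overlap_tokens of
# context into the next chunk. The real DocumentChunker is sentence- and
# page-aware; this only illustrates what target and overlap control.
def chunk_by_tokens(text: str, target_tokens: int = 1000, overlap_tokens: int = 100) -> list[str]:
    tokens = text.split()
    chunks: list[str] = []
    step = max(target_tokens - overlap_tokens, 1)
    for start in range(0, len(tokens), step):
        window = tokens[start:start + target_tokens]
        if not window:
            break
        chunks.append(" ".join(window))
        if start + target_tokens >= len(tokens):
            break
    return chunks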
2576
+ def _print_document_json(ctx):
2577
+ """Print document context as JSON."""
2578
+ import json
2579
+ click.echo(json.dumps(ctx.as_dict(), indent=2, default=str))
2580
+
2581
+
2582
+ def _print_document_triples(ctx):
2583
+ """Print document statements as triples."""
2584
+ for stmt in ctx.labeled_statements:
2585
+ parts = [stmt.subject_fqn, stmt.statement.predicate, stmt.object_fqn]
2586
+ if stmt.page_number:
2587
+ parts.append(f"p.{stmt.page_number}")
2588
+ click.echo("\t".join(parts))
2589
+
2590
+
2591
+ def _print_document_table(ctx, verbose: bool):
2592
+ """Print document context in table format."""
2593
+ # Show summary if available
2594
+ if ctx.document.summary:
2595
+ click.echo("\nDocument Summary:")
2596
+ click.echo("-" * 40)
2597
+ click.echo(ctx.document.summary)
2598
+ click.echo("-" * 40)
2599
+
2600
+ if not ctx.labeled_statements:
2601
+ click.echo("\nNo statements extracted.")
2602
+ return
2603
+
2604
+ click.echo(f"\nExtracted {len(ctx.labeled_statements)} statement(s):\n")
2605
+ click.echo("-" * 80)
2606
+
2607
+ for i, stmt in enumerate(ctx.labeled_statements, 1):
2608
+ click.echo(f"{i}. {stmt.subject_fqn}")
2609
+ click.echo(f" --[{stmt.statement.predicate}]-->")
2610
+ click.echo(f" {stmt.object_fqn}")
2611
+
2612
+ # Show citation
2613
+ if stmt.citation:
2614
+ click.echo(f" Citation: {stmt.citation}")
2615
+ elif stmt.page_number:
2616
+ click.echo(f" Page: {stmt.page_number}")
2617
+
2618
+ # Show labels
2619
+ for label in stmt.labels:
2620
+ if isinstance(label.label_value, float):
2621
+ click.echo(f" {label.label_type}: {label.label_value:.3f}")
2622
+ else:
2623
+ click.echo(f" {label.label_type}: {label.label_value}")
2624
+
2625
+ # Show taxonomy (top 3)
2626
+ if stmt.taxonomy_results:
2627
+ sorted_taxonomy = sorted(stmt.taxonomy_results, key=lambda t: t.confidence, reverse=True)[:3]
2628
+ taxonomy_strs = [f"{t.category}:{t.label}" for t in sorted_taxonomy]
2629
+ click.echo(f" Topics: {', '.join(taxonomy_strs)}")
2630
+
2631
+ if verbose and stmt.statement.source_text:
2632
+ source = stmt.statement.source_text[:60] + "..." if len(stmt.statement.source_text) > 60 else stmt.statement.source_text
2633
+ click.echo(f" Source: \"{source}\"")
2634
+
2635
+ click.echo("-" * 80)
2636
+
2637
+ # Show timings in verbose mode
2638
+ if verbose and ctx.stage_timings:
2639
+ click.echo("\nStage timings:")
2640
+ for stage, duration in ctx.stage_timings.items():
2641
+ click.echo(f" {stage}: {duration:.3f}s")
2642
+
2643
+
638
2644
  # =============================================================================
639
2645
  # Helper functions
640
2646
  # =============================================================================