PyPI - corp-extractor - Versions diffs - 0.9.0__py3-none-any.whl → 0.9.3__py3-none-any.whl - Mend

corp-extractor 0.9.0-py3-none-any.whl → 0.9.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29)

{corp_extractor-0.9.0.dist-info → corp_extractor-0.9.3.dist-info}/METADATA +40 -9
{corp_extractor-0.9.0.dist-info → corp_extractor-0.9.3.dist-info}/RECORD +29 -26
statement_extractor/cli.py +866 -77
statement_extractor/database/hub.py +35 -127
statement_extractor/database/importers/__init__.py +10 -2
statement_extractor/database/importers/companies_house.py +16 -2
statement_extractor/database/importers/companies_house_officers.py +431 -0
statement_extractor/database/importers/gleif.py +23 -0
statement_extractor/database/importers/sec_edgar.py +17 -0
statement_extractor/database/importers/sec_form4.py +512 -0
statement_extractor/database/importers/wikidata.py +151 -43
statement_extractor/database/importers/wikidata_dump.py +1951 -0
statement_extractor/database/importers/wikidata_people.py +823 -325
statement_extractor/database/models.py +30 -6
statement_extractor/database/store.py +1485 -60
statement_extractor/document/deduplicator.py +10 -12
statement_extractor/extractor.py +1 -1
statement_extractor/models/__init__.py +3 -2
statement_extractor/models/statement.py +15 -17
statement_extractor/models.py +1 -1
statement_extractor/pipeline/context.py +5 -5
statement_extractor/pipeline/orchestrator.py +12 -12
statement_extractor/plugins/base.py +17 -17
statement_extractor/plugins/extractors/gliner2.py +28 -28
statement_extractor/plugins/qualifiers/embedding_company.py +7 -5
statement_extractor/plugins/qualifiers/person.py +11 -1
statement_extractor/plugins/splitters/t5_gemma.py +35 -39
{corp_extractor-0.9.0.dist-info → corp_extractor-0.9.3.dist-info}/WHEEL +0 -0
{corp_extractor-0.9.0.dist-info → corp_extractor-0.9.3.dist-info}/entry_points.txt +0 -0

statement_extractor/cli.py CHANGED

@@ -439,7 +439,7 @@ def _print_pipeline_json(ctx):
     """Print pipeline results as JSON."""
     output = {
         "statement_count": ctx.statement_count,
-        "raw_triples": [t.model_dump() for t in ctx.raw_triples],
+        "split_sentences": [s.model_dump() for s in ctx.split_sentences],
         "statements": [s.model_dump() for s in ctx.statements],
         "labeled_statements": [stmt.as_dict() for stmt in ctx.labeled_statements],
         "timings": ctx.stage_timings,
@@ -472,9 +472,10 @@ def _print_pipeline_triples(ctx):
     elif ctx.statements:
         for stmt in ctx.statements:
             click.echo(f"{stmt.subject.text}\t{stmt.predicate}\t{stmt.object.text}")
-    elif ctx.raw_triples:
-        for triple in ctx.raw_triples:
-            click.echo(f"{triple.subject_text}\t{triple.predicate_text}\t{triple.object_text}")
+    elif ctx.split_sentences:
+        # Stage 1 only output - just show the split sentences (no triples yet)
+        for sentence in ctx.split_sentences:
+            click.echo(sentence.text)
 def _print_pipeline_table(ctx, verbose: bool):
@@ -528,20 +529,16 @@ def _print_pipeline_table(ctx, verbose: bool):
             click.echo("-" * 80)
-    elif ctx.raw_triples:
-        click.echo(f"\nExtracted {len(ctx.raw_triples)} raw triple(s):\n")
+    elif ctx.split_sentences:
+        click.echo(f"\nSplit into {len(ctx.split_sentences)} atomic sentence(s):\n")
         click.echo("-" * 80)
-        for i, triple in enumerate(ctx.raw_triples, 1):
-            click.echo(f"{i}. {triple.subject_text}")
-            click.echo(f"   --[{triple.predicate_text}]-->")
-            click.echo(f"   {triple.object_text}")
+        for i, sentence in enumerate(ctx.split_sentences, 1):
+            text_preview = sentence.text[:100] + "..." if len(sentence.text) > 100 else sentence.text
+            click.echo(f"{i}. {text_preview}")
             if verbose:
-                click.echo(f"   Confidence: {triple.confidence:.2f}")
-                if triple.source_sentence:
-                    source = triple.source_sentence[:60] + "..." if len(triple.source_sentence) > 60 else triple.source_sentence
-                    click.echo(f"   Source: \"{source}\"")
+                click.echo(f"   Confidence: {sentence.confidence:.2f}")
             click.echo("-" * 80)
@@ -666,22 +663,27 @@ def db_cmd():
     Commands:
         import-gleif           Import GLEIF LEI data (~3M records)
         import-sec             Import SEC Edgar bulk data (~100K+ filers)
+        import-sec-officers    Import SEC Form 4 officers/directors
+        import-ch-officers     Import UK Companies House officers (Prod195)
         import-companies-house Import UK Companies House (~5M records)
-        import-wikidata        Import Wikidata organizations
-        import-people          Import Wikidata notable people
+        import-wikidata        Import Wikidata organizations (SPARQL, may timeout)
+        import-people          Import Wikidata notable people (SPARQL, may timeout)
+        import-wikidata-dump   Import from Wikidata JSON dump (recommended)
+        canonicalize           Link equivalent records across sources
         status                 Show database status
         search                 Search for an organization
         search-people          Search for a person
         download               Download database from HuggingFace
-        upload                 Upload database with lite/compressed variants
+        upload                 Upload database with lite variant
         create-lite            Create lite version (no record data)
-        compress               Compress database with gzip
     \b
     Examples:
         corp-extractor db import-sec --download
+        corp-extractor db import-sec-officers --start-year 2023 --limit 10000
         corp-extractor db import-gleif --download --limit 100000
-        corp-extractor db import-people --all --limit 10000
+        corp-extractor db import-wikidata-dump --download --limit 50000
+        corp-extractor db canonicalize
         corp-extractor db status
         corp-extractor db search "Apple Inc"
         corp-extractor db search-people "Tim Cook"
@@ -868,6 +870,213 @@ def db_import_sec(download: bool, file_path: Optional[str], db_path: Optional[st
     database.close()
+@db_cmd.command("import-sec-officers")
+@click.option("--db", "db_path", type=click.Path(), help="Database path")
+@click.option("--start-year", type=int, default=2020, help="Start year (default: 2020)")
+@click.option("--end-year", type=int, help="End year (default: current year)")
+@click.option("--limit", type=int, help="Limit number of records")
+@click.option("--batch-size", type=int, default=1000, help="Batch size for commits (default: 1000)")
+@click.option("--resume", is_flag=True, help="Resume from saved progress")
+@click.option("--skip-existing", is_flag=True, help="Skip records that already exist")
+@click.option("-v", "--verbose", is_flag=True, help="Verbose output")
+def db_import_sec_officers(db_path: Optional[str], start_year: int, end_year: Optional[int], limit: Optional[int], batch_size: int, resume: bool, skip_existing: bool, verbose: bool):
+    """
+    Import SEC Form 4 insider data into the people database.
+    Downloads Form 4 filings from SEC EDGAR and extracts officers, directors,
+    and significant investors (10%+ owners) from each company.
+    Form 4 filings are submitted when insiders buy or sell company stock.
+    They contain the person's name, role (officer/director), and company.
+    Rate limited to 5 requests/second to comply with SEC guidelines.
+    \b
+    Examples:
+        corp-extractor db import-sec-officers --limit 1000
+        corp-extractor db import-sec-officers --start-year 2023
+        corp-extractor db import-sec-officers --resume
+        corp-extractor db import-sec-officers --skip-existing -v
+    """
+    _configure_logging(verbose)
+    from .database.store import get_person_database, get_database, DEFAULT_DB_PATH
+    from .database.embeddings import CompanyEmbedder
+    from .database.importers.sec_form4 import SecForm4Importer
+    # Default database path
+    if db_path is None:
+        db_path_obj = DEFAULT_DB_PATH
+    else:
+        db_path_obj = Path(db_path)
+    click.echo(f"Importing SEC Form 4 officers/directors to {db_path_obj}...", err=True)
+    click.echo(f"Year range: {start_year} - {end_year or 'current'}", err=True)
+    if resume:
+        click.echo("Resuming from saved progress...", err=True)
+    # Initialize components
+    database = get_person_database(db_path=db_path_obj)
+    org_database = get_database(db_path=db_path_obj)
+    embedder = CompanyEmbedder()
+    importer = SecForm4Importer()
+    # Import records in batches
+    records = []
+    count = 0
+    skipped_existing = 0
+    def progress_callback(year: int, quarter: int, filing_idx: int, accession: str, total: int) -> None:
+        if verbose and filing_idx % 100 == 0:
+            click.echo(f"  {year} Q{quarter}: {filing_idx} filings, {total} records", err=True)
+    for record in importer.import_range(
+        start_year=start_year,
+        end_year=end_year,
+        limit=limit,
+        resume=resume,
+        progress_callback=progress_callback,
+    ):
+        # Skip existing records if flag is set
+        if skip_existing:
+            existing = database.get_by_source_id(record.source, record.source_id)
+            if existing is not None:
+                skipped_existing += 1
+                continue
+        # Look up org ID by CIK if available
+        issuer_cik = record.record.get("issuer_cik", "")
+        if issuer_cik:
+            org_id = org_database.get_id_by_source_id("sec_edgar", issuer_cik.zfill(10))
+            if org_id is not None:
+                record.known_for_org_id = org_id
+        records.append(record)
+        if len(records) >= batch_size:
+            embedding_texts = [r.get_embedding_text() for r in records]
+            embeddings = embedder.embed_batch(embedding_texts)
+            database.insert_batch(records, embeddings)
+            count += len(records)
+            click.echo(f"Imported {count} records...", err=True)
+            records = []
+    # Final batch
+    if records:
+        embedding_texts = [r.get_embedding_text() for r in records]
+        embeddings = embedder.embed_batch(embedding_texts)
+        database.insert_batch(records, embeddings)
+        count += len(records)
+    if skip_existing and skipped_existing > 0:
+        click.echo(f"\nImported {count} SEC officers/directors (skipped {skipped_existing} existing).", err=True)
+    else:
+        click.echo(f"\nImported {count} SEC officers/directors successfully.", err=True)
+    org_database.close()
+    database.close()
+@db_cmd.command("import-ch-officers")
+@click.option("--file", "file_path", type=click.Path(exists=True), required=True, help="Path to CH officers zip file (Prod195)")
+@click.option("--db", "db_path", type=click.Path(), help="Database path")
+@click.option("--limit", type=int, help="Limit number of records")
+@click.option("--batch-size", type=int, default=1000, help="Batch size for commits (default: 1000)")
+@click.option("--resume", is_flag=True, help="Resume from saved progress")
+@click.option("--include-resigned", is_flag=True, help="Include resigned officers (default: current only)")
+@click.option("--skip-existing", is_flag=True, help="Skip records that already exist")
+@click.option("-v", "--verbose", is_flag=True, help="Verbose output")
+def db_import_ch_officers(file_path: str, db_path: Optional[str], limit: Optional[int], batch_size: int, resume: bool, include_resigned: bool, skip_existing: bool, verbose: bool):
+    """
+    Import Companies House officers data into the people database.
+    Requires the Prod195 bulk officers zip file from Companies House.
+    Request access via BulkProducts@companieshouse.gov.uk.
+    \b
+    Examples:
+        corp-extractor db import-ch-officers --file officers.zip --limit 10000
+        corp-extractor db import-ch-officers --file officers.zip --resume
+        corp-extractor db import-ch-officers --file officers.zip --include-resigned
+    """
+    _configure_logging(verbose)
+    from .database.store import get_person_database, get_database, DEFAULT_DB_PATH
+    from .database.embeddings import CompanyEmbedder
+    from .database.importers.companies_house_officers import CompaniesHouseOfficersImporter
+    # Default database path
+    if db_path is None:
+        db_path_obj = DEFAULT_DB_PATH
+    else:
+        db_path_obj = Path(db_path)
+    click.echo(f"Importing Companies House officers to {db_path_obj}...", err=True)
+    if resume:
+        click.echo("Resuming from saved progress...", err=True)
+    # Initialize components
+    database = get_person_database(db_path=db_path_obj)
+    org_database = get_database(db_path=db_path_obj)
+    embedder = CompanyEmbedder()
+    importer = CompaniesHouseOfficersImporter()
+    # Import records in batches
+    records = []
+    count = 0
+    skipped_existing = 0
+    def progress_callback(file_idx: int, line_num: int, total: int) -> None:
+        if verbose:
+            click.echo(f"  File {file_idx}: line {line_num}, {total} records", err=True)
+    for record in importer.import_from_zip(
+        file_path,
+        limit=limit,
+        resume=resume,
+        current_only=not include_resigned,
+        progress_callback=progress_callback,
+    ):
+        # Skip existing records if flag is set
+        if skip_existing:
+            existing = database.get_by_source_id(record.source, record.source_id)
+            if existing is not None:
+                skipped_existing += 1
+                continue
+        # Look up org ID by company number if available
+        company_number = record.record.get("company_number", "")
+        if company_number:
+            org_id = org_database.get_id_by_source_id("companies_house", company_number)
+            if org_id is not None:
+                record.known_for_org_id = org_id
+        records.append(record)
+        if len(records) >= batch_size:
+            embedding_texts = [r.get_embedding_text() for r in records]
+            embeddings = embedder.embed_batch(embedding_texts)
+            database.insert_batch(records, embeddings)
+            count += len(records)
+            click.echo(f"Imported {count} records...", err=True)
+            records = []
+    # Final batch
+    if records:
+        embedding_texts = [r.get_embedding_text() for r in records]
+        embeddings = embedder.embed_batch(embedding_texts)
+        database.insert_batch(records, embeddings)
+        count += len(records)
+    if skip_existing and skipped_existing > 0:
+        click.echo(f"\nImported {count} CH officers (skipped {skipped_existing} existing).", err=True)
+    else:
+        click.echo(f"\nImported {count} CH officers successfully.", err=True)
+    org_database.close()
+    database.close()
 @db_cmd.command("import-wikidata")
 @click.option("--db", "db_path", type=click.Path(), help="Database path")
 @click.option("--limit", type=int, help="Limit number of records")
@@ -947,23 +1156,32 @@ def db_import_wikidata(db_path: Optional[str], limit: Optional[int], batch_size:
     "academic", "scientist", "journalist", "entrepreneur", "activist"
 ]), default="executive", help="Person type to import")
 @click.option("--all", "import_all", is_flag=True, help="Run all person type queries sequentially")
+@click.option("--enrich", is_flag=True, help="Query individual people to get role/org data (slower, resumable)")
+@click.option("--enrich-only", is_flag=True, help="Only enrich existing people (skip bulk import)")
+@click.option("--enrich-dates", is_flag=True, help="Query individual people to get start/end dates (slower)")
+@click.option("--skip-existing", is_flag=True, help="Skip records that already exist (default: update them)")
 @click.option("-v", "--verbose", is_flag=True, help="Verbose output")
-def db_import_people(db_path: Optional[str], limit: Optional[int], batch_size: int, query_type: str, import_all: bool, verbose: bool):
+def db_import_people(db_path: Optional[str], limit: Optional[int], batch_size: int, query_type: str, import_all: bool, enrich: bool, enrich_only: bool, enrich_dates: bool, skip_existing: bool, verbose: bool):
     """
     Import notable people data from Wikidata via SPARQL.
+    Uses a two-phase approach for reliability:
+    1. Bulk import: Fast fetch of QID, name, country (no timeouts)
+    2. Enrich (optional): Per-person queries for role/org/dates
     Imports people with English Wikipedia articles (ensures notability).
-    Includes executives, politicians, athletes, artists, academics, and more.
     \b
     Examples:
         corp-extractor db import-people --type executive --limit 5000
         corp-extractor db import-people --all --limit 10000
+        corp-extractor db import-people --type executive --enrich
+        corp-extractor db import-people --enrich-only --limit 100
         corp-extractor db import-people --type politician -v
     """
     _configure_logging(verbose)
-    from .database.store import get_person_database, DEFAULT_DB_PATH
+    from .database.store import get_person_database, get_database, DEFAULT_DB_PATH
     from .database.embeddings import CompanyEmbedder
     from .database.importers.wikidata_people import WikidataPeopleImporter
@@ -977,35 +1195,558 @@ def db_import_people(db_path: Optional[str], limit: Optional[int], batch_size: i
     # Initialize components
     database = get_person_database(db_path=db_path_obj)
+    org_database = get_database(db_path=db_path_obj)
     embedder = CompanyEmbedder()
     importer = WikidataPeopleImporter(batch_size=batch_size)
-    # Batch processing
-    records = []
     count = 0
-    for record in importer.import_from_sparql(limit=limit, query_type=query_type, import_all=import_all):
-        records.append(record)
+    # Phase 1: Bulk import (fast, minimal data) - skip if --enrich-only
+    if not enrich_only:
+        records = []
+        skipped_existing = 0
-        if len(records) >= batch_size:
-            # Generate embeddings using the combined name|role|org format
+        click.echo("Phase 1: Bulk import (QID, name, country)...", err=True)
+        for record in importer.import_from_sparql(limit=limit, query_type=query_type, import_all=import_all):
+            # Skip existing records if flag is set
+            if skip_existing:
+                existing = database.get_by_source_id(record.source, record.source_id)
+                if existing is not None:
+                    skipped_existing += 1
+                    continue
+            records.append(record)
+            if len(records) >= batch_size:
+                # Generate embeddings (just name for now, will re-embed after enrichment)
+                embedding_texts = [r.get_embedding_text() for r in records]
+                embeddings = embedder.embed_batch(embedding_texts)
+                database.insert_batch(records, embeddings)
+                count += len(records)
+                click.echo(f"  Imported {count} people...", err=True)
+                records = []
+        # Final batch
+        if records:
             embedding_texts = [r.get_embedding_text() for r in records]
             embeddings = embedder.embed_batch(embedding_texts)
             database.insert_batch(records, embeddings)
             count += len(records)
-            click.echo(f"  Imported {count} people...", err=True)
-            records = []
-    # Final batch
-    if records:
-        embedding_texts = [r.get_embedding_text() for r in records]
-        embeddings = embedder.embed_batch(embedding_texts)
-        database.insert_batch(records, embeddings)
-        count += len(records)
+        if skip_existing and skipped_existing > 0:
+            click.echo(f"\nPhase 1 complete: {count} people imported (skipped {skipped_existing} existing).", err=True)
+        else:
+            click.echo(f"\nPhase 1 complete: {count} people imported.", err=True)
+    else:
+        click.echo("Skipping Phase 1 (bulk import) - using existing database records.", err=True)
+        # Enable enrich if enrich_only is set
+        enrich = True
+    # Phase 2: Enrich with role/org/dates (optional, slower but resumable)
+    if enrich:
+        click.echo("\nPhase 2: Enriching with role/org/dates (parallel queries)...", err=True)
+        # Get all people without role/org
+        people_to_enrich = []
+        enriched_count = 0
+        for record in database.iter_records():
+            if not record.known_for_role and not record.known_for_org:
+                people_to_enrich.append(record)
+                enriched_count += 1
+                # Apply limit if --enrich-only
+                if enrich_only and limit and enriched_count >= limit:
+                    break
+        if people_to_enrich:
+            click.echo(f"Found {len(people_to_enrich)} people to enrich...", err=True)
+            importer.enrich_people_role_org_batch(people_to_enrich, delay_seconds=0.1, max_workers=5)
+            # Persist the enriched data and re-generate embeddings
+            updated = 0
+            org_count = 0
+            date_count = 0
+            for person in people_to_enrich:
+                if person.known_for_role or person.known_for_org:
+                    # Look up org ID if we have org_qid
+                    org_qid = person.record.get("org_qid", "")
+                    if org_qid:
+                        org_id = org_database.get_id_by_source_id("wikipedia", org_qid)
+                        if org_id is not None:
+                            person.known_for_org_id = org_id
+                    # Update the record with new role/org/dates and re-embed
+                    new_embedding_text = person.get_embedding_text()
+                    new_embedding = embedder.embed(new_embedding_text)
+                    if database.update_role_org(
+                        person.source, person.source_id,
+                        person.known_for_role, person.known_for_org,
+                        person.known_for_org_id, new_embedding,
+                        person.from_date, person.to_date,
+                    ):
+                        updated += 1
+                        if person.known_for_org:
+                            org_count += 1
+                        if person.from_date or person.to_date:
+                            date_count += 1
+                        if verbose:
+                            date_str = ""
+                            if person.from_date or person.to_date:
+                                date_str = f" ({person.from_date or '?'} - {person.to_date or '?'})"
+                            click.echo(f"  {person.name}: {person.known_for_role} at {person.known_for_org}{date_str}", err=True)
+            click.echo(f"Updated {updated} people ({org_count} with orgs, {date_count} with dates).", err=True)
+    # Phase 3: Enrich with dates (optional, even slower)
+    if enrich_dates:
+        click.echo("\nPhase 3: Enriching with dates...", err=True)
+        # Get all people without dates but with role (dates are associated with positions)
+        people_to_enrich = []
+        for record in database.iter_records():
+            if not record.from_date and not record.to_date and record.known_for_role:
+                people_to_enrich.append(record)
+        if people_to_enrich:
+            click.echo(f"Found {len(people_to_enrich)} people to enrich with dates...", err=True)
+            enriched = importer.enrich_people_batch(people_to_enrich, delay_seconds=0.3)
+            # Persist the enriched dates
+            updated = 0
+            for person in people_to_enrich:
+                if person.from_date or person.to_date:
+                    if database.update_dates(person.source, person.source_id, person.from_date, person.to_date):
+                        updated += 1
+                        if verbose:
+                            click.echo(f"  {person.name}: {person.from_date or '?'} - {person.to_date or '?'}", err=True)
+            click.echo(f"Updated {updated} people with dates.", err=True)
+    org_database.close()
+    database.close()
+@db_cmd.command("import-wikidata-dump")
+@click.option("--dump", "dump_path", type=click.Path(exists=True), help="Path to Wikidata JSON dump file (.bz2 or .gz)")
+@click.option("--download", is_flag=True, help="Download latest dump first (~100GB)")
+@click.option("--force", is_flag=True, help="Force re-download even if cached")
+@click.option("--no-aria2", is_flag=True, help="Don't use aria2c even if available (slower)")
+@click.option("--db", "db_path", type=click.Path(), help="Database path")
+@click.option("--people/--no-people", default=True, help="Import people (default: yes)")
+@click.option("--orgs/--no-orgs", default=True, help="Import organizations (default: yes)")
+@click.option("--require-enwiki", is_flag=True, help="Only import orgs with English Wikipedia articles")
+@click.option("--resume", is_flag=True, help="Resume from last position in dump file (tracks entity index)")
+@click.option("--skip-updates", is_flag=True, help="Skip Q codes already in database (no updates)")
+@click.option("--limit", type=int, help="Max records per type (people and/or orgs)")
+@click.option("--batch-size", type=int, default=10000, help="Batch size for commits (default: 10000)")
+@click.option("-v", "--verbose", is_flag=True, help="Verbose output")
+def db_import_wikidata_dump(
+    dump_path: Optional[str],
+    download: bool,
+    force: bool,
+    no_aria2: bool,
+    db_path: Optional[str],
+    people: bool,
+    orgs: bool,
+    require_enwiki: bool,
+    resume: bool,
+    skip_updates: bool,
+    limit: Optional[int],
+    batch_size: int,
+    verbose: bool,
+):
+    """
+    Import people and organizations from Wikidata JSON dump.
+    This uses the full Wikidata JSON dump (~100GB compressed) to import
+    all humans and organizations with English Wikipedia articles. This
+    avoids SPARQL query timeouts that occur with large result sets.
+    The dump is streamed line-by-line to minimize memory usage.
+    \b
+    Features:
+    - No timeouts (processes locally)
+    - Complete coverage (all notable people/orgs)
+    - Resumable with --resume (tracks position in dump file)
+    - Skip existing with --skip-updates (loads existing Q codes)
+    - People like Andy Burnham are captured via occupation (P106)
+    \b
+    Resume options:
+    - --resume: Resume from where the dump processing left off (tracks entity index).
+                Progress is saved after each batch. Use this if import was interrupted.
+    - --skip-updates: Skip Q codes already in database (no updates to existing records).
+                      Use this to add new records without re-processing existing ones.
+    \b
+    Examples:
+        corp-extractor db import-wikidata-dump --dump /path/to/dump.json.bz2 --limit 10000
+        corp-extractor db import-wikidata-dump --download --people --no-orgs --limit 50000
+        corp-extractor db import-wikidata-dump --dump dump.json.bz2 --orgs --no-people
+        corp-extractor db import-wikidata-dump --dump dump.json.bz2 --resume  # Resume interrupted import
+        corp-extractor db import-wikidata-dump --dump dump.json.bz2 --skip-updates  # Skip existing Q codes
+    """
+    _configure_logging(verbose)
+    from .database.store import get_person_database, get_database, DEFAULT_DB_PATH
+    from .database.embeddings import CompanyEmbedder
+    from .database.importers.wikidata_dump import WikidataDumpImporter, DumpProgress
+    if not dump_path and not download:
+        raise click.UsageError("Either --dump path or --download is required")
+    if not people and not orgs:
+        raise click.UsageError("Must import at least one of --people or --orgs")
+    # Default database path
+    if db_path is None:
+        db_path_obj = DEFAULT_DB_PATH
+    else:
+        db_path_obj = Path(db_path)
+    click.echo(f"Importing Wikidata dump to {db_path_obj}...", err=True)
+    # Initialize importer
+    importer = WikidataDumpImporter(dump_path=dump_path)
+    # Download if requested
+    if download:
+        import shutil
+        dump_target = importer.get_dump_path()
+        click.echo(f"Downloading Wikidata dump (~100GB) to:", err=True)
+        click.echo(f"  {dump_target}", err=True)
+        # Check for aria2c
+        has_aria2 = shutil.which("aria2c") is not None
+        use_aria2 = has_aria2 and not no_aria2
+        if use_aria2:
+            click.echo("  Using aria2c for fast parallel download (16 connections)", err=True)
+            dump_file = importer.download_dump(force=force, use_aria2=True)
+            click.echo(f"\nUsing dump: {dump_file}", err=True)
+        else:
+            if not has_aria2:
+                click.echo("", err=True)
+                click.echo("  TIP: Install aria2c for 10-20x faster downloads:", err=True)
+                click.echo("       brew install aria2  (macOS)", err=True)
+                click.echo("       apt install aria2   (Ubuntu/Debian)", err=True)
+                click.echo("", err=True)
+            # Use urllib to get content length first
+            import urllib.request
+            req = urllib.request.Request(
+                "https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2",
+                headers={"User-Agent": "corp-extractor/1.0"},
+                method="HEAD"
+            )
+            with urllib.request.urlopen(req) as response:
+                total_size = int(response.headers.get("content-length", 0))
+            if total_size:
+                total_gb = total_size / (1024 ** 3)
+                click.echo(f"  Size: {total_gb:.1f} GB", err=True)
+            # Download with progress bar
+            progress_bar = None
+            def update_progress(downloaded: int, total: int) -> None:
+                nonlocal progress_bar
+                if progress_bar is None and total > 0:
+                    progress_bar = click.progressbar(
+                        length=total,
+                        label="Downloading",
+                        show_percent=True,
+                        show_pos=True,
+                        item_show_func=lambda x: f"{(x or 0) / (1024**3):.1f} GB" if x else "",
+                    )
+                    progress_bar.__enter__()
+                if progress_bar:
+                    # Update to absolute position
+                    progress_bar.update(downloaded - progress_bar.pos)
+            try:
+                dump_file = importer.download_dump(force=force, use_aria2=False, progress_callback=update_progress)
+            finally:
+                if progress_bar:
+                    progress_bar.__exit__(None, None, None)
+            click.echo(f"\nUsing dump: {dump_file}", err=True)
+    elif dump_path:
+        click.echo(f"Using dump: {dump_path}", err=True)
+    # Initialize embedder (loads model, may take time on first run)
+    click.echo("Loading embedding model...", err=True)
+    sys.stderr.flush()
+    embedder = CompanyEmbedder()
+    click.echo("Embedding model loaded.", err=True)
+    sys.stderr.flush()
+    # Load existing QID labels from database and seed the importer's cache
+    database = get_person_database(db_path=db_path_obj)
+    existing_labels = database.get_all_qid_labels()
+    if existing_labels:
+        click.echo(f"Loaded {len(existing_labels):,} existing QID labels from DB", err=True)
+        importer.set_label_cache(existing_labels)
+    known_qids_at_start = set(existing_labels.keys())
+    # Load existing source_ids for skip_updates mode
+    existing_people_ids: set[str] = set()
+    existing_org_ids: set[str] = set()
+    if skip_updates:
+        click.echo("Loading existing records for --skip-updates...", err=True)
+        if people:
+            existing_people_ids = database.get_all_source_ids(source="wikidata")
+            click.echo(f"  Found {len(existing_people_ids):,} existing people Q codes", err=True)
+        if orgs:
+            org_database = get_database(db_path=db_path_obj)
+            existing_org_ids = org_database.get_all_source_ids(source="wikipedia")
+            click.echo(f"  Found {len(existing_org_ids):,} existing org Q codes", err=True)
+    # Load progress for resume mode (position-based resume)
+    progress: Optional[DumpProgress] = None
+    start_index = 0
+    if resume:
+        progress = DumpProgress.load()
+        if progress:
+            # Verify the progress is for the same dump file
+            actual_dump_path = importer._dump_path or Path(dump_path) if dump_path else importer.get_dump_path()
+            if progress.matches_dump(actual_dump_path):
+                start_index = progress.entity_index
+                click.echo(f"Resuming from entity index {start_index:,}", err=True)
+                click.echo(f"  Last entity: {progress.last_entity_id}", err=True)
+                click.echo(f"  Last updated: {progress.last_updated}", err=True)
+            else:
+                click.echo("Warning: Progress file is for a different dump, starting from beginning", err=True)
+                progress = None
+        else:
+            click.echo("No progress file found, starting from beginning", err=True)
+    # Initialize progress tracking
+    if progress is None:
+        actual_dump_path = importer._dump_path or Path(dump_path) if dump_path else importer.get_dump_path()
+        progress = DumpProgress(
+            dump_path=str(actual_dump_path),
+            dump_size=actual_dump_path.stat().st_size if actual_dump_path.exists() else 0,
+        )
-    click.echo(f"\nImported {count} people successfully.", err=True)
+    # Helper to persist new labels after each batch
+    def persist_new_labels() -> int:
+        new_labels = importer.get_new_labels_since(known_qids_at_start)
+        if new_labels:
+            database.insert_qid_labels(new_labels)
+            known_qids_at_start.update(new_labels.keys())
+            return len(new_labels)
+        return 0
+    # Combined import - single pass through the dump for both people and orgs
+    click.echo("\n=== Combined Import (single dump pass) ===", err=True)
+    sys.stderr.flush()  # Ensure output is visible immediately
+    if people:
+        click.echo(f"  People: {'up to ' + str(limit) + ' records' if limit else 'unlimited'}", err=True)
+        if skip_updates and existing_people_ids:
+            click.echo(f"    Skip updates: {len(existing_people_ids):,} existing Q codes", err=True)
+    if orgs:
+        click.echo(f"  Orgs: {'up to ' + str(limit) + ' records' if limit else 'unlimited'}", err=True)
+        if require_enwiki:
+            click.echo("    Filter: only orgs with English Wikipedia articles", err=True)
+        if skip_updates and existing_org_ids:
+            click.echo(f"    Skip updates: {len(existing_org_ids):,} existing Q codes", err=True)
+    if start_index > 0:
+        click.echo(f"  Resuming from entity index {start_index:,}", err=True)
+    # Initialize databases
+    person_database = get_person_database(db_path=db_path_obj)
+    org_database = get_database(db_path=db_path_obj) if orgs else None
+    # Batches for each type
+    people_records: list = []
+    org_records: list = []
+    people_count = 0
+    orgs_count = 0
+    last_entity_index = start_index
+    last_entity_id = ""
+    def combined_progress_callback(entity_index: int, entity_id: str, ppl_count: int, org_count: int) -> None:
+        nonlocal last_entity_index, last_entity_id
+        last_entity_index = entity_index
+        last_entity_id = entity_id
+    def save_progress() -> None:
+        if progress:
+            progress.entity_index = last_entity_index
+            progress.last_entity_id = last_entity_id
+            progress.people_yielded = people_count
+            progress.orgs_yielded = orgs_count
+            progress.save()
+    def flush_people_batch() -> None:
+        nonlocal people_records, people_count
+        if people_records:
+            embedding_texts = [r.get_embedding_text() for r in people_records]
+            embeddings = embedder.embed_batch(embedding_texts)
+            person_database.insert_batch(people_records, embeddings)
+            people_count += len(people_records)
+            people_records = []
+    def flush_org_batch() -> None:
+        nonlocal org_records, orgs_count
+        if org_records and org_database:
+            names = [r.name for r in org_records]
+            embeddings = embedder.embed_batch(names)
+            org_database.insert_batch(org_records, embeddings)
+            orgs_count += len(org_records)
+            org_records = []
+    # Calculate total for progress bar (if limits set for both)
+    total_limit = None
+    if limit and people and orgs:
+        total_limit = limit * 2  # Rough estimate
+    elif limit:
+        total_limit = limit
+    click.echo("Starting dump iteration...", err=True)
+    sys.stderr.flush()
+    records_seen = 0
+    try:
+        if total_limit:
+            # Use progress bar when we have limits
+            with click.progressbar(
+                length=total_limit,
+                label="Processing dump",
+                show_percent=True,
+                show_pos=True,
+            ) as pbar:
+                for record_type, record in importer.import_all(
+                    people_limit=limit if people else 0,
+                    orgs_limit=limit if orgs else 0,
+                    import_people=people,
+                    import_orgs=orgs,
+                    require_enwiki=require_enwiki,
+                    skip_people_ids=existing_people_ids if skip_updates else None,
+                    skip_org_ids=existing_org_ids if skip_updates else None,
+                    start_index=start_index,
+                    progress_callback=combined_progress_callback,
+                ):
+                    records_seen += 1
+                    pbar.update(1)
+                    if record_type == "person":
+                        people_records.append(record)
+                        if len(people_records) >= batch_size:
+                            flush_people_batch()
+                            persist_new_labels()
+                            save_progress()
+                    else:  # org
+                        org_records.append(record)
+                        if len(org_records) >= batch_size:
+                            flush_org_batch()
+                            persist_new_labels()
+                            save_progress()
+        else:
+            # No limit - show counter updates
+            for record_type, record in importer.import_all(
+                people_limit=None,
+                orgs_limit=None,
+                import_people=people,
+                import_orgs=orgs,
+                require_enwiki=require_enwiki,
+                skip_people_ids=existing_people_ids if skip_updates else None,
+                skip_org_ids=existing_org_ids if skip_updates else None,
+                start_index=start_index,
+                progress_callback=combined_progress_callback,
+            ):
+                records_seen += 1
+                # Show first record immediately as proof of life
+                if records_seen == 1:
+                    click.echo(f"  First record found: {record.name}", err=True)
+                    sys.stderr.flush()
+                if record_type == "person":
+                    people_records.append(record)
+                    if len(people_records) >= batch_size:
+                        flush_people_batch()
+                        persist_new_labels()
+                        save_progress()
+                        click.echo(f"\r  Progress: {people_count:,} people, {orgs_count:,} orgs...", nl=False, err=True)
+                        sys.stderr.flush()
+                else:  # org
+                    org_records.append(record)
+                    if len(org_records) >= batch_size:
+                        flush_org_batch()
+                        persist_new_labels()
+                        save_progress()
+                        click.echo(f"\r  Progress: {people_count:,} people, {orgs_count:,} orgs...", nl=False, err=True)
+                        sys.stderr.flush()
+            click.echo("", err=True)  # Newline after counter
+        # Final batches
+        flush_people_batch()
+        flush_org_batch()
+        persist_new_labels()
+        save_progress()
+    finally:
+        # Ensure we save progress even on interrupt
+        save_progress()
+    click.echo(f"Import complete: {people_count:,} people, {orgs_count:,} orgs", err=True)
+    # Keep references for final label resolution
+    database = person_database
+    if org_database:
+        org_database.close()
+    # Final label resolution pass for any remaining unresolved QIDs
+    click.echo("\n=== Final QID Label Resolution ===", err=True)
+    # Get the full label cache (includes labels from DB + new ones from import)
+    all_labels = importer.get_label_cache()
+    click.echo(f"  Total labels in cache: {len(all_labels):,}", err=True)
+    # Check for any remaining unresolved QIDs in the database
+    people_unresolved = database.get_unresolved_qids()
+    click.echo(f"  Unresolved QIDs in people: {len(people_unresolved):,}", err=True)
+    org_unresolved: set[str] = set()
+    if orgs:
+        org_database = get_database(db_path=db_path_obj)
+        org_unresolved = org_database.get_unresolved_qids()
+        click.echo(f"  Unresolved QIDs in orgs: {len(org_unresolved):,}", err=True)
+    all_unresolved = people_unresolved | org_unresolved
+    need_sparql = all_unresolved - set(all_labels.keys())
+    if need_sparql:
+        click.echo(f"  Resolving {len(need_sparql):,} remaining QIDs via SPARQL...", err=True)
+        sparql_resolved = importer.resolve_qids_via_sparql(need_sparql)
+        all_labels.update(sparql_resolved)
+        # Persist newly resolved labels
+        if sparql_resolved:
+            database.insert_qid_labels(sparql_resolved)
+            click.echo(f"  SPARQL resolved and stored: {len(sparql_resolved):,}", err=True)
+    # Update records with any newly resolved labels
+    if all_labels:
+        updates, deletes = database.resolve_qid_labels(all_labels)
+        if updates or deletes:
+            click.echo(f"  People: {updates:,} updated, {deletes:,} duplicates deleted", err=True)
+        if orgs:
+            org_database = get_database(db_path=db_path_obj)
+            org_updates = org_database.resolve_qid_labels(all_labels)
+            if org_updates:
+                click.echo(f"  Updated orgs: {org_updates:,} regions", err=True)
+            org_database.close()
+    # Final stats
+    final_label_count = database.get_qid_labels_count()
+    click.echo(f"  Total labels in DB: {final_label_count:,}", err=True)
     database.close()
+    click.echo("\nWikidata dump import complete!", err=True)
 @db_cmd.command("search-people")
 @click.argument("query")
@@ -1185,12 +1926,93 @@ def db_status(db_path: Optional[str]):
             for source, count in stats.by_source.items():
                 click.echo(f"  {source}: {count:,}")
+        # Show canonicalization stats
+        canon_stats = database.get_canon_stats()
+        if canon_stats["canonicalized_records"] > 0:
+            click.echo("\nCanonicalization:")
+            click.echo(f"  Canonicalized: {canon_stats['canonicalized_records']:,} / {canon_stats['total_records']:,}")
+            click.echo(f"  Canonical groups: {canon_stats['canonical_groups']:,}")
+            click.echo(f"  Multi-record groups: {canon_stats['multi_record_groups']:,}")
+            click.echo(f"  Records in multi-groups: {canon_stats['records_in_multi_groups']:,}")
+        else:
+            click.echo("\nCanonicalization: Not run yet")
+            click.echo("   Run 'corp-extractor db canonicalize' to link equivalent records")
         database.close()
     except Exception as e:
         raise click.ClickException(f"Failed to read database: {e}")
+@db_cmd.command("canonicalize")
+@click.option("--db", "db_path", type=click.Path(), help="Database path")
+@click.option("--batch-size", type=int, default=10000, help="Batch size for updates (default: 10000)")
+@click.option("-v", "--verbose", is_flag=True, help="Verbose output")
+def db_canonicalize(db_path: Optional[str], batch_size: int, verbose: bool):
+    """
+    Canonicalize organizations by linking equivalent records across sources.
+    Records are considered equivalent if they share:
+    - Same LEI (globally unique legal entity identifier)
+    - Same ticker symbol
+    - Same CIK (SEC identifier)
+    - Same normalized name (after lowercasing, removing dots)
+    - Same name with suffix expansion (Ltd -> Limited, etc.)
+    For each group, the highest-priority source becomes canonical:
+    gleif > sec_edgar > companies_house > wikipedia
+    Canonicalization enables better search re-ranking by boosting results
+    that have records from multiple authoritative sources.
+    \b
+    Examples:
+        corp-extractor db canonicalize
+        corp-extractor db canonicalize -v
+        corp-extractor db canonicalize --db /path/to/entities.db
+    """
+    _configure_logging(verbose)
+    from .database import OrganizationDatabase
+    from .database.store import get_person_database
+    try:
+        # Canonicalize organizations
+        database = OrganizationDatabase(db_path=db_path)
+        click.echo("Running organization canonicalization...", err=True)
+        result = database.canonicalize(batch_size=batch_size)
+        click.echo("\nOrganization Canonicalization Results")
+        click.echo("=" * 40)
+        click.echo(f"Total records processed: {result['total_records']:,}")
+        click.echo(f"Equivalence groups found: {result['groups_found']:,}")
+        click.echo(f"Multi-record groups: {result['multi_record_groups']:,}")
+        click.echo(f"Records updated: {result['records_updated']:,}")
+        database.close()
+        # Canonicalize people
+        db_path_obj = Path(db_path) if db_path else None
+        person_db = get_person_database(db_path=db_path_obj)
+        click.echo("\nRunning people canonicalization...", err=True)
+        people_result = person_db.canonicalize(batch_size=batch_size)
+        click.echo("\nPeople Canonicalization Results")
+        click.echo("=" * 40)
+        click.echo(f"Total records processed: {people_result['total_records']:,}")
+        click.echo(f"Matched by organization: {people_result['matched_by_org']:,}")
+        click.echo(f"Matched by date overlap: {people_result['matched_by_date']:,}")
+        click.echo(f"Canonical groups: {people_result['canonical_groups']:,}")
+        click.echo(f"Records in multi-record groups: {people_result['records_in_groups']:,}")
+        person_db.close()
+    except Exception as e:
+        raise click.ClickException(f"Canonicalization failed: {e}")
 @db_cmd.command("search")
 @click.argument("query")
 @click.option("--db", "db_path", type=click.Path(), help="Database path")
@@ -1247,10 +2069,9 @@ def db_search(query: str, db_path: Optional[str], top_k: int, source: Optional[s
 @click.option("--repo", type=str, default="Corp-o-Rate-Community/entity-references", help="HuggingFace repo ID")
 @click.option("--db", "db_path", type=click.Path(), help="Output path for database")
 @click.option("--full", is_flag=True, help="Download full version (larger, includes record metadata)")
-@click.option("--no-compress", is_flag=True, help="Download uncompressed version (slower)")
 @click.option("--force", is_flag=True, help="Force re-download")
 @click.option("-v", "--verbose", is_flag=True, help="Verbose output")
-def db_download(repo: str, db_path: Optional[str], full: bool, no_compress: bool, force: bool, verbose: bool):
+def db_download(repo: str, db_path: Optional[str], full: bool, force: bool, verbose: bool):
     """
     Download entity database from HuggingFace Hub.
@@ -1274,7 +2095,6 @@ def db_download(repo: str, db_path: Optional[str], full: bool, no_compress: bool
             repo_id=repo,
             filename=filename,
             force_download=force,
-            prefer_compressed=not no_compress,
         )
         click.echo(f"Database downloaded to: {path}")
     except Exception as e:
@@ -1286,27 +2106,23 @@ def db_download(repo: str, db_path: Optional[str], full: bool, no_compress: bool
 @click.option("--repo", type=str, default="Corp-o-Rate-Community/entity-references", help="HuggingFace repo ID")
 @click.option("--message", type=str, default="Update entity database", help="Commit message")
 @click.option("--no-lite", is_flag=True, help="Skip creating lite version (without record data)")
-@click.option("--no-compress", is_flag=True, help="Skip creating compressed versions")
 @click.option("-v", "--verbose", is_flag=True, help="Verbose output")
-def db_upload(db_path: Optional[str], repo: str, message: str, no_lite: bool, no_compress: bool, verbose: bool):
+def db_upload(db_path: Optional[str], repo: str, message: str, no_lite: bool, verbose: bool):
     """
-    Upload entity database to HuggingFace Hub with variants.
+    Upload entity database to HuggingFace Hub.
-    If no path is provided, uploads from the default cache location.
-    By default uploads:
+    First VACUUMs the database, then creates and uploads:
     - entities.db (full database)
     - entities-lite.db (without record data, smaller)
-    - entities.db.gz (compressed full)
-    - entities-lite.db.gz (compressed lite)
+    If no path is provided, uploads from the default cache location.
     Requires HF_TOKEN environment variable to be set.
     \b
     Examples:
         corp-extractor db upload
         corp-extractor db upload /path/to/entities.db
-        corp-extractor db upload --no-lite --no-compress
+        corp-extractor db upload --no-lite
         corp-extractor db upload --repo my-org/my-entity-db
     """
     _configure_logging(verbose)
@@ -1322,10 +2138,9 @@ def db_upload(db_path: Optional[str], repo: str, message: str, no_lite: bool, no
             )
     click.echo(f"Uploading {db_path} to {repo}...", err=True)
+    click.echo("  - Running VACUUM to optimize database", err=True)
     if not no_lite:
         click.echo("  - Creating lite version (without record data)", err=True)
-    if not no_compress:
-        click.echo("  - Creating compressed versions", err=True)
     try:
         results = upload_database_with_variants(
@@ -1333,7 +2148,6 @@ def db_upload(db_path: Optional[str], repo: str, message: str, no_lite: bool, no
             repo_id=repo,
             commit_message=message,
             include_lite=not no_lite,
-            include_compressed=not no_compress,
         )
         click.echo(f"\nUploaded {len(results)} file(s) successfully:")
         for filename, url in results.items():
@@ -1371,31 +2185,6 @@ def db_create_lite(db_path: str, output: Optional[str], verbose: bool):
         raise click.ClickException(f"Failed to create lite database: {e}")
-@db_cmd.command("compress")
-@click.argument("db_path", type=click.Path(exists=True))
-@click.option("-o", "--output", type=click.Path(), help="Output path (default: adds .gz suffix)")
-@click.option("-v", "--verbose", is_flag=True, help="Verbose output")
-def db_compress(db_path: str, output: Optional[str], verbose: bool):
-    """
-    Compress a database file using gzip.
-    \b
-    Examples:
-        corp-extractor db compress entities.db
-        corp-extractor db compress entities.db -o entities.db.gz
-    """
-    _configure_logging(verbose)
-    from .database.hub import compress_database
-    click.echo(f"Compressing {db_path}...", err=True)
-    try:
-        compressed_path = compress_database(db_path, output)
-        click.echo(f"Compressed database created: {compressed_path}")
-    except Exception as e:
-        raise click.ClickException(f"Compression failed: {e}")
 @db_cmd.command("repair-embeddings")
 @click.option("--db", "db_path", type=click.Path(), help="Database path")
 @click.option("--batch-size", type=int, default=1000, help="Batch size for embedding generation (default: 1000)")

corp-extractor 0.9.0__py3-none-any.whl → 0.9.3__py3-none-any.whl

corp-extractor 0.9.0py3-none-any.whl → 0.9.3py3-none-any.whl