corp-extractor 0.9.0__py3-none-any.whl → 0.9.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/METADATA +72 -11
  2. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/RECORD +34 -27
  3. statement_extractor/cli.py +1317 -101
  4. statement_extractor/database/embeddings.py +45 -0
  5. statement_extractor/database/hub.py +86 -136
  6. statement_extractor/database/importers/__init__.py +10 -2
  7. statement_extractor/database/importers/companies_house.py +16 -2
  8. statement_extractor/database/importers/companies_house_officers.py +431 -0
  9. statement_extractor/database/importers/gleif.py +23 -0
  10. statement_extractor/database/importers/import_utils.py +264 -0
  11. statement_extractor/database/importers/sec_edgar.py +17 -0
  12. statement_extractor/database/importers/sec_form4.py +512 -0
  13. statement_extractor/database/importers/wikidata.py +151 -43
  14. statement_extractor/database/importers/wikidata_dump.py +2282 -0
  15. statement_extractor/database/importers/wikidata_people.py +867 -325
  16. statement_extractor/database/migrate_v2.py +852 -0
  17. statement_extractor/database/models.py +155 -7
  18. statement_extractor/database/schema_v2.py +409 -0
  19. statement_extractor/database/seed_data.py +359 -0
  20. statement_extractor/database/store.py +3449 -233
  21. statement_extractor/document/deduplicator.py +10 -12
  22. statement_extractor/extractor.py +1 -1
  23. statement_extractor/models/__init__.py +3 -2
  24. statement_extractor/models/statement.py +15 -17
  25. statement_extractor/models.py +1 -1
  26. statement_extractor/pipeline/context.py +5 -5
  27. statement_extractor/pipeline/orchestrator.py +12 -12
  28. statement_extractor/plugins/base.py +17 -17
  29. statement_extractor/plugins/extractors/gliner2.py +28 -28
  30. statement_extractor/plugins/qualifiers/embedding_company.py +7 -5
  31. statement_extractor/plugins/qualifiers/person.py +120 -53
  32. statement_extractor/plugins/splitters/t5_gemma.py +35 -39
  33. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/WHEEL +0 -0
  34. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.4.dist-info}/entry_points.txt +0 -0
statement_extractor/database/embeddings.py
@@ -121,6 +121,51 @@ class CompanyEmbedder:
         )
         return embeddings.astype(np.float32)
 
+    def quantize_to_int8(self, embedding: np.ndarray) -> np.ndarray:
+        """
+        Quantize L2-normalized float32 embedding to int8.
+
+        For normalized embeddings (values in [-1, 1]), this provides
+        75% storage reduction with ~92% recall at top-100.
+
+        Args:
+            embedding: L2-normalized float32 embedding vector
+
+        Returns:
+            int8 embedding vector
+        """
+        return np.clip(np.round(embedding * 127), -127, 127).astype(np.int8)
+
+    def embed_and_quantize(self, text: str) -> tuple[np.ndarray, np.ndarray]:
+        """
+        Embed text and return both float32 and int8 embeddings.
+
+        Args:
+            text: Text to embed
+
+        Returns:
+            Tuple of (float32_embedding, int8_embedding)
+        """
+        fp32 = self.embed(text)
+        return fp32, self.quantize_to_int8(fp32)
+
+    def embed_batch_and_quantize(
+        self, texts: list[str], batch_size: int = 32
+    ) -> tuple[np.ndarray, np.ndarray]:
+        """
+        Embed multiple texts and return both float32 and int8 embeddings.
+
+        Args:
+            texts: List of texts to embed
+            batch_size: Batch size for processing
+
+        Returns:
+            Tuple of (float32_embeddings, int8_embeddings) matrices
+        """
+        fp32 = self.embed_batch(texts, batch_size=batch_size)
+        int8 = np.array([self.quantize_to_int8(e) for e in fp32])
+        return fp32, int8
+
     def similarity(self, embedding1: np.ndarray, embedding2: np.ndarray) -> float:
         """
         Compute cosine similarity between two embeddings.
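Note: a minimal usage sketch of the new quantization helpers. The default CompanyEmbedder constructor and the dequantization step are assumptions for illustration; only embed_and_quantize, quantize_to_int8, and similarity are taken from the diff above.

    import numpy as np
    from statement_extractor.database.embeddings import CompanyEmbedder

    embedder = CompanyEmbedder()  # assumes the default constructor works without arguments
    fp32, int8 = embedder.embed_and_quantize("Acme Holdings plc")

    # int8 values are the normalized float32 values scaled by 127 and clipped,
    # so an approximate float vector can be recovered by dividing by 127.
    approx = int8.astype(np.float32) / 127.0
    print(embedder.similarity(fp32, approx))  # should be close to 1.0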
statement_extractor/database/hub.py
@@ -6,10 +6,8 @@ Provides functionality to:
 - Upload/publish database updates
 - Version management for database files
 - Create "lite" versions without full records for smaller downloads
-- Optional gzip compression for reduced file sizes
 """
 
-import gzip
 import logging
 import os
 import shutil
@@ -22,11 +20,9 @@ logger = logging.getLogger(__name__)
 
 # Default HuggingFace repo for entity database
 DEFAULT_REPO_ID = "Corp-o-Rate-Community/entity-references"
-DEFAULT_DB_FILENAME = "entities-lite.db"  # Lite is the default (smaller download)
-DEFAULT_DB_FULL_FILENAME = "entities.db"
-DEFAULT_DB_LITE_FILENAME = "entities-lite.db"
-DEFAULT_DB_COMPRESSED_FILENAME = "entities.db.gz"
-DEFAULT_DB_LITE_COMPRESSED_FILENAME = "entities-lite.db.gz"
+DEFAULT_DB_FILENAME = "entities-v2-lite.db"  # Lite is the default (smaller download)
+DEFAULT_DB_FULL_FILENAME = "entities-v2.db"
+DEFAULT_DB_LITE_FILENAME = "entities-v2-lite.db"
 
 # Local cache directory
 DEFAULT_CACHE_DIR = Path.home() / ".cache" / "corp-extractor"
@@ -59,7 +55,8 @@ def get_database_path(
     # Check common locations
     possible_paths = [
         cache_dir / filename,
-        cache_dir / "entities.db",
+        cache_dir / "entities-v2.db",
+        cache_dir / "entities.db",  # Legacy fallback
         Path.home() / ".cache" / "huggingface" / "hub" / f"datasets--{repo_id.replace('/', '--')}" / filename,
     ]
 
@@ -139,7 +136,7 @@ def upload_database(
         commit_message=commit_message,
     )
 
-    logger.info(f"Database uploaded successfully")
+    logger.info("Database uploaded successfully")
    return result
 
 
@@ -189,6 +186,33 @@ def check_for_updates(
     return latest != current_version, latest
 
 
+def vacuum_database(db_path: str | Path) -> None:
+    """
+    VACUUM the database to reclaim space and optimize it.
+
+    Args:
+        db_path: Path to the database file
+    """
+    db_path = Path(db_path)
+    if not db_path.exists():
+        raise FileNotFoundError(f"Database not found: {db_path}")
+
+    original_size = db_path.stat().st_size
+    logger.info(f"Running VACUUM on {db_path} ({original_size / (1024*1024):.1f}MB)")
+
+    # Use isolation_level=None for autocommit (required for VACUUM)
+    conn = sqlite3.connect(str(db_path), isolation_level=None)
+    try:
+        conn.execute("VACUUM")
+    finally:
+        conn.close()
+
+    new_size = db_path.stat().st_size
+    reduction = (1 - new_size / original_size) * 100
+
+    logger.info(f"After VACUUM: {new_size / (1024*1024):.1f}MB (reduced {reduction:.1f}%)")
+
+
 def create_lite_database(
     source_db_path: str | Path,
     output_path: Optional[str | Path] = None,
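Note: a minimal sketch of calling the new vacuum helper directly; the path below is a placeholder matching DEFAULT_CACHE_DIR and must point at an existing database file.

    from pathlib import Path
    from statement_extractor.database.hub import vacuum_database

    # VACUUM rewrites the SQLite file in place, reclaiming space freed by deletes/updates.
    db_path = Path.home() / ".cache" / "corp-extractor" / "entities-v2.db"
    vacuum_database(db_path)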
@@ -196,8 +220,10 @@ def create_lite_database(
     """
     Create a lite version of the database without full records.
 
-    The lite version strips the `record` column content (sets to empty {}),
-    significantly reducing file size while keeping embeddings and core fields.
+    The lite version:
+    - Strips the `record` column content (sets to empty {})
+    - Drops float32 embedding tables (keeps only scalar int8 embeddings)
+    - Significantly reduces file size (~75% reduction)
 
     Args:
         source_db_path: Path to the full database
@@ -206,6 +232,8 @@
     Returns:
         Path to the lite database
     """
+    import sqlite_vec
+
     source_db_path = Path(source_db_path)
     if not source_db_path.exists():
         raise FileNotFoundError(f"Source database not found: {source_db_path}")
@@ -223,14 +251,51 @@
     # Connect and strip record contents
     # Use isolation_level=None for autocommit (required for VACUUM)
     conn = sqlite3.connect(str(output_path), isolation_level=None)
+
+    # Load sqlite-vec extension (required for vec0 virtual tables)
+    conn.enable_load_extension(True)
+    sqlite_vec.load(conn)
+    conn.enable_load_extension(False)
+
     try:
         # Update all records to have empty record JSON
         conn.execute("BEGIN")
         cursor = conn.execute("UPDATE organizations SET record = '{}'")
         updated = cursor.rowcount
-        logger.info(f"Stripped {updated} record fields")
+        logger.info(f"Stripped {updated} organization record fields")
+
+        # Also strip people records if table exists
+        cursor = conn.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='people'")
+        if cursor.fetchone():
+            cursor = conn.execute("UPDATE people SET record = '{}'")
+            logger.info(f"Stripped {cursor.rowcount} people record fields")
+
         conn.execute("COMMIT")
 
+        # Drop float32 embedding tables (keep only scalar int8 for 75% storage savings)
+        # Check if scalar tables exist before dropping float32 tables
+        cursor = conn.execute(
+            "SELECT name FROM sqlite_master WHERE type='table' AND name='organization_embeddings_scalar'"
+        )
+        has_org_scalar = cursor.fetchone() is not None
+
+        cursor = conn.execute(
+            "SELECT name FROM sqlite_master WHERE type='table' AND name='person_embeddings_scalar'"
+        )
+        has_person_scalar = cursor.fetchone() is not None
+
+        if has_org_scalar:
+            logger.info("Dropping float32 organization_embeddings table (keeping scalar)")
+            conn.execute("DROP TABLE IF EXISTS organization_embeddings")
+        else:
+            logger.warning("No scalar organization embeddings found, keeping float32 table")
+
+        if has_person_scalar:
+            logger.info("Dropping float32 person_embeddings table (keeping scalar)")
+            conn.execute("DROP TABLE IF EXISTS person_embeddings")
+        else:
+            logger.warning("No scalar person embeddings found, keeping float32 table")
+
         # Vacuum to reclaim space (must be outside transaction)
         conn.execute("VACUUM")
     finally:
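Note: a minimal sketch of producing a lite variant from a local full database. Paths are placeholders; the sqlite-vec package must be installed, since the function loads it to handle the vec0 embedding tables.

    from pathlib import Path
    from statement_extractor.database.hub import create_lite_database

    full = Path("entities-v2.db")  # placeholder path to the full database
    lite = create_lite_database(full, Path("entities-v2-lite.db"))

    # Records are blanked and float32 embedding tables dropped, so the lite
    # file should come out substantially smaller than the full one.
    print(f"{full.stat().st_size / 1e6:.1f}MB -> {lite.stat().st_size / 1e6:.1f}MB")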
@@ -248,98 +313,20 @@
     return output_path
 
 
-def compress_database(
-    db_path: str | Path,
-    output_path: Optional[str | Path] = None,
-) -> Path:
-    """
-    Compress a database file using gzip.
-
-    Args:
-        db_path: Path to the database file
-        output_path: Output path for compressed file (default: adds .gz suffix)
-
-    Returns:
-        Path to the compressed file
-    """
-    db_path = Path(db_path)
-    if not db_path.exists():
-        raise FileNotFoundError(f"Database not found: {db_path}")
-
-    if output_path is None:
-        output_path = db_path.with_suffix(db_path.suffix + ".gz")
-    output_path = Path(output_path)
-
-    logger.info(f"Compressing {db_path} to {output_path}")
-
-    with open(db_path, "rb") as f_in:
-        with gzip.open(output_path, "wb", compresslevel=9) as f_out:
-            shutil.copyfileobj(f_in, f_out)
-
-    # Log compression results
-    original_size = db_path.stat().st_size
-    compressed_size = output_path.stat().st_size
-    ratio = (1 - compressed_size / original_size) * 100
-
-    logger.info(f"Original: {original_size / (1024*1024):.1f}MB")
-    logger.info(f"Compressed: {compressed_size / (1024*1024):.1f}MB")
-    logger.info(f"Compression ratio: {ratio:.1f}%")
-
-    return output_path
-
-
-def decompress_database(
-    compressed_path: str | Path,
-    output_path: Optional[str | Path] = None,
-) -> Path:
-    """
-    Decompress a gzipped database file.
-
-    Args:
-        compressed_path: Path to the .gz file
-        output_path: Output path (default: removes .gz suffix)
-
-    Returns:
-        Path to the decompressed file
-    """
-    compressed_path = Path(compressed_path)
-    if not compressed_path.exists():
-        raise FileNotFoundError(f"Compressed file not found: {compressed_path}")
-
-    if output_path is None:
-        if compressed_path.suffix == ".gz":
-            output_path = compressed_path.with_suffix("")
-        else:
-            output_path = compressed_path.with_stem(compressed_path.stem + "-decompressed")
-    output_path = Path(output_path)
-
-    logger.info(f"Decompressing {compressed_path} to {output_path}")
-
-    with gzip.open(compressed_path, "rb") as f_in:
-        with open(output_path, "wb") as f_out:
-            shutil.copyfileobj(f_in, f_out)
-
-    logger.info(f"Decompressed to {output_path}")
-    return output_path
-
-
 def upload_database_with_variants(
     db_path: str | Path,
     repo_id: str = DEFAULT_REPO_ID,
     commit_message: str = "Update entity database",
     token: Optional[str] = None,
     include_lite: bool = True,
-    include_compressed: bool = True,
     include_readme: bool = True,
 ) -> dict[str, str]:
     """
-    Upload entity database with optional lite and compressed variants.
+    Upload entity database with optional lite variant.
 
-    Creates and uploads:
-    - entities.db (full database)
-    - entities-lite.db (without record data, smaller)
-    - entities.db.gz (compressed full database)
-    - entities-lite.db.gz (compressed lite database)
+    First VACUUMs the database, then creates and uploads:
+    - entities-v2.db (full database with v2 normalized schema)
+    - entities-v2-lite.db (without record data, smaller)
     - README.md (dataset card from HUGGINGFACE_README.md)
 
     Args:
@@ -348,7 +335,6 @@ def upload_database_with_variants(
         commit_message: Git commit message
         token: HuggingFace API token
         include_lite: Whether to create and upload lite version
-        include_compressed: Whether to create and upload compressed versions
         include_readme: Whether to upload the README.md dataset card
 
     Returns:
@@ -383,6 +369,9 @@
     except Exception as e:
         logger.debug(f"Repo creation note: {e}")
 
+    # VACUUM the database first to optimize it
+    vacuum_database(db_path)
+
     results = {}
 
     # Create temp directory for variants
@@ -399,20 +388,6 @@
             create_lite_database(db_path, lite_path)
             files_to_upload.append((lite_path, DEFAULT_DB_LITE_FILENAME))
 
-        # Compressed versions
-        if include_compressed:
-            # Compress full database
-            compressed_path = temp_path / DEFAULT_DB_COMPRESSED_FILENAME
-            compress_database(db_path, compressed_path)
-            files_to_upload.append((compressed_path, DEFAULT_DB_COMPRESSED_FILENAME))
-
-            # Compress lite database
-            if include_lite:
-                lite_compressed_path = temp_path / DEFAULT_DB_LITE_COMPRESSED_FILENAME
-                lite_path = temp_path / DEFAULT_DB_LITE_FILENAME
-                compress_database(lite_path, lite_compressed_path)
-                files_to_upload.append((lite_compressed_path, DEFAULT_DB_LITE_COMPRESSED_FILENAME))
-
         # Copy all files to a staging directory for upload_folder
         staging_dir = temp_path / "staging"
         staging_dir.mkdir()
@@ -455,7 +430,6 @@ def download_database(
     revision: Optional[str] = None,
     cache_dir: Optional[Path] = None,
     force_download: bool = False,
-    prefer_compressed: bool = True,
 ) -> Path:
     """
     Download entity database from HuggingFace Hub.
@@ -466,10 +440,9 @@
         revision: Git revision (branch, tag, commit) or None for latest
         cache_dir: Local cache directory
         force_download: Force re-download even if cached
-        prefer_compressed: Try to download compressed version first
 
     Returns:
-        Path to the downloaded database file (decompressed if was .gz)
+        Path to the downloaded database file
     """
     try:
         from huggingface_hub import hf_hub_download
@@ -482,34 +455,11 @@
     cache_dir = cache_dir or DEFAULT_CACHE_DIR
     cache_dir.mkdir(parents=True, exist_ok=True)
 
-    # Try compressed version first if preferred
-    download_filename = filename
-    if prefer_compressed and not filename.endswith(".gz"):
-        compressed_filename = filename + ".gz"
-        try:
-            logger.info(f"Trying compressed version: {compressed_filename}")
-            local_path = hf_hub_download(
-                repo_id=repo_id,
-                filename=compressed_filename,
-                revision=revision,
-                cache_dir=str(cache_dir),
-                force_download=force_download,
-                repo_type="dataset",
-            )
-            # Decompress to final location
-            final_path = cache_dir / filename
-            decompress_database(local_path, final_path)
-            logger.info(f"Database downloaded and decompressed to {final_path}")
-            return final_path
-        except Exception as e:
-            logger.debug(f"Compressed version not available: {e}")
-
-    # Download uncompressed version
     logger.info(f"Downloading entity database from {repo_id}...")
 
     local_path = hf_hub_download(
         repo_id=repo_id,
-        filename=download_filename,
+        filename=filename,
         revision=revision,
         cache_dir=str(cache_dir),
         force_download=force_download,
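Note: a minimal sketch of fetching the new default database. It assumes repo_id and filename default to DEFAULT_REPO_ID and DEFAULT_DB_FILENAME (the leading parameters are not shown in this hunk), which the constants above suggest.

    from statement_extractor.database.hub import download_database

    # With no filename override this now resolves to entities-v2-lite.db,
    # downloaded from the Corp-o-Rate-Community/entity-references dataset repo.
    db_path = download_database()
    print(db_path)

    # The full database can still be requested explicitly:
    full_path = download_database(filename="entities-v2.db")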
statement_extractor/database/importers/__init__.py
@@ -4,21 +4,29 @@ Data importers for the entity database.
 Provides importers for various data sources:
 - GLEIF: Legal Entity Identifier data
 - SEC Edgar: US SEC company data
+- SEC Form 4: US SEC insider ownership data (officers/directors)
 - Companies House: UK company data
-- Wikidata: Wikipedia/Wikidata organization data
-- Wikidata People: Notable people from Wikipedia/Wikidata
+- Wikidata: Wikipedia/Wikidata organization data (SPARQL-based, may timeout)
+- Wikidata People: Notable people from Wikipedia/Wikidata (SPARQL-based, may timeout)
+- Wikidata Dump: Bulk import from Wikidata JSON dump (recommended for large imports)
 """
 
 from .gleif import GleifImporter
 from .sec_edgar import SecEdgarImporter
+from .sec_form4 import SecForm4Importer
 from .companies_house import CompaniesHouseImporter
+from .companies_house_officers import CompaniesHouseOfficersImporter
 from .wikidata import WikidataImporter
 from .wikidata_people import WikidataPeopleImporter
+from .wikidata_dump import WikidataDumpImporter
 
 __all__ = [
     "GleifImporter",
     "SecEdgarImporter",
+    "SecForm4Importer",
     "CompaniesHouseImporter",
+    "CompaniesHouseOfficersImporter",
     "WikidataImporter",
     "WikidataPeopleImporter",
+    "WikidataDumpImporter",
 ]
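Note: the three new importers are re-exported from the package, so callers can pull them in directly; constructor arguments are not shown in this diff and are therefore omitted here.

    from statement_extractor.database.importers import (
        CompaniesHouseOfficersImporter,
        SecForm4Importer,
        WikidataDumpImporter,
    )

    # All three are new in 0.9.4; the dump-based importer is the documented
    # choice for large Wikidata imports (no SPARQL timeouts).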
statement_extractor/database/importers/companies_house.py
@@ -342,13 +342,18 @@ class CompaniesHouseImporter:
         raw_company_type = item.get("company_type", "")
         entity_type = _get_entity_type_from_company_type(raw_company_type)
 
+        # Get dates
+        date_of_creation = item.get("date_of_creation")
+        date_of_cessation = item.get("date_of_cessation")  # For dissolved companies
+
         # Build record
         record_data = {
             "company_number": company_number,
             "title": title,
             "company_status": company_status,
             "company_type": raw_company_type,
-            "date_of_creation": item.get("date_of_creation"),
+            "date_of_creation": date_of_creation,
+            "date_of_cessation": date_of_cessation,
             "locality": locality,
             "region": region,
             "country": country,
@@ -360,6 +365,8 @@ class CompaniesHouseImporter:
             source_id=company_number,
             region=country,
             entity_type=entity_type,
+            from_date=date_of_creation,
+            to_date=date_of_cessation,
             record=record_data,
         )
 
@@ -397,12 +404,17 @@ class CompaniesHouseImporter:
         raw_company_type = row.get("CompanyCategory", "").strip()
         entity_type = _get_entity_type_from_company_type(raw_company_type)
 
+        # Get dates from CSV
+        date_of_creation = row.get("IncorporationDate", "").strip() or None
+        date_of_cessation = row.get("DissolutionDate", "").strip() or None
+
         record_data = {
             "company_number": company_number,
             "title": company_name,
             "company_status": company_status,
             "company_type": raw_company_type,
-            "date_of_creation": row.get("IncorporationDate", "").strip(),
+            "date_of_creation": date_of_creation,
+            "date_of_cessation": date_of_cessation,
             "country": row.get("CountryOfOrigin", "United Kingdom").strip(),
             "sic_code": row.get("SICCode.SicText_1", "").strip(),
         }
@@ -416,6 +428,8 @@ class CompaniesHouseImporter:
             source_id=company_number,
             region=region,
             entity_type=entity_type,
+            from_date=date_of_creation,
+            to_date=date_of_cessation,
             record=record_data,
         )
 
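Note: the CSV date handling above turns empty strings into None before the values are stored and passed as from_date/to_date; a tiny illustration with a made-up row.

    row = {"IncorporationDate": "1999-04-21", "DissolutionDate": ""}

    date_of_creation = row.get("IncorporationDate", "").strip() or None   # "1999-04-21"
    date_of_cessation = row.get("DissolutionDate", "").strip() or None    # None (still active)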