corp-extractor 0.9.0-py3-none-any.whl → 0.9.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.3.dist-info}/METADATA +40 -9
  2. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.3.dist-info}/RECORD +29 -26
  3. statement_extractor/cli.py +866 -77
  4. statement_extractor/database/hub.py +35 -127
  5. statement_extractor/database/importers/__init__.py +10 -2
  6. statement_extractor/database/importers/companies_house.py +16 -2
  7. statement_extractor/database/importers/companies_house_officers.py +431 -0
  8. statement_extractor/database/importers/gleif.py +23 -0
  9. statement_extractor/database/importers/sec_edgar.py +17 -0
  10. statement_extractor/database/importers/sec_form4.py +512 -0
  11. statement_extractor/database/importers/wikidata.py +151 -43
  12. statement_extractor/database/importers/wikidata_dump.py +1951 -0
  13. statement_extractor/database/importers/wikidata_people.py +823 -325
  14. statement_extractor/database/models.py +30 -6
  15. statement_extractor/database/store.py +1485 -60
  16. statement_extractor/document/deduplicator.py +10 -12
  17. statement_extractor/extractor.py +1 -1
  18. statement_extractor/models/__init__.py +3 -2
  19. statement_extractor/models/statement.py +15 -17
  20. statement_extractor/models.py +1 -1
  21. statement_extractor/pipeline/context.py +5 -5
  22. statement_extractor/pipeline/orchestrator.py +12 -12
  23. statement_extractor/plugins/base.py +17 -17
  24. statement_extractor/plugins/extractors/gliner2.py +28 -28
  25. statement_extractor/plugins/qualifiers/embedding_company.py +7 -5
  26. statement_extractor/plugins/qualifiers/person.py +11 -1
  27. statement_extractor/plugins/splitters/t5_gemma.py +35 -39
  28. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.3.dist-info}/WHEEL +0 -0
  29. {corp_extractor-0.9.0.dist-info → corp_extractor-0.9.3.dist-info}/entry_points.txt +0 -0
statement_extractor/database/hub.py

@@ -6,10 +6,8 @@ Provides functionality to:
 - Upload/publish database updates
 - Version management for database files
 - Create "lite" versions without full records for smaller downloads
-- Optional gzip compression for reduced file sizes
 """
 
-import gzip
 import logging
 import os
 import shutil
@@ -25,8 +23,6 @@ DEFAULT_REPO_ID = "Corp-o-Rate-Community/entity-references"
 DEFAULT_DB_FILENAME = "entities-lite.db" # Lite is the default (smaller download)
 DEFAULT_DB_FULL_FILENAME = "entities.db"
 DEFAULT_DB_LITE_FILENAME = "entities-lite.db"
-DEFAULT_DB_COMPRESSED_FILENAME = "entities.db.gz"
-DEFAULT_DB_LITE_COMPRESSED_FILENAME = "entities-lite.db.gz"
 
 # Local cache directory
 DEFAULT_CACHE_DIR = Path.home() / ".cache" / "corp-extractor"
@@ -139,7 +135,7 @@ def upload_database(
         commit_message=commit_message,
     )
 
-    logger.info(f"Database uploaded successfully")
+    logger.info("Database uploaded successfully")
     return result
 
 
@@ -189,6 +185,33 @@ def check_for_updates(
     return latest != current_version, latest
 
 
+def vacuum_database(db_path: str | Path) -> None:
+    """
+    VACUUM the database to reclaim space and optimize it.
+
+    Args:
+        db_path: Path to the database file
+    """
+    db_path = Path(db_path)
+    if not db_path.exists():
+        raise FileNotFoundError(f"Database not found: {db_path}")
+
+    original_size = db_path.stat().st_size
+    logger.info(f"Running VACUUM on {db_path} ({original_size / (1024*1024):.1f}MB)")
+
+    # Use isolation_level=None for autocommit (required for VACUUM)
+    conn = sqlite3.connect(str(db_path), isolation_level=None)
+    try:
+        conn.execute("VACUUM")
+    finally:
+        conn.close()
+
+    new_size = db_path.stat().st_size
+    reduction = (1 - new_size / original_size) * 100
+
+    logger.info(f"After VACUUM: {new_size / (1024*1024):.1f}MB (reduced {reduction:.1f}%)")
+
+
 def create_lite_database(
     source_db_path: str | Path,
     output_path: Optional[str | Path] = None,
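
The new vacuum_database() helper opens the connection with isolation_level=None because SQLite's VACUUM must run outside a transaction (autocommit mode). A minimal usage sketch against the module path implied by the file list; the database path is illustrative:

from statement_extractor.database.hub import vacuum_database

# Reclaims free pages and defragments the file in place before publishing.
vacuum_database("entities.db")
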
@@ -248,98 +271,20 @@ def create_lite_database(
     return output_path
 
 
-def compress_database(
-    db_path: str | Path,
-    output_path: Optional[str | Path] = None,
-) -> Path:
-    """
-    Compress a database file using gzip.
-
-    Args:
-        db_path: Path to the database file
-        output_path: Output path for compressed file (default: adds .gz suffix)
-
-    Returns:
-        Path to the compressed file
-    """
-    db_path = Path(db_path)
-    if not db_path.exists():
-        raise FileNotFoundError(f"Database not found: {db_path}")
-
-    if output_path is None:
-        output_path = db_path.with_suffix(db_path.suffix + ".gz")
-    output_path = Path(output_path)
-
-    logger.info(f"Compressing {db_path} to {output_path}")
-
-    with open(db_path, "rb") as f_in:
-        with gzip.open(output_path, "wb", compresslevel=9) as f_out:
-            shutil.copyfileobj(f_in, f_out)
-
-    # Log compression results
-    original_size = db_path.stat().st_size
-    compressed_size = output_path.stat().st_size
-    ratio = (1 - compressed_size / original_size) * 100
-
-    logger.info(f"Original: {original_size / (1024*1024):.1f}MB")
-    logger.info(f"Compressed: {compressed_size / (1024*1024):.1f}MB")
-    logger.info(f"Compression ratio: {ratio:.1f}%")
-
-    return output_path
-
-
-def decompress_database(
-    compressed_path: str | Path,
-    output_path: Optional[str | Path] = None,
-) -> Path:
-    """
-    Decompress a gzipped database file.
-
-    Args:
-        compressed_path: Path to the .gz file
-        output_path: Output path (default: removes .gz suffix)
-
-    Returns:
-        Path to the decompressed file
-    """
-    compressed_path = Path(compressed_path)
-    if not compressed_path.exists():
-        raise FileNotFoundError(f"Compressed file not found: {compressed_path}")
-
-    if output_path is None:
-        if compressed_path.suffix == ".gz":
-            output_path = compressed_path.with_suffix("")
-        else:
-            output_path = compressed_path.with_stem(compressed_path.stem + "-decompressed")
-    output_path = Path(output_path)
-
-    logger.info(f"Decompressing {compressed_path} to {output_path}")
-
-    with gzip.open(compressed_path, "rb") as f_in:
-        with open(output_path, "wb") as f_out:
-            shutil.copyfileobj(f_in, f_out)
-
-    logger.info(f"Decompressed to {output_path}")
-    return output_path
-
-
 def upload_database_with_variants(
     db_path: str | Path,
     repo_id: str = DEFAULT_REPO_ID,
     commit_message: str = "Update entity database",
     token: Optional[str] = None,
     include_lite: bool = True,
-    include_compressed: bool = True,
     include_readme: bool = True,
 ) -> dict[str, str]:
     """
-    Upload entity database with optional lite and compressed variants.
+    Upload entity database with optional lite variant.
 
-    Creates and uploads:
+    First VACUUMs the database, then creates and uploads:
     - entities.db (full database)
     - entities-lite.db (without record data, smaller)
-    - entities.db.gz (compressed full database)
-    - entities-lite.db.gz (compressed lite database)
     - README.md (dataset card from HUGGINGFACE_README.md)
 
     Args:
@@ -348,7 +293,6 @@ def upload_database_with_variants(
         commit_message: Git commit message
         token: HuggingFace API token
         include_lite: Whether to create and upload lite version
-        include_compressed: Whether to create and upload compressed versions
         include_readme: Whether to upload the README.md dataset card
 
     Returns:
@@ -383,6 +327,9 @@ def upload_database_with_variants(
     except Exception as e:
         logger.debug(f"Repo creation note: {e}")
 
+    # VACUUM the database first to optimize it
+    vacuum_database(db_path)
+
     results = {}
 
     # Create temp directory for variants
@@ -399,20 +346,6 @@
             create_lite_database(db_path, lite_path)
             files_to_upload.append((lite_path, DEFAULT_DB_LITE_FILENAME))
 
-        # Compressed versions
-        if include_compressed:
-            # Compress full database
-            compressed_path = temp_path / DEFAULT_DB_COMPRESSED_FILENAME
-            compress_database(db_path, compressed_path)
-            files_to_upload.append((compressed_path, DEFAULT_DB_COMPRESSED_FILENAME))
-
-            # Compress lite database
-            if include_lite:
-                lite_compressed_path = temp_path / DEFAULT_DB_LITE_COMPRESSED_FILENAME
-                lite_path = temp_path / DEFAULT_DB_LITE_FILENAME
-                compress_database(lite_path, lite_compressed_path)
-                files_to_upload.append((lite_compressed_path, DEFAULT_DB_LITE_COMPRESSED_FILENAME))
-
         # Copy all files to a staging directory for upload_folder
         staging_dir = temp_path / "staging"
         staging_dir.mkdir()
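
With the gzip variants removed, the 0.9.3 publish flow is: VACUUM the source database, optionally derive entities-lite.db, stage the files, and upload the folder. A hedged sketch of calling the updated entry point; reading the token from HF_TOKEN is an assumption, any HuggingFace token source works:

import os

from statement_extractor.database.hub import upload_database_with_variants

# Uploads entities.db (plus entities-lite.db and README.md by default);
# vacuum_database() now runs first, as added in this version.
results = upload_database_with_variants(
    "entities.db",
    commit_message="Update entity database",
    token=os.environ.get("HF_TOKEN"),  # assumption: token supplied via env var
    include_lite=True,
    include_readme=True,
)
print(results)  # mapping of uploaded variants, per the dict[str, str] return type
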
@@ -455,7 +388,6 @@ def download_database(
     revision: Optional[str] = None,
     cache_dir: Optional[Path] = None,
     force_download: bool = False,
-    prefer_compressed: bool = True,
 ) -> Path:
     """
     Download entity database from HuggingFace Hub.
@@ -466,10 +398,9 @@
         revision: Git revision (branch, tag, commit) or None for latest
         cache_dir: Local cache directory
         force_download: Force re-download even if cached
-        prefer_compressed: Try to download compressed version first
 
     Returns:
-        Path to the downloaded database file (decompressed if was .gz)
+        Path to the downloaded database file
     """
     try:
         from huggingface_hub import hf_hub_download
@@ -482,34 +413,11 @@
     cache_dir = cache_dir or DEFAULT_CACHE_DIR
     cache_dir.mkdir(parents=True, exist_ok=True)
 
-    # Try compressed version first if preferred
-    download_filename = filename
-    if prefer_compressed and not filename.endswith(".gz"):
-        compressed_filename = filename + ".gz"
-        try:
-            logger.info(f"Trying compressed version: {compressed_filename}")
-            local_path = hf_hub_download(
-                repo_id=repo_id,
-                filename=compressed_filename,
-                revision=revision,
-                cache_dir=str(cache_dir),
-                force_download=force_download,
-                repo_type="dataset",
-            )
-            # Decompress to final location
-            final_path = cache_dir / filename
-            decompress_database(local_path, final_path)
-            logger.info(f"Database downloaded and decompressed to {final_path}")
-            return final_path
-        except Exception as e:
-            logger.debug(f"Compressed version not available: {e}")
-
-    # Download uncompressed version
     logger.info(f"Downloading entity database from {repo_id}...")
 
     local_path = hf_hub_download(
         repo_id=repo_id,
-        filename=download_filename,
+        filename=filename,
         revision=revision,
         cache_dir=str(cache_dir),
         force_download=force_download,
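
On the read side, download_database() no longer probes for a .gz file first; it passes the requested filename straight to hf_hub_download. A sketch of a typical call, assuming repo_id and filename keep the defaults defined by the constants above (those defaults are not visible in this hunk):

from statement_extractor.database.hub import download_database

# Assumes default repo_id/filename arguments (entities-lite.db from the
# Corp-o-Rate-Community/entity-references dataset); only the keyword shown
# in the diff is passed explicitly.
db_path = download_database(force_download=False)
print(f"Entity database cached at {db_path}")
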
statement_extractor/database/importers/__init__.py

@@ -4,21 +4,29 @@ Data importers for the entity database.
 Provides importers for various data sources:
 - GLEIF: Legal Entity Identifier data
 - SEC Edgar: US SEC company data
+- SEC Form 4: US SEC insider ownership data (officers/directors)
 - Companies House: UK company data
-- Wikidata: Wikipedia/Wikidata organization data
-- Wikidata People: Notable people from Wikipedia/Wikidata
+- Wikidata: Wikipedia/Wikidata organization data (SPARQL-based, may timeout)
+- Wikidata People: Notable people from Wikipedia/Wikidata (SPARQL-based, may timeout)
+- Wikidata Dump: Bulk import from Wikidata JSON dump (recommended for large imports)
 """
 
 from .gleif import GleifImporter
 from .sec_edgar import SecEdgarImporter
+from .sec_form4 import SecForm4Importer
 from .companies_house import CompaniesHouseImporter
+from .companies_house_officers import CompaniesHouseOfficersImporter
 from .wikidata import WikidataImporter
 from .wikidata_people import WikidataPeopleImporter
+from .wikidata_dump import WikidataDumpImporter
 
 __all__ = [
     "GleifImporter",
     "SecEdgarImporter",
+    "SecForm4Importer",
     "CompaniesHouseImporter",
+    "CompaniesHouseOfficersImporter",
     "WikidataImporter",
     "WikidataPeopleImporter",
+    "WikidataDumpImporter",
 ]
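
The three new importer classes are re-exported at package level alongside the existing ones, so they can be imported the same way (constructor arguments are not part of this diff, so instantiation is omitted):

from statement_extractor.database.importers import (
    CompaniesHouseOfficersImporter,  # new: UK Companies House officer data
    SecForm4Importer,                # new: SEC Form 4 insider ownership data
    WikidataDumpImporter,            # new: bulk import from a Wikidata JSON dump
)
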
statement_extractor/database/importers/companies_house.py

@@ -342,13 +342,18 @@ class CompaniesHouseImporter:
         raw_company_type = item.get("company_type", "")
         entity_type = _get_entity_type_from_company_type(raw_company_type)
 
+        # Get dates
+        date_of_creation = item.get("date_of_creation")
+        date_of_cessation = item.get("date_of_cessation") # For dissolved companies
+
         # Build record
         record_data = {
             "company_number": company_number,
             "title": title,
             "company_status": company_status,
             "company_type": raw_company_type,
-            "date_of_creation": item.get("date_of_creation"),
+            "date_of_creation": date_of_creation,
+            "date_of_cessation": date_of_cessation,
             "locality": locality,
             "region": region,
             "country": country,
@@ -360,6 +365,8 @@
             source_id=company_number,
             region=country,
             entity_type=entity_type,
+            from_date=date_of_creation,
+            to_date=date_of_cessation,
             record=record_data,
         )
 
@@ -397,12 +404,17 @@
         raw_company_type = row.get("CompanyCategory", "").strip()
         entity_type = _get_entity_type_from_company_type(raw_company_type)
 
+        # Get dates from CSV
+        date_of_creation = row.get("IncorporationDate", "").strip() or None
+        date_of_cessation = row.get("DissolutionDate", "").strip() or None
+
         record_data = {
             "company_number": company_number,
             "title": company_name,
             "company_status": company_status,
             "company_type": raw_company_type,
-            "date_of_creation": row.get("IncorporationDate", "").strip(),
+            "date_of_creation": date_of_creation,
+            "date_of_cessation": date_of_cessation,
             "country": row.get("CountryOfOrigin", "United Kingdom").strip(),
             "sic_code": row.get("SICCode.SicText_1", "").strip(),
         }
@@ -416,6 +428,8 @@
             source_id=company_number,
             region=region,
             entity_type=entity_type,
+            from_date=date_of_creation,
+            to_date=date_of_cessation,
             record=record_data,
         )
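
Both the API and CSV code paths now normalize the creation and cessation dates once and pass them through as from_date/to_date when the entity is built. A small sketch of the CSV-side handling shown above, using a synthetic row; the surrounding entity-construction call is not part of this diff:

import csv
from io import StringIO

# Synthetic Companies House bulk-file excerpt (illustrative values only).
sample = StringIO(
    "CompanyName,CompanyNumber,IncorporationDate,DissolutionDate\n"
    "EXAMPLE LTD,01234567,1999-01-04,2015-06-30\n"
)

for row in csv.DictReader(sample):
    # Empty strings collapse to None, matching the importer's "or None" handling.
    date_of_creation = row.get("IncorporationDate", "").strip() or None
    date_of_cessation = row.get("DissolutionDate", "").strip() or None
    print(date_of_creation, date_of_cessation)  # -> 1999-01-04 2015-06-30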