PyPI - corp-extractor - Versions diffs - 0.9.3__py3-none-any.whl → 0.9.4__py3-none-any.whl - Mend

corp-extractor 0.9.3__py3-none-any.whl → 0.9.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16)

{corp_extractor-0.9.3.dist-info → corp_extractor-0.9.4.dist-info}/METADATA +33 -3
{corp_extractor-0.9.3.dist-info → corp_extractor-0.9.4.dist-info}/RECORD +16 -12
statement_extractor/cli.py +472 -45
statement_extractor/database/embeddings.py +45 -0
statement_extractor/database/hub.py +51 -9
statement_extractor/database/importers/import_utils.py +264 -0
statement_extractor/database/importers/wikidata_dump.py +334 -3
statement_extractor/database/importers/wikidata_people.py +44 -0
statement_extractor/database/migrate_v2.py +852 -0
statement_extractor/database/models.py +125 -1
statement_extractor/database/schema_v2.py +409 -0
statement_extractor/database/seed_data.py +359 -0
statement_extractor/database/store.py +2113 -322
statement_extractor/plugins/qualifiers/person.py +109 -52
{corp_extractor-0.9.3.dist-info → corp_extractor-0.9.4.dist-info}/WHEEL +0 -0
{corp_extractor-0.9.3.dist-info → corp_extractor-0.9.4.dist-info}/entry_points.txt +0 -0

statement_extractor/database/embeddings.py CHANGED Viewed

@@ -121,6 +121,51 @@ class CompanyEmbedder:
         )
         return embeddings.astype(np.float32)
+    def quantize_to_int8(self, embedding: np.ndarray) -> np.ndarray:
+        """
+        Quantize L2-normalized float32 embedding to int8.
+        For normalized embeddings (values in [-1, 1]), this provides
+        75% storage reduction with ~92% recall at top-100.
+        Args:
+            embedding: L2-normalized float32 embedding vector
+        Returns:
+            int8 embedding vector
+        """
+        return np.clip(np.round(embedding * 127), -127, 127).astype(np.int8)
+    def embed_and_quantize(self, text: str) -> tuple[np.ndarray, np.ndarray]:
+        """
+        Embed text and return both float32 and int8 embeddings.
+        Args:
+            text: Text to embed
+        Returns:
+            Tuple of (float32_embedding, int8_embedding)
+        """
+        fp32 = self.embed(text)
+        return fp32, self.quantize_to_int8(fp32)
+    def embed_batch_and_quantize(
+        self, texts: list[str], batch_size: int = 32
+    ) -> tuple[np.ndarray, np.ndarray]:
+        """
+        Embed multiple texts and return both float32 and int8 embeddings.
+        Args:
+            texts: List of texts to embed
+            batch_size: Batch size for processing
+        Returns:
+            Tuple of (float32_embeddings, int8_embeddings) matrices
+        """
+        fp32 = self.embed_batch(texts, batch_size=batch_size)
+        int8 = np.array([self.quantize_to_int8(e) for e in fp32])
+        return fp32, int8
     def similarity(self, embedding1: np.ndarray, embedding2: np.ndarray) -> float:
         """
         Compute cosine similarity between two embeddings.

statement_extractor/database/hub.py CHANGED Viewed

@@ -20,9 +20,9 @@ logger = logging.getLogger(__name__)
 # Default HuggingFace repo for entity database
 DEFAULT_REPO_ID = "Corp-o-Rate-Community/entity-references"
-DEFAULT_DB_FILENAME = "entities-lite.db"  # Lite is the default (smaller download)
-DEFAULT_DB_FULL_FILENAME = "entities.db"
-DEFAULT_DB_LITE_FILENAME = "entities-lite.db"
+DEFAULT_DB_FILENAME = "entities-v2-lite.db"  # Lite is the default (smaller download)
+DEFAULT_DB_FULL_FILENAME = "entities-v2.db"
+DEFAULT_DB_LITE_FILENAME = "entities-v2-lite.db"
 # Local cache directory
 DEFAULT_CACHE_DIR = Path.home() / ".cache" / "corp-extractor"
@@ -55,7 +55,8 @@ def get_database_path(
     # Check common locations
     possible_paths = [
         cache_dir / filename,
-        cache_dir / "entities.db",
+        cache_dir / "entities-v2.db",
+        cache_dir / "entities.db",  # Legacy fallback
         Path.home() / ".cache" / "huggingface" / "hub" / f"datasets--{repo_id.replace('/', '--')}" / filename,
     ]
@@ -219,8 +220,10 @@ def create_lite_database(
     """
     Create a lite version of the database without full records.
-    The lite version strips the `record` column content (sets to empty {}),
-    significantly reducing file size while keeping embeddings and core fields.
+    The lite version:
+    - Strips the `record` column content (sets to empty {})
+    - Drops float32 embedding tables (keeps only scalar int8 embeddings)
+    - Significantly reduces file size (~75% reduction)
     Args:
         source_db_path: Path to the full database
@@ -229,6 +232,8 @@ def create_lite_database(
     Returns:
         Path to the lite database
     """
+    import sqlite_vec
     source_db_path = Path(source_db_path)
     if not source_db_path.exists():
         raise FileNotFoundError(f"Source database not found: {source_db_path}")
@@ -246,14 +251,51 @@ def create_lite_database(
     # Connect and strip record contents
     # Use isolation_level=None for autocommit (required for VACUUM)
     conn = sqlite3.connect(str(output_path), isolation_level=None)
+    # Load sqlite-vec extension (required for vec0 virtual tables)
+    conn.enable_load_extension(True)
+    sqlite_vec.load(conn)
+    conn.enable_load_extension(False)
     try:
         # Update all records to have empty record JSON
         conn.execute("BEGIN")
         cursor = conn.execute("UPDATE organizations SET record = '{}'")
         updated = cursor.rowcount
-        logger.info(f"Stripped {updated} record fields")
+        logger.info(f"Stripped {updated} organization record fields")
+        # Also strip people records if table exists
+        cursor = conn.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='people'")
+        if cursor.fetchone():
+            cursor = conn.execute("UPDATE people SET record = '{}'")
+            logger.info(f"Stripped {cursor.rowcount} people record fields")
         conn.execute("COMMIT")
+        # Drop float32 embedding tables (keep only scalar int8 for 75% storage savings)
+        # Check if scalar tables exist before dropping float32 tables
+        cursor = conn.execute(
+            "SELECT name FROM sqlite_master WHERE type='table' AND name='organization_embeddings_scalar'"
+        )
+        has_org_scalar = cursor.fetchone() is not None
+        cursor = conn.execute(
+            "SELECT name FROM sqlite_master WHERE type='table' AND name='person_embeddings_scalar'"
+        )
+        has_person_scalar = cursor.fetchone() is not None
+        if has_org_scalar:
+            logger.info("Dropping float32 organization_embeddings table (keeping scalar)")
+            conn.execute("DROP TABLE IF EXISTS organization_embeddings")
+        else:
+            logger.warning("No scalar organization embeddings found, keeping float32 table")
+        if has_person_scalar:
+            logger.info("Dropping float32 person_embeddings table (keeping scalar)")
+            conn.execute("DROP TABLE IF EXISTS person_embeddings")
+        else:
+            logger.warning("No scalar person embeddings found, keeping float32 table")
         # Vacuum to reclaim space (must be outside transaction)
         conn.execute("VACUUM")
     finally:
@@ -283,8 +325,8 @@ def upload_database_with_variants(
     Upload entity database with optional lite variant.
     First VACUUMs the database, then creates and uploads:
-    - entities.db (full database)
-    - entities-lite.db (without record data, smaller)
+    - entities-v2.db (full database with v2 normalized schema)
+    - entities-v2-lite.db (without record data, smaller)
     - README.md (dataset card from HUGGINGFACE_README.md)
     Args:

statement_extractor/database/importers/import_utils.py ADDED Viewed

@@ -0,0 +1,264 @@
+"""
+Shared utilities for v2 database importers.
+Provides helper functions for resolving locations, roles, and QIDs
+to their normalized FK references in the v2 schema.
+"""
+import logging
+from typing import TYPE_CHECKING, Optional
+if TYPE_CHECKING:
+    from ..store import LocationsDatabase, RolesDatabase
+logger = logging.getLogger(__name__)
+def parse_qid(qid_text: Optional[str]) -> Optional[int]:
+    """
+    Parse a QID string to integer.
+    Args:
+        qid_text: QID string like "Q12345" or just "12345"
+    Returns:
+        Integer QID or None if invalid
+    """
+    if not qid_text:
+        return None
+    # Strip whitespace
+    qid_text = qid_text.strip()
+    # Handle "Q12345" format
+    if qid_text.startswith("Q") or qid_text.startswith("q"):
+        qid_text = qid_text[1:]
+    try:
+        return int(qid_text)
+    except ValueError:
+        return None
+def format_qid(qid_int: Optional[int]) -> Optional[str]:
+    """
+    Format an integer QID back to string format.
+    Args:
+        qid_int: Integer QID (e.g., 12345)
+    Returns:
+        String QID like "Q12345" or None
+    """
+    if qid_int is None:
+        return None
+    return f"Q{qid_int}"
+def normalize_name(name: str) -> str:
+    """
+    Normalize a name for database lookup.
+    Args:
+        name: Name to normalize
+    Returns:
+        Lowercase, stripped name
+    """
+    if not name:
+        return ""
+    return name.lower().strip()
+def get_or_create_location(
+    locations_db: "LocationsDatabase",
+    name: str,
+    location_type_id: int,
+    source_id: int = 4,  # wikidata
+    qid: Optional[int] = None,
+    source_identifier: Optional[str] = None,
+    parent_ids: Optional[list[int]] = None,
+) -> int:
+    """
+    Get or create a location record.
+    Args:
+        locations_db: LocationsDatabase instance
+        name: Location name
+        location_type_id: FK to location_types table
+        source_id: FK to source_types table
+        qid: Optional Wikidata QID as integer
+        source_identifier: Optional source-specific identifier
+        parent_ids: Optional list of parent location IDs
+    Returns:
+        Location ID
+    """
+    return locations_db.get_or_create(
+        name=name,
+        location_type_id=location_type_id,
+        source_id=source_id,
+        qid=qid,
+        source_identifier=source_identifier,
+        parent_ids=parent_ids,
+    )
+def get_or_create_role(
+    roles_db: "RolesDatabase",
+    name: str,
+    source_id: int = 4,  # wikidata
+    qid: Optional[int] = None,
+    source_identifier: Optional[str] = None,
+) -> int:
+    """
+    Get or create a role record.
+    Args:
+        roles_db: RolesDatabase instance
+        name: Role/title name
+        source_id: FK to source_types table
+        qid: Optional Wikidata QID as integer
+        source_identifier: Optional source-specific identifier
+    Returns:
+        Role ID
+    """
+    return roles_db.get_or_create(
+        name=name,
+        source_id=source_id,
+        qid=qid,
+        source_identifier=source_identifier,
+    )
+def resolve_country_to_location_id(
+    locations_db: "LocationsDatabase",
+    country_text: str,
+) -> Optional[int]:
+    """
+    Resolve a country name/code to a location ID.
+    Args:
+        locations_db: LocationsDatabase instance
+        country_text: Country code (e.g., "US") or name (e.g., "United States")
+    Returns:
+        Location ID or None if not found
+    """
+    if not country_text:
+        return None
+    return locations_db.resolve_region_text(country_text)
+def get_source_id(source_name: str) -> int:
+    """
+    Get source_id for a source name.
+    Args:
+        source_name: Source name (e.g., "gleif", "sec_edgar")
+    Returns:
+        Source ID (1-4)
+    """
+    from ..seed_data import SOURCE_NAME_TO_ID
+    return SOURCE_NAME_TO_ID.get(source_name, 4)  # default to wikidata
+def get_source_name(source_id: int) -> str:
+    """
+    Get source name for a source_id.
+    Args:
+        source_id: Source ID (1-4)
+    Returns:
+        Source name
+    """
+    from ..seed_data import SOURCE_ID_TO_NAME
+    return SOURCE_ID_TO_NAME.get(source_id, "wikidata")
+def get_entity_type_id(entity_type_name: str) -> int:
+    """
+    Get entity_type_id for an entity type name.
+    Args:
+        entity_type_name: Entity type name (e.g., "business", "fund")
+    Returns:
+        Entity type ID (1-17)
+    """
+    from ..seed_data import ORG_TYPE_NAME_TO_ID
+    return ORG_TYPE_NAME_TO_ID.get(entity_type_name, 17)  # default to unknown
+def get_entity_type_name(entity_type_id: int) -> str:
+    """
+    Get entity type name for an entity_type_id.
+    Args:
+        entity_type_id: Entity type ID (1-17)
+    Returns:
+        Entity type name
+    """
+    from ..seed_data import ORG_TYPE_ID_TO_NAME
+    return ORG_TYPE_ID_TO_NAME.get(entity_type_id, "unknown")
+def get_person_type_id(person_type_name: str) -> int:
+    """
+    Get person_type_id for a person type name.
+    Args:
+        person_type_name: Person type name (e.g., "executive", "politician")
+    Returns:
+        Person type ID (1-15)
+    """
+    from ..seed_data import PEOPLE_TYPE_NAME_TO_ID
+    return PEOPLE_TYPE_NAME_TO_ID.get(person_type_name, 15)  # default to unknown
+def get_person_type_name(person_type_id: int) -> str:
+    """
+    Get person type name for a person_type_id.
+    Args:
+        person_type_id: Person type ID (1-15)
+    Returns:
+        Person type name
+    """
+    from ..seed_data import PEOPLE_TYPE_ID_TO_NAME
+    return PEOPLE_TYPE_ID_TO_NAME.get(person_type_id, "unknown")
+def get_location_type_id(location_type_name: str) -> int:
+    """
+    Get location_type_id for a location type name.
+    Args:
+        location_type_name: Location type name (e.g., "country", "city")
+    Returns:
+        Location type ID
+    """
+    from ..seed_data import LOCATION_TYPE_NAME_TO_ID
+    return LOCATION_TYPE_NAME_TO_ID.get(location_type_name, 36)  # default to other
+def get_location_type_id_from_qid(wikidata_qid: int) -> int:
+    """
+    Get location_type_id from a Wikidata P31 QID.
+    Args:
+        wikidata_qid: Wikidata instance-of QID (e.g., 515 for city)
+    Returns:
+        Location type ID (defaults to 36 = other)
+    """
+    from ..seed_data import LOCATION_TYPE_QID_TO_ID
+    return LOCATION_TYPE_QID_TO_ID.get(wikidata_qid, 36)  # default to other

corp-extractor 0.9.3__py3-none-any.whl → 0.9.4__py3-none-any.whl

corp-extractor 0.9.3__py3-none-any.whl → 0.9.4__py3-none-any.whl