PyPI - resolvekit - Versions diffs - 0.0.1__py3-none-any.whl - Mend

resolvekit 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (70) hide show

resolvekit/README.md +134 -0
resolvekit/__init__.py +67 -0
resolvekit/api/README.md +165 -0
resolvekit/api/__init__.py +10 -0
resolvekit/api/convenience.py +53 -0
resolvekit/api/resolver.py +457 -0
resolvekit/builders/README.md +173 -0
resolvekit/builders/__init__.py +0 -0
resolvekit/calibration/README.md +351 -0
resolvekit/calibration/__init__.py +12 -0
resolvekit/calibration/calibrator.py +184 -0
resolvekit/calibration/features.py +139 -0
resolvekit/calibration/models.py +78 -0
resolvekit/cli/README.md +215 -0
resolvekit/cli/__init__.py +0 -0
resolvekit/cli/main.py +18 -0
resolvekit/config.py +128 -0
resolvekit/constants.py +252 -0
resolvekit/constraints/README.md +102 -0
resolvekit/constraints/__init__.py +17 -0
resolvekit/constraints/constraint_engine.py +111 -0
resolvekit/constraints/hierarchy_validator.py +148 -0
resolvekit/constraints/membership_validator.py +60 -0
resolvekit/constraints/protocols.py +33 -0
resolvekit/constraints/temporal_validator.py +43 -0
resolvekit/constraints/type_validator.py +42 -0
resolvekit/data/README.md +165 -0
resolvekit/data/__init__.py +14 -0
resolvekit/data/alias_repository.py +206 -0
resolvekit/data/code_repository.py +85 -0
resolvekit/data/context_filters.py +49 -0
resolvekit/data/db_manager.py +196 -0
resolvekit/data/entity_repository.py +466 -0
resolvekit/data/membership_repository.py +107 -0
resolvekit/data/query_builder.py +177 -0
resolvekit/data/schema.py +122 -0
resolvekit/disambiguation/README.md +72 -0
resolvekit/disambiguation/__init__.py +0 -0
resolvekit/extraction/README.md +204 -0
resolvekit/extraction/__init__.py +0 -0
resolvekit/matchers/README.md +77 -0
resolvekit/matchers/__init__.py +65 -0
resolvekit/matchers/alias_exact.py +65 -0
resolvekit/matchers/canonical_name.py +62 -0
resolvekit/matchers/cascade.py +127 -0
resolvekit/matchers/code_validators.py +250 -0
resolvekit/matchers/exact_code.py +177 -0
resolvekit/matchers/fts_matcher.py +106 -0
resolvekit/matchers/fuzzy_matcher.py +142 -0
resolvekit/matchers/priorities.py +174 -0
resolvekit/matchers/protocols.py +75 -0
resolvekit/normalization/README.md +192 -0
resolvekit/normalization/__init__.py +8 -0
resolvekit/normalization/normalizer.py +164 -0
resolvekit/overlays/README.md +226 -0
resolvekit/overlays/__init__.py +0 -0
resolvekit/types.py +534 -0
resolvekit/utils/README.md +188 -0
resolvekit/utils/__init__.py +48 -0
resolvekit/utils/cache.py +109 -0
resolvekit/utils/dates.py +339 -0
resolvekit/utils/errors.py +145 -0
resolvekit/utils/files.py +366 -0
resolvekit/utils/logging.py +219 -0
resolvekit/utils/text.py +475 -0
resolvekit/utils/validation.py +301 -0
resolvekit-0.0.1.dist-info/METADATA +36 -0
resolvekit-0.0.1.dist-info/RECORD +70 -0
resolvekit-0.0.1.dist-info/WHEEL +4 -0
resolvekit-0.0.1.dist-info/entry_points.txt +3 -0

resolvekit/data/README.md ADDED Viewed

@@ -0,0 +1,165 @@
+# Data Module
+## Purpose
+The data module handles all data storage, retrieval, and management, including SQLite database operations, data pack loading, and overlay management.
+## Components
+### Core Components
+1. **Database Manager** (`db_manager.py`)
+   - SQLite connection management
+   - PRAGMA configuration for performance
+   - Database attachment (base + overlays)
+   - Transaction management
+2. **Schema** (`schema.py`)
+   - SQL schema definitions for all tables
+   - FTS5 virtual table configuration
+   - Indexes and constraints
+   - Migration utilities
+3. **Models** (`models.py`)
+   - Python data classes for entities, aliases, codes, etc.
+   - ORM-like interface (or raw SQL with dataclass mapping)
+   - Type-safe data access
+4. **Loaders** (`loaders.py`)
+   - Load entities, aliases, codes from SQLite
+   - Cache frequently accessed data
+   - Lazy loading for large datasets
+5. **Query Builder** (`query_builder.py`)
+   - Construct SQL queries for various operations
+   - Handle union queries across base + overlays
+   - FTS query construction with proper escaping
+### Data Access Layer
+- `entities.py`: Entity CRUD operations
+- `aliases.py`: Alias lookup and search
+- `codes.py`: Code system lookups
+- `hierarchies.py`: Hierarchy traversal queries
+- `memberships.py`: Group membership queries
+- `provenance.py`: Data source attribution
+## Database Schema
+### Main Tables
+```sql
+-- Entities table
+CREATE TABLE entities (
+    dcid TEXT PRIMARY KEY,
+    canonical_name TEXT NOT NULL,
+    entity_type TEXT NOT NULL,
+    parent_dcid TEXT,
+    centroid_lat REAL,
+    centroid_lon REAL,
+    valid_from TEXT,
+    valid_until TEXT,
+    FOREIGN KEY (parent_dcid) REFERENCES entities(dcid)
+);
+-- Aliases table
+CREATE TABLE aliases (
+    alias_id INTEGER PRIMARY KEY,
+    entity_dcid TEXT NOT NULL,
+    alias_text TEXT NOT NULL,
+    alias_norm TEXT NOT NULL,
+    language TEXT,
+    alias_type TEXT CHECK(alias_type IN ('canonical','endonym','exonym','abbr','code')),
+    valid_from TEXT,
+    valid_until TEXT,
+    source TEXT,
+    alias_uid TEXT UNIQUE,
+    FOREIGN KEY (entity_dcid) REFERENCES entities(dcid)
+);
+-- FTS5 virtual table
+CREATE VIRTUAL TABLE aliases_fts USING fts5(
+    alias_norm,
+    content='aliases',
+    content_rowid='alias_id',
+    tokenize = "unicode61 remove_diacritics 2 tokenchars '.-'",
+    prefix='2,3'
+);
+-- Codes table
+CREATE TABLE codes (
+    entity_dcid TEXT NOT NULL,
+    code_system TEXT NOT NULL,
+    code_value TEXT NOT NULL,
+    valid_from TEXT,
+    valid_until TEXT,
+    source TEXT,
+    PRIMARY KEY (entity_dcid, code_system),
+    FOREIGN KEY (entity_dcid) REFERENCES entities(dcid)
+);
+-- Memberships table
+CREATE TABLE memberships (
+    id INTEGER PRIMARY KEY,
+    entity_dcid TEXT NOT NULL,
+    group_dcid TEXT NOT NULL,
+    valid_from TEXT NOT NULL,
+    valid_until TEXT,
+    source TEXT,
+    FOREIGN KEY (entity_dcid) REFERENCES entities(dcid),
+    FOREIGN KEY (group_dcid) REFERENCES entities(dcid)
+);
+-- Ambiguity registry
+CREATE TABLE ambiguity (
+    surface TEXT PRIMARY KEY,
+    types TEXT,
+    notes TEXT
+);
+-- Provenance table
+CREATE TABLE provenance (
+    id INTEGER PRIMARY KEY,
+    entity_dcid TEXT NOT NULL,
+    field TEXT,
+    source TEXT,
+    license TEXT,
+    quality INTEGER,
+    last_updated TEXT,
+    FOREIGN KEY (entity_dcid) REFERENCES entities(dcid)
+);
+```
+## SQLite Configuration
+Performance pragmas applied on connection:
+```sql
+PRAGMA journal_mode=OFF;
+PRAGMA synchronous=OFF;
+PRAGMA temp_store=MEMORY;
+PRAGMA mmap_size=268435456;  -- ~256MB
+PRAGMA cache_size=-100000;   -- ~100MB
+```
+## Overlay Precedence
+When querying across base + overlays:
+1. User overlays (precedence: 100+)
+2. Organization overlays (precedence: 10-99)
+3. Base pack (precedence: 0)
+Queries use `UNION ALL` and are then deduplicated by `alias_uid` or `dcid`, keeping the row with the highest precedence.
+## Design Principles
+1. **Read-optimized**: Pre-built indexes, no writes at runtime
+2. **Efficient caching**: Cache hot data (codes, popular entities)
+3. **Overlay transparency**: Queries automatically span all attached databases
+4. **Type safety**: Use dataclasses for structured data access
+## Implementation Priority
+- **Phase A** - Core resolver (schema, basic loaders)
+- **Phase C** - Overlay system

resolvekit/data/__init__.py ADDED Viewed

@@ -0,0 +1,14 @@
+"""Data layer for resolvekit."""
+from resolvekit.data.db_manager import DatabaseManager
+from resolvekit.data.entity_repository import EntityRepository
+from resolvekit.data.query_builder import QueryBuilder
+from resolvekit.data.schema import create_schema, get_schema_version
+# Names re-exported as the public API of ``resolvekit.data``; each one is
+# imported above from its defining submodule.
+__all__ = [
+    "DatabaseManager",
+    "EntityRepository",
+    "QueryBuilder",
+    "create_schema",
+    "get_schema_version",
+]

resolvekit/data/alias_repository.py ADDED Viewed

@@ -0,0 +1,206 @@
+"""Repository for alias and FTS operations."""
+from typing import Any, ClassVar
+from resolvekit.data.context_filters import ContextFilterBuilder
+from resolvekit.data.db_manager import DatabaseManager
+from resolvekit.data.query_builder import QueryBuilder
+from resolvekit.normalization.normalizer import TextNormalizer
+from resolvekit.types import Entity, EntityRow, MatchContext
+class AliasRepository:
+    """Repository for alias and FTS operations."""
+    # Derive column list from EntityRow model (single source of truth)
+    ENTITY_COLUMNS: ClassVar[list[str]] = list(EntityRow.model_fields.keys())
+    def __init__(self, db_manager: DatabaseManager, normalizer: TextNormalizer):
+        """
+        Initialize repository.
+        Args:
+            db_manager: Database manager instance
+            normalizer: Text normalizer instance
+        """
+        self.db = db_manager
+        self.normalizer = normalizer
+        self.query_builder = QueryBuilder(db_manager)
+    def find_exact_normalized(
+        self, normalized: str, context: MatchContext | None = None
+    ) -> list[tuple[Entity, str]]:
+        """
+        Find entities with exact normalized alias match.
+        Performance:
+        - Uses idx_aliases_norm index
+        - UNION ALL for overlays (no deduplication in SQL)
+        - Dedupe in Python by DCID (keep highest precedence)
+        Args:
+            normalized: Normalized alias string to match
+            context: Optional filtering context
+        Returns:
+            List of (entity, matched_alias) tuples
+        """
+        params: dict[str, Any] = {"normalized": normalized}
+        # Build context filters using shared utility
+        temporal_filter, type_filter, parent_filter = (
+            ContextFilterBuilder.build_filters(context, params, table_prefix="e")
+        )
+        # Query from main database
+        entity_cols = ", ".join(f"e.{col}" for col in self.ENTITY_COLUMNS)
+        sql = f"""
+            SELECT {entity_cols}, a.alias_text, 0 AS precedence
+            FROM main.aliases a
+            JOIN main.entities e ON a.entity_dcid = e.dcid
+            WHERE a.alias_norm = :normalized
+            {temporal_filter}
+            {type_filter}
+            {parent_filter}
+        """
+        # Add overlays if present
+        for schema_name, precedence in self.db.overlays:
+            sql += f"""
+                UNION ALL
+                SELECT {entity_cols}, a.alias_text, {precedence} AS precedence
+                FROM {schema_name}.aliases a
+                JOIN {schema_name}.entities e ON a.entity_dcid = e.dcid
+                WHERE a.alias_norm = :normalized
+                {temporal_filter}
+                {type_filter}
+                {parent_filter}
+            """
+        # Order by precedence to prioritize overlays
+        sql += "\nORDER BY precedence DESC"
+        result = self.db.execute(sql, params)
+        # Dedupe by DCID (keep first occurrence = highest precedence)
+        seen_dcids = set()
+        matches = []
+        for row in result:
+            entity = self._row_to_entity(row)
+            if entity.dcid not in seen_dcids:
+                seen_dcids.add(entity.dcid)
+                matches.append((entity, row.alias_text))
+        return matches
+    def search_fts(
+        self, query: str, limit: int = 50, context: MatchContext | None = None
+    ) -> list[tuple[Entity, float, int]]:
+        """
+        FTS5 search with BM25 ranking.
+        Performance:
+        - LIMIT pushed to SQL
+        - UNION ALL for overlays
+        - ORDER BY rank LIMIT in SQL
+        Args:
+            query: Query string for FTS search
+            limit: Maximum results to return
+            context: Optional filtering context
+        Returns:
+            List of (entity, bm25_score, rank) tuples
+        """
+        params: dict[str, Any] = {"query": query}
+        # Build context filters using shared utility
+        temporal_filter, type_filter, parent_filter = (
+            ContextFilterBuilder.build_filters(context, params, table_prefix="e")
+        )
+        # Entity columns for SELECT
+        entity_cols = ", ".join(f"e.{col}" for col in self.ENTITY_COLUMNS)
+        # Query from main database
+        # Note: FTS5 rank is negative (closer to 0 = better)
+        # fts.rank is equivalent to bm25(aliases_fts) in SQLite 3.20.0+
+        sql = f"""
+            SELECT {entity_cols}, fts.rank as rank, 0 AS precedence
+            FROM main.aliases_fts fts
+            JOIN main.aliases a ON fts.rowid = a.alias_id
+            JOIN main.entities e ON a.entity_dcid = e.dcid
+            WHERE fts.alias_norm MATCH :query
+            {temporal_filter}
+            {type_filter}
+            {parent_filter}
+        """
+        # Add overlays if present
+        for schema_name, precedence in self.db.overlays:
+            sql += f"""
+                UNION ALL
+                SELECT {entity_cols}, fts.rank as rank, {precedence} AS precedence
+                FROM {schema_name}.aliases_fts fts
+                JOIN {schema_name}.aliases a ON fts.rowid = a.alias_id
+                JOIN {schema_name}.entities e ON a.entity_dcid = e.dcid
+                WHERE fts.alias_norm MATCH :query
+                {temporal_filter}
+                {type_filter}
+                {parent_filter}
+            """
+        # Order by rank (ascending, since negative), then precedence
+        # Apply generous SQL LIMIT to reduce memory/IO while leaving headroom for deduplication
+        sql += f"""
+            ORDER BY rank ASC, precedence DESC
+            LIMIT {limit * 10}
+        """
+        result = self.db.execute(sql, params)
+        # Dedupe by DCID (keep first occurrence = best rank + highest precedence)
+        seen_dcids = set()
+        matches = []
+        rank = 1
+        for row in result:
+            entity = self._row_to_entity(row)
+            if entity.dcid not in seen_dcids:
+                seen_dcids.add(entity.dcid)
+                # FTS rank is negative, convert to positive score
+                bm25_score = abs(row.rank)
+                matches.append((entity, bm25_score, rank))
+                rank += 1
+                # Stop once we have enough unique results
+                if len(matches) >= limit:
+                    break
+        return matches
+    def _row_to_entity(self, row: Any) -> Entity:
+        """
+        Convert database row to Entity model via Pydantic validation.
+        Args:
+            row: Database row object
+        Returns:
+            Validated Entity instance
+        """
+        # Convert SQLAlchemy Row to dict using only entity columns
+        row_dict = {col: getattr(row, col) for col in self.ENTITY_COLUMNS}
+        # Validate through EntityRow model
+        entity_row = EntityRow.model_validate(row_dict)
+        # Convert EntityRow to Entity (adding computed fields)
+        return Entity(
+            **entity_row.model_dump(),
+            codes={},
+            provenance={},
+        )

resolvekit/data/code_repository.py ADDED Viewed

@@ -0,0 +1,85 @@
+"""Repository for code validation and lookup."""
+from resolvekit.data.db_manager import DatabaseManager
+from resolvekit.data.entity_repository import EntityRepository
+from resolvekit.matchers.code_validators import get_validator
+from resolvekit.types import CodeSystem, Entity, MatchContext
+class CodeRepository:
+    """Repository for code validation and lookup."""
+    def __init__(self, db_manager: DatabaseManager, entity_repo: EntityRepository):
+        """
+        Initialize repository.
+        Args:
+            db_manager: Database manager instance
+            entity_repo: Entity repository for lookups
+        """
+        self.db = db_manager
+        self.entity_repo = entity_repo
+    def validate_code(self, system: CodeSystem, value: str) -> tuple[bool, str | None]:
+        """
+        Validate code format using registered validator.
+        Args:
+            system: Code system
+            value: Code value to validate
+        Returns:
+            Tuple of (is_valid, error_message)
+        """
+        try:
+            validator = get_validator(system)
+            return validator.validate(value)
+        except KeyError:
+            return False, f"Unsupported code system: {system}"
+    def find_by_code(
+        self, system: CodeSystem, value: str, context: MatchContext | None = None
+    ) -> Entity | None:
+        """
+        Validate, normalize, then lookup entity by code.
+        Args:
+            system: Code system
+            value: Code value
+            context: Optional filtering context
+        Returns:
+            Entity if found, None otherwise
+        """
+        # Validate format
+        is_valid, error = self.validate_code(system, value)
+        if not is_valid:
+            return None
+        # Normalize code
+        validator = get_validator(system)
+        normalized_value = validator.normalize(value)
+        # Special case: DCID is the entity primary key, not stored in codes table
+        if system == CodeSystem.DCID:
+            entity = self.entity_repo.find_by_dcid(
+                dcid=normalized_value,
+                as_of=context.as_of if context else None,
+            )
+            # Apply remaining context filters
+            if entity and context:
+                # Check entity_type filter
+                if context.entity_type and entity.entity_type != context.entity_type:
+                    return None
+                # Check parent filter
+                if context.parent_dcid and entity.parent_dcid != context.parent_dcid:
+                    return None
+            return entity
+        # Lookup via EntityRepository.find_by_code (with context for SQL-level filtering)
+        entity = self.entity_repo.find_by_code(system.value, normalized_value, context)
+        return entity

resolvekit/data/context_filters.py ADDED Viewed

@@ -0,0 +1,49 @@
+"""Shared utilities for building SQL context filters."""
+from typing import Any
+from resolvekit.types import MatchContext
+class ContextFilterBuilder:
+    """Builder for SQL filter clauses from match context."""
+    @staticmethod
+    def build_filters(
+        context: MatchContext | None,
+        params: dict[str, Any],
+        table_prefix: str = "",
+    ) -> tuple[str, str, str]:
+        """
+        Build SQL filter clauses from match context.
+        Args:
+            context: Optional filtering context
+            params: Parameters dict to update with filter values
+            table_prefix: Optional table alias prefix (e.g., "e" or "")
+                         Will be converted to "e." format if non-empty
+        Returns:
+            Tuple of (temporal_filter, type_filter, parent_filter) SQL clauses
+        """
+        prefix = f"{table_prefix}." if table_prefix else ""
+        temporal_filter = ""
+        if context and context.as_of:
+            temporal_filter = f"""
+                AND ({prefix}valid_from IS NULL OR {prefix}valid_from <= :as_of)
+                AND ({prefix}valid_until IS NULL OR {prefix}valid_until >= :as_of)
+            """
+            params["as_of"] = context.as_of.isoformat()
+        type_filter = ""
+        if context and context.entity_type:
+            type_filter = f"AND {prefix}entity_type = :entity_type"
+            params["entity_type"] = context.entity_type.value
+        parent_filter = ""
+        if context and context.parent_dcid:
+            parent_filter = f"AND {prefix}parent_dcid = :parent_dcid"
+            params["parent_dcid"] = context.parent_dcid
+        return temporal_filter, type_filter, parent_filter