PyPI - resolvekit - Versions diffs - 0.0.1__py3-none-any.whl - Mend

resolvekit 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (70) hide show

resolvekit/README.md +134 -0
resolvekit/__init__.py +67 -0
resolvekit/api/README.md +165 -0
resolvekit/api/__init__.py +10 -0
resolvekit/api/convenience.py +53 -0
resolvekit/api/resolver.py +457 -0
resolvekit/builders/README.md +173 -0
resolvekit/builders/__init__.py +0 -0
resolvekit/calibration/README.md +351 -0
resolvekit/calibration/__init__.py +12 -0
resolvekit/calibration/calibrator.py +184 -0
resolvekit/calibration/features.py +139 -0
resolvekit/calibration/models.py +78 -0
resolvekit/cli/README.md +215 -0
resolvekit/cli/__init__.py +0 -0
resolvekit/cli/main.py +18 -0
resolvekit/config.py +128 -0
resolvekit/constants.py +252 -0
resolvekit/constraints/README.md +102 -0
resolvekit/constraints/__init__.py +17 -0
resolvekit/constraints/constraint_engine.py +111 -0
resolvekit/constraints/hierarchy_validator.py +148 -0
resolvekit/constraints/membership_validator.py +60 -0
resolvekit/constraints/protocols.py +33 -0
resolvekit/constraints/temporal_validator.py +43 -0
resolvekit/constraints/type_validator.py +42 -0
resolvekit/data/README.md +165 -0
resolvekit/data/__init__.py +14 -0
resolvekit/data/alias_repository.py +206 -0
resolvekit/data/code_repository.py +85 -0
resolvekit/data/context_filters.py +49 -0
resolvekit/data/db_manager.py +196 -0
resolvekit/data/entity_repository.py +466 -0
resolvekit/data/membership_repository.py +107 -0
resolvekit/data/query_builder.py +177 -0
resolvekit/data/schema.py +122 -0
resolvekit/disambiguation/README.md +72 -0
resolvekit/disambiguation/__init__.py +0 -0
resolvekit/extraction/README.md +204 -0
resolvekit/extraction/__init__.py +0 -0
resolvekit/matchers/README.md +77 -0
resolvekit/matchers/__init__.py +65 -0
resolvekit/matchers/alias_exact.py +65 -0
resolvekit/matchers/canonical_name.py +62 -0
resolvekit/matchers/cascade.py +127 -0
resolvekit/matchers/code_validators.py +250 -0
resolvekit/matchers/exact_code.py +177 -0
resolvekit/matchers/fts_matcher.py +106 -0
resolvekit/matchers/fuzzy_matcher.py +142 -0
resolvekit/matchers/priorities.py +174 -0
resolvekit/matchers/protocols.py +75 -0
resolvekit/normalization/README.md +192 -0
resolvekit/normalization/__init__.py +8 -0
resolvekit/normalization/normalizer.py +164 -0
resolvekit/overlays/README.md +226 -0
resolvekit/overlays/__init__.py +0 -0
resolvekit/types.py +534 -0
resolvekit/utils/README.md +188 -0
resolvekit/utils/__init__.py +48 -0
resolvekit/utils/cache.py +109 -0
resolvekit/utils/dates.py +339 -0
resolvekit/utils/errors.py +145 -0
resolvekit/utils/files.py +366 -0
resolvekit/utils/logging.py +219 -0
resolvekit/utils/text.py +475 -0
resolvekit/utils/validation.py +301 -0
resolvekit-0.0.1.dist-info/METADATA +36 -0
resolvekit-0.0.1.dist-info/RECORD +70 -0
resolvekit-0.0.1.dist-info/WHEEL +4 -0
resolvekit-0.0.1.dist-info/entry_points.txt +3 -0

resolvekit/matchers/exact_code.py ADDED Viewed

@@ -0,0 +1,177 @@
+"""Exact code matcher for deterministic code lookups."""
+from resolvekit.data.code_repository import CodeRepository
+from resolvekit.matchers.code_validators import get_validator
+from resolvekit.matchers.priorities import InferencePriority
+from resolvekit.types import Candidate, CodeSystem, MatchContext, MatcherType
class ExactCodeMatcher:
    """
    Resolve queries that are codes through deterministic repository lookup.

    Tier 1 matcher: a hit is returned as a single candidate with score 1.0.

    Supports:
    - Explicit code system: "iso2:US", "dcid:country/USA"
    - Inferred code system: candidate systems are derived from the value's
      format and tried in priority order until one lookup succeeds
    """

    def __init__(
        self,
        code_repo: CodeRepository,
        priority_order: list[CodeSystem] | None = None,
    ):
        """
        Initialize matcher.

        Args:
            code_repo: Code repository used for validation and lookup
            priority_order: Optional priority order for code system inference.
                          If None (or empty), uses InferencePriority.default()
        """
        self.code_repo = code_repo
        # Truthiness check kept on purpose: an empty list also falls back to
        # the default ordering, exactly as before.
        self.priority = priority_order or InferencePriority.default()

    def match(
        self,
        query: str,
        normalized_query: str,
        limit: int = 10,
        context: MatchContext | None = None,
    ) -> list[Candidate]:
        """
        Try to parse query as code and lookup.

        Args:
            query: Original query string
            normalized_query: Normalized query (unused for code matching)
            limit: Maximum candidates (unused, always returns 0 or 1)
            context: Optional filtering context, forwarded to the repository

        Returns:
            List with single candidate if match found, empty list otherwise
        """
        # Parse code: "system:value" or just "value"
        systems, value = self._parse_code(query)
        if not systems or value is None:
            return []

        # Try each candidate system in priority order until lookup succeeds
        for system in systems:
            # Skip systems whose format check rejects the value
            is_valid, _ = self.code_repo.validate_code(system, value)
            if not is_valid:
                continue

            entity = self.code_repo.find_by_code(system, value, context)
            if entity:
                # First hit wins: return it as the only candidate
                return [
                    Candidate(
                        entity=entity,
                        score=1.0,
                        matcher_type=MatcherType.EXACT_CODE,
                        features={"exact_code": True, "code_system": system.value},
                        matched_alias=None,
                    )
                ]

        # No match found in any candidate system
        return []

    def _parse_code(self, query: str) -> tuple[list[CodeSystem], str | None]:
        """
        Parse query as code with optional system prefix.

        Formats:
            - "system:value" -> explicit system (single-item list)
            - "value" -> infer systems from format (priority-ordered list)

        Surrounding whitespace is ignored, both around the whole query and
        around each side of the ":" separator.

        Args:
            query: Query string

        Returns:
            Tuple of (code_systems_list, code_value) or ([], None) if unparseable
        """
        text = query.strip()
        if not text:
            return [], None

        # Explicit system: split on the FIRST colon only, so values that
        # themselves contain ":" stay intact.
        system_str, separator, value = text.partition(":")
        if separator:
            try:
                system = CodeSystem(system_str.strip().lower())
            except ValueError:
                # Prefix is not a known code system
                return [], None
            value = value.strip()
            if not value:
                return [], None
            return [system], value

        # Infer systems from format (returns priority-ordered list).
        # Written as explicit branches: the former one-liner
        # `return systems, query if systems else ([], None)` bound the
        # conditional to the second tuple element only and therefore
        # returned ([], ([], None)) when nothing could be inferred.
        systems = self._infer_code_system(text)
        if not systems:
            return [], None
        return systems, text

    def _infer_code_system(self, value: str) -> list[CodeSystem]:
        """
        Infer candidate code systems from value format using hybrid approach.

        Strategy:
        1. Fast path: if _fast_path_inference() recognises a unique pattern,
           return that system as a single-item list
        2. Priority iteration: otherwise collect every system in
           self.priority whose validator accepts the value

        Args:
            value: Code value

        Returns:
            List of candidate code systems in priority order (empty if none match)
        """
        fast_path_result = self._fast_path_inference(value)
        if fast_path_result is not None:
            # Unambiguous pattern - return single candidate
            return [fast_path_result]

        # Priority-based validation: collect ALL matching systems in priority order
        candidates = []
        for system in self.priority:
            # DCID / Wikidata were already ruled out by the fast path
            if system in {CodeSystem.DCID, CodeSystem.WIKIDATA}:
                continue
            is_valid, _ = get_validator(system).validate(value)
            if is_valid:
                candidates.append(system)
        return candidates

    def _fast_path_inference(self, value: str) -> CodeSystem | None:
        """
        Fast path for unique patterns that don't need validation.

        Uses plain string checks only; no validator is consulted.

        Args:
            value: Code value

        Returns:
            Code system if unique pattern detected, None otherwise
        """
        # DCID: Contains "/" (e.g., "country/USA", "geoId/06")
        if "/" in value:
            return CodeSystem.DCID

        # Wikidata: "Q" (either case) followed by digits only (e.g., "Q30")
        if len(value) > 1 and value[0].upper() == "Q" and value[1:].isdigit():
            return CodeSystem.WIKIDATA

        # ISO3166-2: "-" preceded by a 2-3 letter prefix (e.g., "US-CA")
        if "-" in value:
            prefix = value.partition("-")[0]
            if 2 <= len(prefix) <= 3 and prefix.isalpha():
                return CodeSystem.ISO3166_2

        return None

resolvekit/matchers/fts_matcher.py ADDED Viewed

@@ -0,0 +1,106 @@
+"""FTS matcher using SQLite FTS5 for text search."""
+from resolvekit.data.alias_repository import AliasRepository
+from resolvekit.types import Candidate, MatchContext, MatcherType
class FTSMatcher:
    """
    Full-text search matcher backed by the alias repository.

    Tier 3 matcher: returns up to ``limit`` ranked candidates.
    The search itself (and the limit) is delegated to
    ``alias_repo.search_fts``; this class only converts hits to candidates.
    """

    def __init__(self, alias_repo: AliasRepository):
        """
        Initialize matcher.

        Args:
            alias_repo: Alias repository for FTS queries
        """
        self.alias_repo = alias_repo

    def match(
        self,
        query: str,
        normalized_query: str,
        limit: int = 50,
        context: MatchContext | None = None,
    ) -> list[Candidate]:
        """
        Run the FTS search and wrap each hit in a Candidate.

        Args:
            query: Original query string (unused)
            normalized_query: Normalized query string, sent to the repository
            limit: Maximum candidates to return (default 50 for Tier 3)
            context: Optional filtering context, forwarded to the repository

        Returns:
            One candidate per hit, in the order the repository returned them;
            empty list when there are no hits
        """
        hits = self.alias_repo.search_fts(
            query=normalized_query, limit=limit, context=context
        )
        if not hits:
            return []

        # Each hit is unpacked as (entity, raw score, rank position)
        return [
            Candidate(
                entity=entity,
                score=self._normalize_bm25(raw_score, position, len(hits)),
                matcher_type=MatcherType.FTS,
                features={"fts_score": raw_score, "fts_rank": position},
                matched_alias=None,
            )
            for entity, raw_score, position in hits
        ]

    def _normalize_bm25(self, bm25_score: float, rank: int, total: int) -> float:
        """
        Map a hit's rank to a fixed score bucket.

        Only ``rank`` is used; ``bm25_score`` and ``total`` are accepted but
        currently ignored. Buckets:

            rank == 1   -> 0.9
            rank <= 3   -> 0.8
            rank <= 10  -> 0.7
            rank <= 20  -> 0.6
            otherwise   -> 0.5

        Args:
            bm25_score: Raw BM25 score (unused)
            rank: Rank position
            total: Total number of results (unused)

        Returns:
            Bucketed score between 0.5 and 0.9
        """
        if rank == 1:
            return 0.9
        # Upper rank bound (inclusive) -> score, checked in ascending order
        for upper_rank, bucket_score in ((3, 0.8), (10, 0.7), (20, 0.6)):
            if rank <= upper_rank:
                return bucket_score
        return 0.5

resolvekit/matchers/fuzzy_matcher.py ADDED Viewed

@@ -0,0 +1,142 @@
+"""Fuzzy matcher for refining FTS results with similarity scoring."""
+from rapidfuzz import fuzz
+from resolvekit.normalization.normalizer import TextNormalizer
+from resolvekit.types import Candidate, MatchContext
class FuzzyMatcher:
    """
    Bounded fuzzy matching on FTS candidates.

    Tier 3 matcher: Refines FTS results, returns top-K'.
    Uses rapidfuzz for edit similarity plus a trigram Jaccard score.
    Only operates on the candidates it is given (bounded computation).
    """

    def __init__(self, normalizer: TextNormalizer):
        """
        Initialize matcher.

        Args:
            normalizer: Text normalizer for canonical name normalization
        """
        self.normalizer = normalizer

    def match(
        self,
        query: str,
        normalized_query: str,
        limit: int = 12,
        context: MatchContext | None = None,
        fts_candidates: list[Candidate] | None = None,
    ) -> list[Candidate]:
        """
        Refine FTS candidates with fuzzy scoring.

        For every candidate the canonical name is normalized once and
        compared with the normalized query:
        - edit similarity: rapidfuzz ``fuzz.ratio`` scaled to 0-1
        - trigram Jaccard similarity
        - combined score: 0.6 * edit + 0.4 * trigram

        Note: the given candidates are updated IN PLACE (``features`` gains
        the three fuzzy keys, ``score`` becomes max(old score, fuzzy score)).

        Args:
            query: Original query string (unused)
            normalized_query: Normalized query string
            limit: Maximum candidates to return (top-K')
            context: Optional filtering context (unused)
            fts_candidates: FTS candidates to refine

        Returns:
            The same candidate objects, ordered by their updated score
            (max of FTS score and fuzzy score, descending), cut to ``limit``
        """
        if not fts_candidates:
            return []

        # The query side is identical for every candidate: build it once
        query_trigrams = self._get_trigrams(normalized_query)

        scored = []
        for candidate in fts_candidates:
            # Normalize the name once and reuse it for both similarities
            name_norm = self.normalizer.normalize(candidate.entity.canonical_name)

            # Edit similarity (rapidfuzz.fuzz.ratio returns 0-100)
            edit_score = fuzz.ratio(normalized_query, name_norm) / 100.0
            trigram_score = self._jaccard(
                query_trigrams, self._get_trigrams(name_norm)
            )
            fuzzy_score = 0.6 * edit_score + 0.4 * trigram_score

            candidate.features.update(
                {
                    "edit_similarity": edit_score,
                    "trigram_jaccard": trigram_score,
                    "fuzzy_score": fuzzy_score,
                }
            )
            # Never lower a score that the FTS stage already assigned
            candidate.score = max(candidate.score, fuzzy_score)
            scored.append(candidate)

        # Sort by score (descending) and return top-K'
        scored.sort(key=lambda c: c.score, reverse=True)
        return scored[:limit]

    def _trigram_jaccard(self, s1: str, s2: str) -> float:
        """
        Compute trigram Jaccard similarity.

        ``s1`` is used as given; ``s2`` is passed through the normalizer first.

        Args:
            s1: First string (expected to be normalized already)
            s2: Second string (normalized here)

        Returns:
            Jaccard similarity (0-1); 0.0 when either string has no trigrams
        """
        s2_norm = self.normalizer.normalize(s2)
        return self._jaccard(self._get_trigrams(s1), self._get_trigrams(s2_norm))

    @staticmethod
    def _jaccard(first: set[str], second: set[str]) -> float:
        """Return |intersection| / |union| of two sets, 0.0 if either is empty."""
        if not first or not second:
            return 0.0
        return len(first & second) / len(first | second)

    def _get_trigrams(self, s: str) -> set[str]:
        """
        Generate trigrams (3-character windows) from string.

        Examples:
            "abc" -> {"abc"}
            "abcd" -> {"abc", "bcd"}

        Args:
            s: Input string

        Returns:
            Set of trigrams; empty for strings shorter than 3 characters
        """
        if len(s) < 3:
            return set()
        return {s[i : i + 3] for i in range(len(s) - 2)}

resolvekit/matchers/priorities.py ADDED Viewed

@@ -0,0 +1,174 @@
+"""Code system inference priority presets for different domains."""
+from resolvekit.types import CodeSystem
class InferencePriority:
    """
    Preset priority orderings for code system inference.

    An ambiguous code can be valid in several code systems; each preset
    returns the order in which the systems should be tried for a domain.
    Every call builds and returns a new list, so callers may modify it.

    Usage:
        # Use a preset
        cascade = MatcherCascade(..., code_priority=InferencePriority.humanitarian())
        # Or customize
        custom_priority = [CodeSystem.WB, CodeSystem.ISO2, CodeSystem.ISO3]
        cascade = MatcherCascade(..., code_priority=custom_priority)
    """

    @staticmethod
    def default() -> list[CodeSystem]:
        """
        General-purpose priority order.

        Order: unique patterns (DCID, Wikidata), then common standards
        (ISO / M49 codes), then domain-specific systems, with FIPS last.
        """
        unique_patterns = [
            CodeSystem.DCID,  # "country/USA"
            CodeSystem.WIKIDATA,  # "Q30"
        ]
        common_standards = [
            CodeSystem.ISO2,  # 2-letter country codes
            CodeSystem.ISO_NUMERIC,  # 3-digit numeric
            CodeSystem.M49,  # UN region codes
            CodeSystem.ISO3,  # 3-letter country codes
            CodeSystem.ISO3166_2,  # "US-CA" subdivision codes
        ]
        domain_specific = [
            CodeSystem.GEONAMES,
            CodeSystem.WB,
            CodeSystem.DAC,
            CodeSystem.NUTS,
            CodeSystem.LAU,
            CodeSystem.PCODE,
            CodeSystem.FIPS,  # last resort
        ]
        return unique_patterns + common_standards + domain_specific

    @staticmethod
    def humanitarian() -> list[CodeSystem]:
        """
        Humanitarian/OCHA context priority.

        Same unique patterns first, then P-codes and M49 ahead of the
        ISO codes and the remaining systems.
        """
        unique_patterns = [CodeSystem.DCID, CodeSystem.WIKIDATA]
        prioritized = [CodeSystem.PCODE, CodeSystem.M49]
        remaining = [
            CodeSystem.ISO2,
            CodeSystem.ISO3,
            CodeSystem.ISO_NUMERIC,
            CodeSystem.ISO3166_2,
            CodeSystem.GEONAMES,
            CodeSystem.WB,
            CodeSystem.DAC,
            CodeSystem.NUTS,
            CodeSystem.LAU,
            CodeSystem.FIPS,
        ]
        return unique_patterns + prioritized + remaining

    @staticmethod
    def world_bank() -> list[CodeSystem]:
        """
        World Bank context priority.

        Same unique patterns first, then World Bank and DAC codes ahead of
        the ISO codes and the remaining systems.
        """
        unique_patterns = [CodeSystem.DCID, CodeSystem.WIKIDATA]
        prioritized = [CodeSystem.WB, CodeSystem.DAC]
        remaining = [
            CodeSystem.ISO2,
            CodeSystem.ISO3,
            CodeSystem.ISO_NUMERIC,
            CodeSystem.M49,
            CodeSystem.ISO3166_2,
            CodeSystem.GEONAMES,
            CodeSystem.PCODE,
            CodeSystem.NUTS,
            CodeSystem.LAU,
            CodeSystem.FIPS,
        ]
        return unique_patterns + prioritized + remaining

    @staticmethod
    def european_union() -> list[CodeSystem]:
        """
        European Union statistical context priority.

        Same unique patterns first, then NUTS and LAU codes ahead of the
        ISO codes and the remaining systems.
        """
        unique_patterns = [CodeSystem.DCID, CodeSystem.WIKIDATA]
        prioritized = [CodeSystem.NUTS, CodeSystem.LAU]
        remaining = [
            CodeSystem.ISO2,
            CodeSystem.ISO3,
            CodeSystem.ISO3166_2,
            CodeSystem.ISO_NUMERIC,
            CodeSystem.M49,
            CodeSystem.GEONAMES,
            CodeSystem.WB,
            CodeSystem.DAC,
            CodeSystem.PCODE,
            CodeSystem.FIPS,
        ]
        return unique_patterns + prioritized + remaining

    @staticmethod
    def academic() -> list[CodeSystem]:
        """
        Academic/research context priority.

        Wikidata, DCID and GeoNames come first, followed by the ISO / M49
        codes and the remaining systems.
        """
        prioritized = [
            CodeSystem.WIKIDATA,
            CodeSystem.DCID,
            CodeSystem.GEONAMES,
        ]
        remaining = [
            CodeSystem.ISO2,
            CodeSystem.ISO3,
            CodeSystem.ISO_NUMERIC,
            CodeSystem.M49,
            CodeSystem.ISO3166_2,
            CodeSystem.WB,
            CodeSystem.DAC,
            CodeSystem.NUTS,
            CodeSystem.LAU,
            CodeSystem.PCODE,
            CodeSystem.FIPS,
        ]
        return prioritized + remaining

resolvekit/matchers/protocols.py ADDED Viewed

@@ -0,0 +1,75 @@
+"""Protocols for matcher implementations."""
+from typing import Protocol
+from resolvekit.types import Candidate, MatchContext
class MatcherProtocol(Protocol):
    """
    Structural (duck-typed) interface for matchers.

    A class does not have to inherit from this protocol; providing a
    compatible ``match()`` method is enough to satisfy it.
    """

    def match(
        self,
        query: str,
        normalized_query: str,
        limit: int = 10,
        context: MatchContext | None = None,
    ) -> list[Candidate]:
        """
        Find matching candidates for a query.

        Args:
            query: Original query string
            normalized_query: Normalized version of the query
            limit: Maximum candidates to return
            context: Optional filtering context

        Returns:
            List of candidates ordered by score (descending)
        """
        ...
class CodeValidatorProtocol(Protocol):
    """
    Structural (duck-typed) interface for code system validators.

    A validator for one code system offers a format check (``validate``)
    and a canonicalization step (``normalize``).
    """

    def validate(self, value: str) -> tuple[bool, str | None]:
        """
        Validate code format.

        Args:
            value: Code value to validate

        Returns:
            Tuple of (is_valid, error_message).
            If valid, error_message is None.
        """
        ...

    def normalize(self, value: str) -> str:
        """
        Normalize code to canonical form.

        Examples:
            - "us" -> "US" (ISO2)
            - "fra" -> "FRA" (ISO3)
            - "Q123" -> "Q123" (Wikidata, case-sensitive)

        Args:
            value: Code value to normalize

        Returns:
            Normalized code value
        """
        ...