resolvekit 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. resolvekit/README.md +134 -0
  2. resolvekit/__init__.py +67 -0
  3. resolvekit/api/README.md +165 -0
  4. resolvekit/api/__init__.py +10 -0
  5. resolvekit/api/convenience.py +53 -0
  6. resolvekit/api/resolver.py +457 -0
  7. resolvekit/builders/README.md +173 -0
  8. resolvekit/builders/__init__.py +0 -0
  9. resolvekit/calibration/README.md +351 -0
  10. resolvekit/calibration/__init__.py +12 -0
  11. resolvekit/calibration/calibrator.py +184 -0
  12. resolvekit/calibration/features.py +139 -0
  13. resolvekit/calibration/models.py +78 -0
  14. resolvekit/cli/README.md +215 -0
  15. resolvekit/cli/__init__.py +0 -0
  16. resolvekit/cli/main.py +18 -0
  17. resolvekit/config.py +128 -0
  18. resolvekit/constants.py +252 -0
  19. resolvekit/constraints/README.md +102 -0
  20. resolvekit/constraints/__init__.py +17 -0
  21. resolvekit/constraints/constraint_engine.py +111 -0
  22. resolvekit/constraints/hierarchy_validator.py +148 -0
  23. resolvekit/constraints/membership_validator.py +60 -0
  24. resolvekit/constraints/protocols.py +33 -0
  25. resolvekit/constraints/temporal_validator.py +43 -0
  26. resolvekit/constraints/type_validator.py +42 -0
  27. resolvekit/data/README.md +165 -0
  28. resolvekit/data/__init__.py +14 -0
  29. resolvekit/data/alias_repository.py +206 -0
  30. resolvekit/data/code_repository.py +85 -0
  31. resolvekit/data/context_filters.py +49 -0
  32. resolvekit/data/db_manager.py +196 -0
  33. resolvekit/data/entity_repository.py +466 -0
  34. resolvekit/data/membership_repository.py +107 -0
  35. resolvekit/data/query_builder.py +177 -0
  36. resolvekit/data/schema.py +122 -0
  37. resolvekit/disambiguation/README.md +72 -0
  38. resolvekit/disambiguation/__init__.py +0 -0
  39. resolvekit/extraction/README.md +204 -0
  40. resolvekit/extraction/__init__.py +0 -0
  41. resolvekit/matchers/README.md +77 -0
  42. resolvekit/matchers/__init__.py +65 -0
  43. resolvekit/matchers/alias_exact.py +65 -0
  44. resolvekit/matchers/canonical_name.py +62 -0
  45. resolvekit/matchers/cascade.py +127 -0
  46. resolvekit/matchers/code_validators.py +250 -0
  47. resolvekit/matchers/exact_code.py +177 -0
  48. resolvekit/matchers/fts_matcher.py +106 -0
  49. resolvekit/matchers/fuzzy_matcher.py +142 -0
  50. resolvekit/matchers/priorities.py +174 -0
  51. resolvekit/matchers/protocols.py +75 -0
  52. resolvekit/normalization/README.md +192 -0
  53. resolvekit/normalization/__init__.py +8 -0
  54. resolvekit/normalization/normalizer.py +164 -0
  55. resolvekit/overlays/README.md +226 -0
  56. resolvekit/overlays/__init__.py +0 -0
  57. resolvekit/types.py +534 -0
  58. resolvekit/utils/README.md +188 -0
  59. resolvekit/utils/__init__.py +48 -0
  60. resolvekit/utils/cache.py +109 -0
  61. resolvekit/utils/dates.py +339 -0
  62. resolvekit/utils/errors.py +145 -0
  63. resolvekit/utils/files.py +366 -0
  64. resolvekit/utils/logging.py +219 -0
  65. resolvekit/utils/text.py +475 -0
  66. resolvekit/utils/validation.py +301 -0
  67. resolvekit-0.0.1.dist-info/METADATA +36 -0
  68. resolvekit-0.0.1.dist-info/RECORD +70 -0
  69. resolvekit-0.0.1.dist-info/WHEEL +4 -0
  70. resolvekit-0.0.1.dist-info/entry_points.txt +3 -0
resolvekit/README.md ADDED
@@ -0,0 +1,134 @@
1
+ # resolvekit Package Structure
2
+
3
+ This directory contains the main resolvekit package implementation.
4
+
5
+ ## Module Overview
6
+
7
+ ### Core Resolution Pipeline
8
+
9
+ 1. **normalization/** - Text preprocessing and normalization
10
+ - Unicode normalization (NFKC/NFKD)
11
+ - Diacritic handling and case folding
12
+ - Transliteration support
13
+
14
+ 2. **matchers/** - Candidate generation cascade
15
+ - Exact code matcher (ISO, DCID, etc.)
16
+ - Canonical name matcher
17
+ - Alias exact matcher
18
+ - FTS matcher (SQLite FTS5)
19
+ - Fuzzy matcher (bounded)
20
+ - Semantic matcher (optional)
21
+
22
+ 3. **disambiguation/** - Ambiguity resolution
23
+ - Ambiguity detection
24
+ - Semantic sidecar (HNSW)
25
+ - Context analysis
26
+ - Default heuristics
27
+
28
+ 4. **constraints/** - KG and temporal validation
29
+ - Type validation
30
+ - Hierarchy validation
31
+ - Temporal validity
32
+ - Membership validation
33
+
34
+ 5. **calibration/** - Confidence scoring
35
+ - Feature extraction
36
+ - Calibration models
37
+ - Score fusion
38
+
39
+ ### Data Management
40
+
41
+ 6. **data/** - Data storage and access
42
+ - SQLite database management
43
+ - Schema definitions
44
+ - Data models and loaders
45
+ - Query builders
46
+
47
+ 7. **overlays/** - Custom data extensions
48
+ - Overlay management
49
+ - Precedence handling
50
+ - Overlay writers and validators
51
+
52
+ 8. **builders/** - Data pack building
53
+ - ETL pipelines
54
+ - Pack builders
55
+ - Calibration training
56
+ - Quality assurance
57
+
58
+ ### Interfaces
59
+
60
+ 9. **api/** - Python API
61
+ - Main Resolver class
62
+ - Resolution operations
63
+ - Code conversion
64
+ - Hierarchy navigation
65
+
66
+ 10. **cli/** - Command-line interface
67
+ - CLI commands
68
+ - Output formatters
69
+ - Interactive prompts
70
+
71
+ ### Additional Features
72
+
73
+ 11. **extraction/** - Entity extraction from text
74
+ - Dictionary matching
75
+ - NER assistance
76
+ - Context extraction
77
+
78
+ 12. **utils/** - Shared utilities
79
+ - Logging
80
+ - Validation
81
+ - Text utilities
82
+ - Caching
83
+
84
+ ## Key Files
85
+
86
+ - **types.py** - Type definitions and data classes
87
+ - **constants.py** - Constants and configuration defaults
88
+ - **__init__.py** - Package initialization and exports
89
+
90
+ ## Implementation Status
91
+
92
+ Current status: **Phase A - Core Resolver** (scaffolding complete)
93
+
94
+ See `implementation-plan.md` in the repository root for the full implementation roadmap.
95
+
96
+ ## Development Workflow
97
+
98
+ 1. Each module has its own README explaining its purpose and components
99
+ 2. Start with Phase A modules: normalization, matchers, data, calibration, api, cli
100
+ 3. Follow test-driven development (write tests first)
101
+ 4. Maintain type hints and docstrings for all public APIs
102
+ 5. Run linting and type checking before commits
103
+
104
+ ## Quick Start for Developers
105
+
106
+ ```bash
107
+ # Install dependencies
108
+ uv sync
109
+
110
+ # Run tests (when available)
111
+ uv run pytest
112
+
113
+ # Run linting
114
+ uv run ruff check src/resolvekit
115
+
116
+ # Run type checking
117
+ uv run mypy src/resolvekit
118
+
119
+ # Run CLI
120
+ uv run resolvekit
121
+ ```
122
+
123
+ ## Architecture Principles
124
+
125
+ 1. **Bounded cascade**: Fail-fast with early matchers, limit expensive operations
126
+ 2. **Offline-first**: Zero runtime network dependencies
127
+ 3. **Explainable**: Return confidence scores and alternatives with reasoning
128
+ 4. **Extensible**: Support overlays at data, config, and code levels
129
+ 5. **Temporal-aware**: Handle time-varying data from day one
130
+ 6. **Type-safe**: Full type annotations throughout
131
+
132
+ ## Next Steps
133
+
134
+ See individual module READMEs for implementation details and priorities.
resolvekit/__init__.py ADDED
@@ -0,0 +1,67 @@
1
+ """
2
+ resolvekit - An open offline resolver for places and entities.
3
+
4
+ resolvekit is a local, offline-first entity and place resolution system that maps
5
+ messy place/entity strings and codes to canonical entities with calibrated
6
+ confidence scores.
7
+ """
8
+
9
+ from resolvekit.api import Resolver, resolve, resolve_many
10
+ from resolvekit.config import ResolvekitConfig
11
+ from resolvekit.constants import VERSION
12
+ from resolvekit.types import (
13
+ AliasType,
14
+ Candidate,
15
+ CodeSystem,
16
+ Entity,
17
+ EntityType,
18
+ Explanation,
19
+ ExplanationMode,
20
+ ExtractedEntity,
21
+ MatchContext,
22
+ Membership,
23
+ OutputFormat,
24
+ Resolution,
25
+ )
26
+
27
# Mirror the VERSION constant under the conventional ``__version__`` name.
__version__ = VERSION

# Names re-exported as the package's public API. Entries are grouped as
# constants, then classes, then functions/dunders, and kept alphabetical
# within each group so additions have one obvious place to go.
__all__ = [
    "VERSION",
    "AliasType",
    "Candidate",
    "CodeSystem",
    "Entity",
    "EntityType",
    "Explanation",
    "ExplanationMode",
    "ExtractedEntity",
    "MatchContext",
    "Membership",
    "OutputFormat",
    "Resolution",
    "ResolvekitConfig",
    "Resolver",
    "__version__",
    "resolve",
    "resolve_many",
]
49
+
50
+
51
def main() -> None:
    """Run the resolvekit command-line interface.

    Looks up the ``cli`` callable from ``resolvekit.cli.main`` and invokes it
    with no arguments.
    """
    # Deferred to call time: importing the CLI at module import would create
    # a circular import with this package's own ``__init__``.
    from resolvekit.cli.main import cli as run_cli

    run_cli()
57
+
58
+
59
# Distribution metadata exposed as module-level dunder attributes.
__author__ = "Jorge Rivera"
__email__ = "jorge.rivera@one.org"
__license__ = "MIT"
# Adjacent literals are concatenated at compile time into one sentence.
__description__ = (
    "A local, offline-first entity and place resolution system "
    "that maps messy place/entity strings and codes to canonical "
    "entities with calibrated confidence scores"
)
resolvekit/api/README.md ADDED
@@ -0,0 +1,165 @@
1
+ # API Module
2
+
3
+ ## Purpose
4
+
5
+ The API module provides the primary Python programmatic interface for entity resolution, code conversion, and related operations.
6
+
7
+ ## Components
8
+
9
+ ### Core API Classes
10
+
11
+ 1. **Resolver** (`resolver.py`)
12
+ - Main entry point for all resolution operations
13
+ - Manages configuration and data pack loading
14
+ - Orchestrates matchers, constraints, and calibration
15
+
16
+ 2. **Resolution** (`resolution.py`)
17
+ - Data class representing resolution results
18
+ - Contains matched entity, alternatives, confidence, explanation
19
+ - Supports JSON serialization
20
+
21
+ 3. **Config** (`config.py`)
22
+ - Configuration management
23
+ - User-provided settings (thresholds, data paths, etc.)
24
+ - Environment variable support
25
+
26
+ ### API Operations
27
+
28
+ - `resolve.py`: Single entity resolution
29
+ - `batch.py`: Batch resolution operations
30
+ - `convert.py`: Code system conversion
31
+ - `hierarchy.py`: Hierarchy navigation
32
+ - `extract.py`: Entity extraction from text (Phase F)
33
+ - `membership.py`: Group membership queries
34
+
35
+ ## Primary API: Resolver Class
36
+
37
+ ```python
38
+ from resolvekit.api import Resolver
39
+ from datetime import date
40
+
41
+ # Initialize resolver
42
+ resolver = Resolver(
43
+ data_path="/path/to/data/packs",
44
+ custom_overlays=["custom_aliases.yaml"],
45
+ min_confidence=0.7
46
+ )
47
+
48
+ # Single entity resolution
49
+ result = resolver.resolve(
50
+ "Cote d Ivoire",
51
+ entity_type=None,
52
+ parent=None,
53
+ at=None,
54
+ context=None,
55
+ return_alternates=5,
56
+ explain=False
57
+ )
58
+
59
+ # Access results
60
+ print(result.entity.dcid) # "country/CIV"
61
+ print(result.entity.canonical_name) # "Côte d'Ivoire"
62
+ print(result.confidence) # 0.95
63
+ print(result.alternatives) # [Entity, Entity, ...]
64
+
65
+ # Code conversion
66
+ dcid = resolver.code_to_dcid("FRA", code_type="iso3")
67
+ codes = resolver.dcid_to_codes("country/FRA")
68
+
69
+ # Batch operations
70
+ import pandas as pd
71
+ df = pd.DataFrame({"location": ["France", "UK", "Türkiye"]})
72
+ results = resolver.resolve_many(df["location"].tolist())
73
+
74
+ # Temporal queries
75
+ eu_2004 = resolver.get_group_members("EU", as_of=date(2004, 1, 1))
76
+ was_member = resolver.check_membership(
77
+ "country/POL",
78
+ group="EU",
79
+ as_of=date(2003, 12, 31)
80
+ )
81
+
82
+ # Hierarchy navigation
83
+ children = resolver.get_children("country/FRA", admin_level=1)
84
+ parent = resolver.get_parent("geoId/06")
85
+ path = resolver.get_hierarchy_path("some/admin3/entity")
86
+ ```
87
+
88
+ ## Resolution Result Structure
89
+
90
+ ```python
91
+ @dataclass
92
+ class Resolution:
93
+ """Resolution result."""
94
+
95
+ entity: Entity | None # Primary match
96
+ confidence: float # Calibrated probability
97
+ alternatives: list[Entity] # Alternative candidates
98
+ explanation: Explanation | None # Why this match (if explain=True)
99
+
100
+ def to_dict(self) -> dict:
101
+ """Convert to dictionary."""
102
+ ...
103
+
104
+ def to_json(self) -> str:
105
+ """Serialize to JSON."""
106
+ ...
107
+
108
+ @dataclass
109
+ class Entity:
110
+ """Resolved entity."""
111
+
112
+ dcid: str
113
+ canonical_name: str
114
+ entity_type: str
115
+ codes: dict[str, str] # {system: code}
116
+ parent_dcid: str | None
117
+ valid_from: date | None
118
+ valid_until: date | None
119
+
120
+ @dataclass
121
+ class Explanation:
122
+ """Resolution explanation."""
123
+
124
+ stages: list[str] # Stages executed
125
+ candidates: list[Candidate] # All candidates with features
126
+ rules_applied: list[str] # Disambiguation rules
127
+ calibration: dict # Model details
128
+ ```
129
+
130
+ ## Design Principles
131
+
132
+ 1. **Pythonic**: `snake_case` names, context managers, type hints
133
+ 2. **Sensible defaults**: Works out of the box without configuration
134
+ 3. **Progressive disclosure**: Simple for basic use, powerful for advanced
135
+ 4. **Type safety**: Full type annotations for IDE support
136
+ 5. **Pandas integration**: Native support for DataFrame operations
137
+
138
+ ## Error Handling
139
+
140
+ ```python
141
+ from resolvekit.api import Resolver, ResolutionError, ConfigError
142
+
143
+ try:
144
+ resolver = Resolver()
145
+ except ConfigError as e:
146
+ # Handle configuration errors
147
+ print(f"Config error: {e}")
148
+
149
+ try:
150
+ result = resolver.resolve("invalid input")
151
+ except ResolutionError as e:
152
+ # Handle resolution errors
153
+ print(f"Resolution failed: {e}")
154
+
155
+ # No match returns a Resolution whose entity is None; no exception is raised
156
+ result = resolver.resolve("zzzzzz")
157
+ if result.entity is None:
158
+ print("No match found")
159
+ ```
160
+
161
+ ## Implementation Priority
162
+
163
+ **Phase A** - Core resolver (Resolver class, resolve, batch)
164
+ **Phase B** - Code conversion and hierarchy APIs
165
+ **Phase F** - Entity extraction
resolvekit/api/__init__.py ADDED
@@ -0,0 +1,10 @@
1
+ """Public Resolver API."""
2
+
3
+ from resolvekit.api.convenience import resolve, resolve_many
4
+ from resolvekit.api.resolver import Resolver
5
+
6
# Public names re-exported by the resolvekit.api package.
__all__ = ["Resolver", "resolve", "resolve_many"]
resolvekit/api/convenience.py ADDED
@@ -0,0 +1,53 @@
1
+ """Convenience functions for quick one-off queries."""
2
+
3
+ from typing import Any
4
+
5
+ from resolvekit.api.resolver import Resolver
6
+ from resolvekit.types import MatchContext, Resolution
7
+
8
+
9
def resolve(
    query: str,
    context: MatchContext | None = None,
    **resolver_kwargs: Any,
) -> Resolution:
    """
    Resolve one query string using a throwaway Resolver.

    A Resolver is built from ``resolver_kwargs`` and used as a context
    manager for this single call. When issuing many queries, build one
    Resolver yourself and reuse it for better performance.

    Args:
        query: Entity string to resolve
        context: Optional match context, forwarded to ``Resolver.resolve``
        **resolver_kwargs: Forwarded unchanged to the Resolver() constructor
            (min_confidence, explanation_mode, etc.)

    Returns:
        The Resolution returned by ``Resolver.resolve``
    """
    # Return from inside the ``with`` so the resolver's exit hook runs on the
    # way out, whether the call succeeds or raises.
    with Resolver(**resolver_kwargs) as one_shot:
        return one_shot.resolve(query, context=context)
31
+
32
+
33
def resolve_many(
    queries: list[str],
    context: MatchContext | list[MatchContext | None] | None = None,
    **resolver_kwargs: Any,
) -> list[Resolution]:
    """
    Resolve a batch of query strings using a throwaway Resolver.

    A Resolver is built from ``resolver_kwargs`` and used as a context
    manager for this single batch. When resolving repeatedly, build one
    Resolver yourself and reuse it for better performance.

    Args:
        queries: List of query strings
        context: Optional context, either one shared value or a per-query
            list; forwarded as-is to ``Resolver.resolve_many``
        **resolver_kwargs: Forwarded unchanged to the Resolver() constructor

    Returns:
        The list of Resolution objects returned by ``Resolver.resolve_many``
    """
    # Return from inside the ``with`` so the resolver's exit hook runs on the
    # way out, whether the call succeeds or raises.
    with Resolver(**resolver_kwargs) as batch_resolver:
        return batch_resolver.resolve_many(queries, context=context)