resolvekit 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- resolvekit/README.md +134 -0
- resolvekit/__init__.py +67 -0
- resolvekit/api/README.md +165 -0
- resolvekit/api/__init__.py +10 -0
- resolvekit/api/convenience.py +53 -0
- resolvekit/api/resolver.py +457 -0
- resolvekit/builders/README.md +173 -0
- resolvekit/builders/__init__.py +0 -0
- resolvekit/calibration/README.md +351 -0
- resolvekit/calibration/__init__.py +12 -0
- resolvekit/calibration/calibrator.py +184 -0
- resolvekit/calibration/features.py +139 -0
- resolvekit/calibration/models.py +78 -0
- resolvekit/cli/README.md +215 -0
- resolvekit/cli/__init__.py +0 -0
- resolvekit/cli/main.py +18 -0
- resolvekit/config.py +128 -0
- resolvekit/constants.py +252 -0
- resolvekit/constraints/README.md +102 -0
- resolvekit/constraints/__init__.py +17 -0
- resolvekit/constraints/constraint_engine.py +111 -0
- resolvekit/constraints/hierarchy_validator.py +148 -0
- resolvekit/constraints/membership_validator.py +60 -0
- resolvekit/constraints/protocols.py +33 -0
- resolvekit/constraints/temporal_validator.py +43 -0
- resolvekit/constraints/type_validator.py +42 -0
- resolvekit/data/README.md +165 -0
- resolvekit/data/__init__.py +14 -0
- resolvekit/data/alias_repository.py +206 -0
- resolvekit/data/code_repository.py +85 -0
- resolvekit/data/context_filters.py +49 -0
- resolvekit/data/db_manager.py +196 -0
- resolvekit/data/entity_repository.py +466 -0
- resolvekit/data/membership_repository.py +107 -0
- resolvekit/data/query_builder.py +177 -0
- resolvekit/data/schema.py +122 -0
- resolvekit/disambiguation/README.md +72 -0
- resolvekit/disambiguation/__init__.py +0 -0
- resolvekit/extraction/README.md +204 -0
- resolvekit/extraction/__init__.py +0 -0
- resolvekit/matchers/README.md +77 -0
- resolvekit/matchers/__init__.py +65 -0
- resolvekit/matchers/alias_exact.py +65 -0
- resolvekit/matchers/canonical_name.py +62 -0
- resolvekit/matchers/cascade.py +127 -0
- resolvekit/matchers/code_validators.py +250 -0
- resolvekit/matchers/exact_code.py +177 -0
- resolvekit/matchers/fts_matcher.py +106 -0
- resolvekit/matchers/fuzzy_matcher.py +142 -0
- resolvekit/matchers/priorities.py +174 -0
- resolvekit/matchers/protocols.py +75 -0
- resolvekit/normalization/README.md +192 -0
- resolvekit/normalization/__init__.py +8 -0
- resolvekit/normalization/normalizer.py +164 -0
- resolvekit/overlays/README.md +226 -0
- resolvekit/overlays/__init__.py +0 -0
- resolvekit/types.py +534 -0
- resolvekit/utils/README.md +188 -0
- resolvekit/utils/__init__.py +48 -0
- resolvekit/utils/cache.py +109 -0
- resolvekit/utils/dates.py +339 -0
- resolvekit/utils/errors.py +145 -0
- resolvekit/utils/files.py +366 -0
- resolvekit/utils/logging.py +219 -0
- resolvekit/utils/text.py +475 -0
- resolvekit/utils/validation.py +301 -0
- resolvekit-0.0.1.dist-info/METADATA +36 -0
- resolvekit-0.0.1.dist-info/RECORD +70 -0
- resolvekit-0.0.1.dist-info/WHEEL +4 -0
- resolvekit-0.0.1.dist-info/entry_points.txt +3 -0
resolvekit/README.md
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
# resolvekit Package Structure
|
|
2
|
+
|
|
3
|
+
This directory contains the main resolvekit package implementation.
|
|
4
|
+
|
|
5
|
+
## Module Overview
|
|
6
|
+
|
|
7
|
+
### Core Resolution Pipeline
|
|
8
|
+
|
|
9
|
+
1. **normalization/** - Text preprocessing and normalization
|
|
10
|
+
- Unicode normalization (NFKC/NFKD)
|
|
11
|
+
- Diacritic handling and case folding
|
|
12
|
+
- Transliteration support
|
|
13
|
+
|
|
14
|
+
2. **matchers/** - Candidate generation cascade
|
|
15
|
+
- Exact code matcher (ISO, DCID, etc.)
|
|
16
|
+
- Canonical name matcher
|
|
17
|
+
- Alias exact matcher
|
|
18
|
+
- FTS matcher (SQLite FTS5)
|
|
19
|
+
- Fuzzy matcher (bounded)
|
|
20
|
+
- Semantic matcher (optional)
|
|
21
|
+
|
|
22
|
+
3. **disambiguation/** - Ambiguity resolution
|
|
23
|
+
- Ambiguity detection
|
|
24
|
+
- Semantic sidecar (HNSW)
|
|
25
|
+
- Context analysis
|
|
26
|
+
- Default heuristics
|
|
27
|
+
|
|
28
|
+
4. **constraints/** - KG and temporal validation
|
|
29
|
+
- Type validation
|
|
30
|
+
- Hierarchy validation
|
|
31
|
+
- Temporal validity
|
|
32
|
+
- Membership validation
|
|
33
|
+
|
|
34
|
+
5. **calibration/** - Confidence scoring
|
|
35
|
+
- Feature extraction
|
|
36
|
+
- Calibration models
|
|
37
|
+
- Score fusion
|
|
38
|
+
|
|
39
|
+
### Data Management
|
|
40
|
+
|
|
41
|
+
6. **data/** - Data storage and access
|
|
42
|
+
- SQLite database management
|
|
43
|
+
- Schema definitions
|
|
44
|
+
- Data models and loaders
|
|
45
|
+
- Query builders
|
|
46
|
+
|
|
47
|
+
7. **overlays/** - Custom data extensions
|
|
48
|
+
- Overlay management
|
|
49
|
+
- Precedence handling
|
|
50
|
+
- Overlay writers and validators
|
|
51
|
+
|
|
52
|
+
8. **builders/** - Data pack building
|
|
53
|
+
- ETL pipelines
|
|
54
|
+
- Pack builders
|
|
55
|
+
- Calibration training
|
|
56
|
+
- Quality assurance
|
|
57
|
+
|
|
58
|
+
### Interfaces
|
|
59
|
+
|
|
60
|
+
9. **api/** - Python API
|
|
61
|
+
- Main Resolver class
|
|
62
|
+
- Resolution operations
|
|
63
|
+
- Code conversion
|
|
64
|
+
- Hierarchy navigation
|
|
65
|
+
|
|
66
|
+
10. **cli/** - Command-line interface
|
|
67
|
+
- CLI commands
|
|
68
|
+
- Output formatters
|
|
69
|
+
- Interactive prompts
|
|
70
|
+
|
|
71
|
+
### Additional Features
|
|
72
|
+
|
|
73
|
+
11. **extraction/** - Entity extraction from text
|
|
74
|
+
- Dictionary matching
|
|
75
|
+
- NER assistance
|
|
76
|
+
- Context extraction
|
|
77
|
+
|
|
78
|
+
12. **utils/** - Shared utilities
|
|
79
|
+
- Logging
|
|
80
|
+
- Validation
|
|
81
|
+
- Text utilities
|
|
82
|
+
- Caching
|
|
83
|
+
|
|
84
|
+
## Key Files
|
|
85
|
+
|
|
86
|
+
- **types.py** - Type definitions and data classes
|
|
87
|
+
- **constants.py** - Constants and configuration defaults
|
|
88
|
+
- **__init__.py** - Package initialization and exports
|
|
89
|
+
|
|
90
|
+
## Implementation Status
|
|
91
|
+
|
|
92
|
+
Current status: **Phase A - Core Resolver** (scaffolding complete)
|
|
93
|
+
|
|
94
|
+
See `implementation-plan.md` in the repository root for the full implementation roadmap.
|
|
95
|
+
|
|
96
|
+
## Development Workflow
|
|
97
|
+
|
|
98
|
+
1. Each module has its own README explaining its purpose and components
|
|
99
|
+
2. Start with Phase A modules: normalization, matchers, data, calibration, api, cli
|
|
100
|
+
3. Follow test-driven development (write tests first)
|
|
101
|
+
4. Maintain type hints and docstrings for all public APIs
|
|
102
|
+
5. Run linting and type checking before commits
|
|
103
|
+
|
|
104
|
+
## Quick Start for Developers
|
|
105
|
+
|
|
106
|
+
```bash
|
|
107
|
+
# Install dependencies
|
|
108
|
+
uv sync
|
|
109
|
+
|
|
110
|
+
# Run tests (when available)
|
|
111
|
+
uv run pytest
|
|
112
|
+
|
|
113
|
+
# Run linting
|
|
114
|
+
uv run ruff check src/resolvekit
|
|
115
|
+
|
|
116
|
+
# Run type checking
|
|
117
|
+
uv run mypy src/resolvekit
|
|
118
|
+
|
|
119
|
+
# Run CLI
|
|
120
|
+
uv run resolvekit
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
## Architecture Principles
|
|
124
|
+
|
|
125
|
+
1. **Bounded cascade**: Fail-fast with early matchers, limit expensive operations
|
|
126
|
+
2. **Offline-first**: Zero runtime network dependencies
|
|
127
|
+
3. **Explainable**: Return confidence scores and alternatives with reasoning
|
|
128
|
+
4. **Extensible**: Support overlays at data, config, and code levels
|
|
129
|
+
5. **Temporal-aware**: Handle time-varying data from day one
|
|
130
|
+
6. **Type-safe**: Full type annotations throughout
|
|
131
|
+
|
|
132
|
+
## Next Steps
|
|
133
|
+
|
|
134
|
+
See individual module READMEs for implementation details and priorities.
|
resolvekit/__init__.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
"""
|
|
2
|
+
resolvekit - An open offline resolver for places and entities.
|
|
3
|
+
|
|
4
|
+
resolvekit is a local, offline-first entity and place resolution system that maps
|
|
5
|
+
messy place/entity strings and codes to canonical entities with calibrated
|
|
6
|
+
confidence scores.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from resolvekit.api import Resolver, resolve, resolve_many
|
|
10
|
+
from resolvekit.config import ResolvekitConfig
|
|
11
|
+
from resolvekit.constants import VERSION
|
|
12
|
+
from resolvekit.types import (
|
|
13
|
+
AliasType,
|
|
14
|
+
Candidate,
|
|
15
|
+
CodeSystem,
|
|
16
|
+
Entity,
|
|
17
|
+
EntityType,
|
|
18
|
+
Explanation,
|
|
19
|
+
ExplanationMode,
|
|
20
|
+
ExtractedEntity,
|
|
21
|
+
MatchContext,
|
|
22
|
+
Membership,
|
|
23
|
+
OutputFormat,
|
|
24
|
+
Resolution,
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
__version__ = VERSION

# Public API of the package: every name listed here is re-exported from
# the imports at the top of this module.
__all__ = [
    # Version constant
    "VERSION",
    # Core data types
    "AliasType", "Candidate", "CodeSystem", "Entity", "EntityType",
    "Explanation", "ExplanationMode", "ExtractedEntity", "MatchContext",
    "Membership",
    # Configuration
    "ResolvekitConfig",
    # Result types
    "OutputFormat", "Resolution",
    # Resolver class
    "Resolver",
    # Version dunder
    "__version__",
    # Convenience functions
    "resolve", "resolve_many",
]
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def main() -> None:
    """Entry point for the command-line interface.

    The CLI callable is imported inside the function body rather than at
    module import time so that importing the package does not trigger a
    circular import.
    """
    from resolvekit.cli.main import cli as run_cli

    run_cli()
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
# Package metadata
|
|
60
|
+
__author__ = "Jorge Rivera"
|
|
61
|
+
__email__ = "jorge.rivera@one.org"
|
|
62
|
+
__license__ = "MIT"
|
|
63
|
+
__description__ = (
|
|
64
|
+
"A local, offline-first entity and place resolution system that maps "
|
|
65
|
+
"messy place/entity strings and codes to canonical entities with "
|
|
66
|
+
"calibrated confidence scores"
|
|
67
|
+
)
|
resolvekit/api/README.md
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
# API Module
|
|
2
|
+
|
|
3
|
+
## Purpose
|
|
4
|
+
|
|
5
|
+
The API module provides the primary Python programmatic interface for entity resolution, code conversion, and related operations.
|
|
6
|
+
|
|
7
|
+
## Components
|
|
8
|
+
|
|
9
|
+
### Core API Classes
|
|
10
|
+
|
|
11
|
+
1. **Resolver** (`resolver.py`)
|
|
12
|
+
- Main entry point for all resolution operations
|
|
13
|
+
- Manages configuration and data pack loading
|
|
14
|
+
- Orchestrates matchers, constraints, and calibration
|
|
15
|
+
|
|
16
|
+
2. **Resolution** (`resolvekit/types.py`)
|
|
17
|
+
- Data class representing resolution results
|
|
18
|
+
- Contains matched entity, alternatives, confidence, explanation
|
|
19
|
+
- Supports JSON serialization
|
|
20
|
+
|
|
21
|
+
3. **Config** (`ResolvekitConfig` in `resolvekit/config.py`)
|
|
22
|
+
- Configuration management
|
|
23
|
+
- User-provided settings (thresholds, data paths, etc.)
|
|
24
|
+
- Environment variable support
|
|
25
|
+
|
|
26
|
+
### API Operations
|
|
27
|
+
|
|
28
|
+
- `resolve.py`: Single entity resolution
|
|
29
|
+
- `batch.py`: Batch resolution operations
|
|
30
|
+
- `convert.py`: Code system conversion
|
|
31
|
+
- `hierarchy.py`: Hierarchy navigation
|
|
32
|
+
- `extract.py`: Entity extraction from text (Phase F)
|
|
33
|
+
- `membership.py`: Group membership queries
|
|
34
|
+
|
|
35
|
+
## Primary API: Resolver Class
|
|
36
|
+
|
|
37
|
+
```python
|
|
38
|
+
from resolvekit.api import Resolver
|
|
39
|
+
from datetime import date
|
|
40
|
+
|
|
41
|
+
# Initialize resolver
|
|
42
|
+
resolver = Resolver(
|
|
43
|
+
data_path="/path/to/data/packs",
|
|
44
|
+
custom_overlays=["custom_aliases.yaml"],
|
|
45
|
+
min_confidence=0.7
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
# Single entity resolution
|
|
49
|
+
result = resolver.resolve(
|
|
50
|
+
"Cote d Ivoire",
|
|
51
|
+
entity_type=None,
|
|
52
|
+
parent=None,
|
|
53
|
+
at=None,
|
|
54
|
+
context=None,
|
|
55
|
+
return_alternates=5,
|
|
56
|
+
explain=False
|
|
57
|
+
)
|
|
58
|
+
|
|
59
|
+
# Access results
|
|
60
|
+
print(result.entity.dcid) # "country/CIV"
|
|
61
|
+
print(result.entity.canonical_name) # "Côte d'Ivoire"
|
|
62
|
+
print(result.confidence) # 0.95
|
|
63
|
+
print(result.alternatives) # [Entity, Entity, ...]
|
|
64
|
+
|
|
65
|
+
# Code conversion
|
|
66
|
+
dcid = resolver.code_to_dcid("FRA", code_type="iso3")
|
|
67
|
+
codes = resolver.dcid_to_codes("country/FRA")
|
|
68
|
+
|
|
69
|
+
# Batch operations
|
|
70
|
+
import pandas as pd
|
|
71
|
+
df = pd.DataFrame({"location": ["France", "UK", "Türkiye"]})
|
|
72
|
+
results = resolver.resolve_many(df["location"].tolist())
|
|
73
|
+
|
|
74
|
+
# Temporal queries
|
|
75
|
+
eu_2004 = resolver.get_group_members("EU", as_of=date(2004, 1, 1))
|
|
76
|
+
was_member = resolver.check_membership(
|
|
77
|
+
"country/POL",
|
|
78
|
+
group="EU",
|
|
79
|
+
as_of=date(2003, 12, 31)
|
|
80
|
+
)
|
|
81
|
+
|
|
82
|
+
# Hierarchy navigation
|
|
83
|
+
children = resolver.get_children("country/FRA", admin_level=1)
|
|
84
|
+
parent = resolver.get_parent("geoId/06")
|
|
85
|
+
path = resolver.get_hierarchy_path("some/admin3/entity")
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
## Resolution Result Structure
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
@dataclass
|
|
92
|
+
class Resolution:
|
|
93
|
+
"""Resolution result."""
|
|
94
|
+
|
|
95
|
+
entity: Entity | None # Primary match
|
|
96
|
+
confidence: float # Calibrated probability
|
|
97
|
+
alternatives: list[Entity] # Alternative candidates
|
|
98
|
+
explanation: Explanation | None # Why this match (if explain=True)
|
|
99
|
+
|
|
100
|
+
def to_dict(self) -> dict:
|
|
101
|
+
"""Convert to dictionary."""
|
|
102
|
+
...
|
|
103
|
+
|
|
104
|
+
def to_json(self) -> str:
|
|
105
|
+
"""Serialize to JSON."""
|
|
106
|
+
...
|
|
107
|
+
|
|
108
|
+
@dataclass
|
|
109
|
+
class Entity:
|
|
110
|
+
"""Resolved entity."""
|
|
111
|
+
|
|
112
|
+
dcid: str
|
|
113
|
+
canonical_name: str
|
|
114
|
+
entity_type: str
|
|
115
|
+
codes: dict[str, str] # {system: code}
|
|
116
|
+
parent_dcid: str | None
|
|
117
|
+
valid_from: date | None
|
|
118
|
+
valid_until: date | None
|
|
119
|
+
|
|
120
|
+
@dataclass
|
|
121
|
+
class Explanation:
|
|
122
|
+
"""Resolution explanation."""
|
|
123
|
+
|
|
124
|
+
stages: list[str] # Stages executed
|
|
125
|
+
candidates: list[Candidate] # All candidates with features
|
|
126
|
+
rules_applied: list[str] # Disambiguation rules
|
|
127
|
+
calibration: dict # Model details
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
## Design Principles
|
|
131
|
+
|
|
132
|
+
1. **Pythonic**: Snake_case, context managers, type hints
|
|
133
|
+
2. **Sensible defaults**: Works out of the box without configuration
|
|
134
|
+
3. **Progressive disclosure**: Simple for basic use, powerful for advanced
|
|
135
|
+
4. **Type safety**: Full type annotations for IDE support
|
|
136
|
+
5. **Pandas integration**: Native support for DataFrame operations
|
|
137
|
+
|
|
138
|
+
## Error Handling
|
|
139
|
+
|
|
140
|
+
```python
|
|
141
|
+
from resolvekit.api import Resolver, ResolutionError, ConfigError
|
|
142
|
+
|
|
143
|
+
try:
|
|
144
|
+
resolver = Resolver()
|
|
145
|
+
except ConfigError as e:
|
|
146
|
+
# Handle configuration errors
|
|
147
|
+
print(f"Config error: {e}")
|
|
148
|
+
|
|
149
|
+
try:
|
|
150
|
+
result = resolver.resolve("invalid input")
|
|
151
|
+
except ResolutionError as e:
|
|
152
|
+
# Handle resolution errors
|
|
153
|
+
print(f"Resolution failed: {e}")
|
|
154
|
+
|
|
155
|
+
# No match returns a result whose `entity` is None; it does not raise an exception
|
|
156
|
+
result = resolver.resolve("zzzzzz")
|
|
157
|
+
if result.entity is None:
|
|
158
|
+
print("No match found")
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
## Implementation Priority
|
|
162
|
+
|
|
163
|
+
**Phase A** - Core resolver (Resolver class, resolve, batch)
|
|
164
|
+
**Phase B** - Code conversion and hierarchy APIs
|
|
165
|
+
**Phase F** - Entity extraction
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
"""Convenience functions for quick one-off queries."""
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from resolvekit.api.resolver import Resolver
|
|
6
|
+
from resolvekit.types import MatchContext, Resolution
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def resolve(
    query: str,
    context: MatchContext | None = None,
    **resolver_kwargs: Any,
) -> Resolution:
    """
    Resolve a single entity string with a throwaway ``Resolver``.

    A new ``Resolver`` is built from ``resolver_kwargs``, entered as a
    context manager for this one call, and exited before the result is
    handed back. When issuing many queries, construct a ``Resolver`` once
    and reuse it instead for better performance.

    Args:
        query: Entity string to resolve.
        context: Optional match context, forwarded to ``Resolver.resolve``.
        **resolver_kwargs: Keyword arguments passed unchanged to the
            ``Resolver()`` constructor (min_confidence, explanation_mode,
            etc.).

    Returns:
        The ``Resolution`` produced by ``Resolver.resolve``.
    """
    one_shot = Resolver(**resolver_kwargs)
    with one_shot as active:
        return active.resolve(query, context=context)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def resolve_many(
    queries: list[str],
    context: MatchContext | list[MatchContext | None] | None = None,
    **resolver_kwargs: Any,
) -> list[Resolution]:
    """
    Resolve a list of entity strings with a throwaway ``Resolver``.

    A new ``Resolver`` is built from ``resolver_kwargs``, entered as a
    context manager for this one call, and exited before the results are
    handed back. When calling repeatedly, construct a ``Resolver`` once and
    reuse it instead for better performance.

    Args:
        queries: List of query strings.
        context: Optional context, either one shared ``MatchContext`` or a
            per-query list; forwarded to ``Resolver.resolve_many``.
        **resolver_kwargs: Keyword arguments passed unchanged to the
            ``Resolver()`` constructor.

    Returns:
        The list of ``Resolution`` objects produced by
        ``Resolver.resolve_many``.
    """
    one_shot = Resolver(**resolver_kwargs)
    with one_shot as active:
        return active.resolve_many(queries, context=context)
|