resolvekit 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- resolvekit-0.0.1/PKG-INFO +36 -0
- resolvekit-0.0.1/README.md +2 -0
- resolvekit-0.0.1/pyproject.toml +214 -0
- resolvekit-0.0.1/src/resolvekit/README.md +134 -0
- resolvekit-0.0.1/src/resolvekit/__init__.py +67 -0
- resolvekit-0.0.1/src/resolvekit/api/README.md +165 -0
- resolvekit-0.0.1/src/resolvekit/api/__init__.py +10 -0
- resolvekit-0.0.1/src/resolvekit/api/convenience.py +53 -0
- resolvekit-0.0.1/src/resolvekit/api/resolver.py +457 -0
- resolvekit-0.0.1/src/resolvekit/builders/README.md +173 -0
- resolvekit-0.0.1/src/resolvekit/builders/__init__.py +0 -0
- resolvekit-0.0.1/src/resolvekit/calibration/README.md +351 -0
- resolvekit-0.0.1/src/resolvekit/calibration/__init__.py +12 -0
- resolvekit-0.0.1/src/resolvekit/calibration/calibrator.py +184 -0
- resolvekit-0.0.1/src/resolvekit/calibration/features.py +139 -0
- resolvekit-0.0.1/src/resolvekit/calibration/models.py +78 -0
- resolvekit-0.0.1/src/resolvekit/cli/README.md +215 -0
- resolvekit-0.0.1/src/resolvekit/cli/__init__.py +0 -0
- resolvekit-0.0.1/src/resolvekit/cli/main.py +18 -0
- resolvekit-0.0.1/src/resolvekit/config.py +128 -0
- resolvekit-0.0.1/src/resolvekit/constants.py +252 -0
- resolvekit-0.0.1/src/resolvekit/constraints/README.md +102 -0
- resolvekit-0.0.1/src/resolvekit/constraints/__init__.py +17 -0
- resolvekit-0.0.1/src/resolvekit/constraints/constraint_engine.py +111 -0
- resolvekit-0.0.1/src/resolvekit/constraints/hierarchy_validator.py +148 -0
- resolvekit-0.0.1/src/resolvekit/constraints/membership_validator.py +60 -0
- resolvekit-0.0.1/src/resolvekit/constraints/protocols.py +33 -0
- resolvekit-0.0.1/src/resolvekit/constraints/temporal_validator.py +43 -0
- resolvekit-0.0.1/src/resolvekit/constraints/type_validator.py +42 -0
- resolvekit-0.0.1/src/resolvekit/data/README.md +165 -0
- resolvekit-0.0.1/src/resolvekit/data/__init__.py +14 -0
- resolvekit-0.0.1/src/resolvekit/data/alias_repository.py +206 -0
- resolvekit-0.0.1/src/resolvekit/data/code_repository.py +85 -0
- resolvekit-0.0.1/src/resolvekit/data/context_filters.py +49 -0
- resolvekit-0.0.1/src/resolvekit/data/db_manager.py +196 -0
- resolvekit-0.0.1/src/resolvekit/data/entity_repository.py +466 -0
- resolvekit-0.0.1/src/resolvekit/data/membership_repository.py +107 -0
- resolvekit-0.0.1/src/resolvekit/data/query_builder.py +177 -0
- resolvekit-0.0.1/src/resolvekit/data/schema.py +122 -0
- resolvekit-0.0.1/src/resolvekit/disambiguation/README.md +72 -0
- resolvekit-0.0.1/src/resolvekit/disambiguation/__init__.py +0 -0
- resolvekit-0.0.1/src/resolvekit/extraction/README.md +204 -0
- resolvekit-0.0.1/src/resolvekit/extraction/__init__.py +0 -0
- resolvekit-0.0.1/src/resolvekit/matchers/README.md +77 -0
- resolvekit-0.0.1/src/resolvekit/matchers/__init__.py +65 -0
- resolvekit-0.0.1/src/resolvekit/matchers/alias_exact.py +65 -0
- resolvekit-0.0.1/src/resolvekit/matchers/canonical_name.py +62 -0
- resolvekit-0.0.1/src/resolvekit/matchers/cascade.py +127 -0
- resolvekit-0.0.1/src/resolvekit/matchers/code_validators.py +250 -0
- resolvekit-0.0.1/src/resolvekit/matchers/exact_code.py +177 -0
- resolvekit-0.0.1/src/resolvekit/matchers/fts_matcher.py +106 -0
- resolvekit-0.0.1/src/resolvekit/matchers/fuzzy_matcher.py +142 -0
- resolvekit-0.0.1/src/resolvekit/matchers/priorities.py +174 -0
- resolvekit-0.0.1/src/resolvekit/matchers/protocols.py +75 -0
- resolvekit-0.0.1/src/resolvekit/normalization/README.md +192 -0
- resolvekit-0.0.1/src/resolvekit/normalization/__init__.py +8 -0
- resolvekit-0.0.1/src/resolvekit/normalization/normalizer.py +164 -0
- resolvekit-0.0.1/src/resolvekit/overlays/README.md +226 -0
- resolvekit-0.0.1/src/resolvekit/overlays/__init__.py +0 -0
- resolvekit-0.0.1/src/resolvekit/types.py +534 -0
- resolvekit-0.0.1/src/resolvekit/utils/README.md +188 -0
- resolvekit-0.0.1/src/resolvekit/utils/__init__.py +48 -0
- resolvekit-0.0.1/src/resolvekit/utils/cache.py +109 -0
- resolvekit-0.0.1/src/resolvekit/utils/dates.py +339 -0
- resolvekit-0.0.1/src/resolvekit/utils/errors.py +145 -0
- resolvekit-0.0.1/src/resolvekit/utils/files.py +366 -0
- resolvekit-0.0.1/src/resolvekit/utils/logging.py +219 -0
- resolvekit-0.0.1/src/resolvekit/utils/text.py +475 -0
- resolvekit-0.0.1/src/resolvekit/utils/validation.py +301 -0
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: resolvekit
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: A local, offline-first entity and place resolution system that maps messy place/entity strings and codes to canonical entities with calibrated confidence scores
|
|
5
|
+
Keywords: entity-resolution,geocoding,place-names,data-commons,iso-codes,offline,disambiguation,normalization
|
|
6
|
+
Author: Jorge Rivera
|
|
7
|
+
Author-email: Jorge Rivera <jorge.rivera@one.org>
|
|
8
|
+
License: MIT
|
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: GIS
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
21
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
22
|
+
Classifier: Typing :: Typed
|
|
23
|
+
Requires-Dist: pydantic>=2.0.0
|
|
24
|
+
Requires-Dist: pydantic-settings>=2.0.0
|
|
25
|
+
Requires-Dist: rapidfuzz>=3.0.0
|
|
26
|
+
Requires-Dist: python-dateutil>=2.8.0
|
|
27
|
+
Requires-Dist: sqlalchemy>=2.0.0
|
|
28
|
+
Requires-Dist: sqlmodel>=0.0.24
|
|
29
|
+
Requires-Python: >=3.11
|
|
30
|
+
Project-URL: Documentation, https://github.com/jm-rivera/resolvekit
|
|
31
|
+
Project-URL: Homepage, https://github.com/jm-rivera/resolvekit
|
|
32
|
+
Project-URL: Repository, https://github.com/jm-rivera/resolvekit
|
|
33
|
+
Description-Content-Type: text/markdown
|
|
34
|
+
|
|
35
|
+
# resolvekit
|
|
36
|
+
An open offline resolver for places and entities
|
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "resolvekit"
|
|
3
|
+
version = "0.0.1"
|
|
4
|
+
description = "A local, offline-first entity and place resolution system that maps messy place/entity strings and codes to canonical entities with calibrated confidence scores"
|
|
5
|
+
requires-python = ">=3.11"
|
|
6
|
+
|
|
7
|
+
license = {text = "MIT"}
|
|
8
|
+
readme = "README.md"
|
|
9
|
+
authors = [
|
|
10
|
+
{name = "Jorge Rivera", email = "jorge.rivera@one.org"}
|
|
11
|
+
]
|
|
12
|
+
|
|
13
|
+
classifiers = [
|
|
14
|
+
"Development Status :: 3 - Alpha",
|
|
15
|
+
"Intended Audience :: Developers",
|
|
16
|
+
"Intended Audience :: Science/Research",
|
|
17
|
+
"License :: OSI Approved :: MIT License",
|
|
18
|
+
"Operating System :: OS Independent",
|
|
19
|
+
"Programming Language :: Python :: 3",
|
|
20
|
+
"Programming Language :: Python :: 3.11",
|
|
21
|
+
"Programming Language :: Python :: 3.12",
|
|
22
|
+
"Programming Language :: Python :: 3.13",
|
|
23
|
+
"Programming Language :: Python :: 3 :: Only",
|
|
24
|
+
"Topic :: Scientific/Engineering :: GIS",
|
|
25
|
+
"Topic :: Scientific/Engineering :: Information Analysis",
|
|
26
|
+
"Topic :: Text Processing :: Linguistic",
|
|
27
|
+
"Typing :: Typed"
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
keywords = [
|
|
31
|
+
"entity-resolution",
|
|
32
|
+
"geocoding",
|
|
33
|
+
"place-names",
|
|
34
|
+
"data-commons",
|
|
35
|
+
"iso-codes",
|
|
36
|
+
"offline",
|
|
37
|
+
"disambiguation",
|
|
38
|
+
"normalization"
|
|
39
|
+
]
|
|
40
|
+
|
|
41
|
+
dependencies = [
|
|
42
|
+
"pydantic>=2.0.0",
|
|
43
|
+
"pydantic-settings>=2.0.0",
|
|
44
|
+
"rapidfuzz>=3.0.0",
|
|
45
|
+
"python-dateutil>=2.8.0",
|
|
46
|
+
"sqlalchemy>=2.0.0",
|
|
47
|
+
"sqlmodel>=0.0.24",
|
|
48
|
+
]
|
|
49
|
+
|
|
50
|
+
[project.urls]
|
|
51
|
+
Homepage = "https://github.com/jm-rivera/resolvekit"
|
|
52
|
+
Repository = "https://github.com/jm-rivera/resolvekit"
|
|
53
|
+
Documentation = "https://github.com/jm-rivera/resolvekit"
|
|
54
|
+
|
|
55
|
+
[project.scripts]
|
|
56
|
+
resolvekit = "resolvekit:main"
|
|
57
|
+
|
|
58
|
+
[build-system]
|
|
59
|
+
requires = ["uv_build>=0.8.22,<0.9.0"]
|
|
60
|
+
build-backend = "uv_build"
|
|
61
|
+
|
|
62
|
+
[dependency-groups]
|
|
63
|
+
dev = [
|
|
64
|
+
"uv-build>=0.8.22",
|
|
65
|
+
"pre-commit>=4.0.0",
|
|
66
|
+
"ruff>=0.14.0",
|
|
67
|
+
"mypy>=1.8.0",
|
|
68
|
+
"pandas-stubs>=2.3.2.250926",
|
|
69
|
+
]
|
|
70
|
+
test = [
|
|
71
|
+
"pytest>=8.0.0",
|
|
72
|
+
"pytest-benchmark>=5.1.0",
|
|
73
|
+
"pytest-cov>=4.1.0",
|
|
74
|
+
"pytest-xdist>=3.5.0",
|
|
75
|
+
]
|
|
76
|
+
|
|
77
|
+
[tool.ruff]
|
|
78
|
+
# Set the maximum line length
|
|
79
|
+
line-length = 88
|
|
80
|
+
|
|
81
|
+
# Target Python 3.13
|
|
82
|
+
target-version = "py313"
|
|
83
|
+
|
|
84
|
+
# Exclude common directories
|
|
85
|
+
exclude = [
|
|
86
|
+
".git",
|
|
87
|
+
".venv",
|
|
88
|
+
"venv",
|
|
89
|
+
"__pycache__",
|
|
90
|
+
"build",
|
|
91
|
+
"dist",
|
|
92
|
+
"*.egg-info",
|
|
93
|
+
".pytest_cache",
|
|
94
|
+
"htmlcov",
|
|
95
|
+
]
|
|
96
|
+
|
|
97
|
+
[tool.ruff.lint]
|
|
98
|
+
# Enable recommended rules plus extras
|
|
99
|
+
select = [
|
|
100
|
+
"E", # pycodestyle errors
|
|
101
|
+
"W", # pycodestyle warnings
|
|
102
|
+
"F", # pyflakes
|
|
103
|
+
"I", # isort
|
|
104
|
+
"N", # pep8-naming
|
|
105
|
+
"UP", # pyupgrade
|
|
106
|
+
"B", # flake8-bugbear
|
|
107
|
+
"C4", # flake8-comprehensions
|
|
108
|
+
"SIM", # flake8-simplify
|
|
109
|
+
"PL", # pylint
|
|
110
|
+
"RUF", # ruff-specific rules
|
|
111
|
+
]
|
|
112
|
+
|
|
113
|
+
# Ignore specific rules that might be too strict
|
|
114
|
+
ignore = [
|
|
115
|
+
"E501", # Line too long (handled by formatter)
|
|
116
|
+
"PLR0913", # Too many arguments
|
|
117
|
+
"PLR2004", # Magic value used in comparison
|
|
118
|
+
]
|
|
119
|
+
|
|
120
|
+
# Allow autofix for all enabled rules
|
|
121
|
+
fixable = ["ALL"]
|
|
122
|
+
unfixable = []
|
|
123
|
+
|
|
124
|
+
[tool.ruff.lint.per-file-ignores]
|
|
125
|
+
# Allow test files to have more relaxed rules
|
|
126
|
+
"tests/**/*.py" = ["E501", "PLR2004"]
|
|
127
|
+
# Allow imports not at top level in __init__.py (used to avoid circular imports)
|
|
128
|
+
"src/resolvekit/__init__.py" = ["PLC0415"]
|
|
129
|
+
|
|
130
|
+
[tool.ruff.format]
|
|
131
|
+
# Use double quotes for strings
|
|
132
|
+
quote-style = "double"
|
|
133
|
+
|
|
134
|
+
# Use spaces for indentation
|
|
135
|
+
indent-style = "space"
|
|
136
|
+
|
|
137
|
+
# Like Black, respect magic trailing commas
|
|
138
|
+
skip-magic-trailing-comma = false
|
|
139
|
+
|
|
140
|
+
# Like Black, automatically detect line endings
|
|
141
|
+
line-ending = "auto"
|
|
142
|
+
|
|
143
|
+
[tool.mypy]
|
|
144
|
+
python_version = "3.13"
|
|
145
|
+
warn_return_any = true
|
|
146
|
+
warn_unused_configs = true
|
|
147
|
+
disallow_untyped_defs = true
|
|
148
|
+
disallow_incomplete_defs = true
|
|
149
|
+
check_untyped_defs = true
|
|
150
|
+
no_implicit_optional = true
|
|
151
|
+
warn_redundant_casts = true
|
|
152
|
+
warn_unused_ignores = true
|
|
153
|
+
warn_no_return = true
|
|
154
|
+
strict_equality = true
|
|
155
|
+
|
|
156
|
+
[tool.pytest.ini_options]
|
|
157
|
+
# Test discovery patterns
|
|
158
|
+
testpaths = ["tests"]
|
|
159
|
+
python_files = ["test_*.py"]
|
|
160
|
+
python_classes = ["Test*"]
|
|
161
|
+
python_functions = ["test_*"]
|
|
162
|
+
|
|
163
|
+
# Test execution options
|
|
164
|
+
addopts = [
|
|
165
|
+
"-v",
|
|
166
|
+
"--strict-markers",
|
|
167
|
+
"--strict-config",
|
|
168
|
+
"--tb=short",
|
|
169
|
+
"--cov=src/resolvekit",
|
|
170
|
+
"--cov-report=term-missing",
|
|
171
|
+
"--cov-report=html",
|
|
172
|
+
"--cov-branch",
|
|
173
|
+
]
|
|
174
|
+
|
|
175
|
+
# Custom markers for organizing tests
|
|
176
|
+
markers = [
|
|
177
|
+
"slow: marks tests as slow (deselect with '-m \"not slow\"')",
|
|
178
|
+
"integration: marks tests as integration tests",
|
|
179
|
+
"unit: marks tests as unit tests",
|
|
180
|
+
"matcher: marks tests related to matcher functionality",
|
|
181
|
+
"calibration: marks tests related to confidence calibration",
|
|
182
|
+
"temporal: marks tests related to temporal validation",
|
|
183
|
+
]
|
|
184
|
+
|
|
185
|
+
# Ignore patterns
|
|
186
|
+
norecursedirs = [
|
|
187
|
+
".git",
|
|
188
|
+
".venv",
|
|
189
|
+
"venv",
|
|
190
|
+
"__pycache__",
|
|
191
|
+
"*.egg-info",
|
|
192
|
+
".pytest_cache",
|
|
193
|
+
"htmlcov",
|
|
194
|
+
]
|
|
195
|
+
|
|
196
|
+
[tool.coverage.run]
|
|
197
|
+
source = ["src/resolvekit"]
|
|
198
|
+
omit = [
|
|
199
|
+
"*/tests/*",
|
|
200
|
+
"*/__pycache__/*",
|
|
201
|
+
"*/__init__.py",
|
|
202
|
+
]
|
|
203
|
+
|
|
204
|
+
[tool.coverage.report]
|
|
205
|
+
precision = 2
|
|
206
|
+
exclude_lines = [
|
|
207
|
+
"pragma: no cover",
|
|
208
|
+
"def __repr__",
|
|
209
|
+
"raise AssertionError",
|
|
210
|
+
"raise NotImplementedError",
|
|
211
|
+
"if __name__ == .__main__.:",
|
|
212
|
+
"if TYPE_CHECKING:",
|
|
213
|
+
"@abstractmethod",
|
|
214
|
+
]
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
# resolvekit Package Structure
|
|
2
|
+
|
|
3
|
+
This directory contains the main resolvekit package implementation.
|
|
4
|
+
|
|
5
|
+
## Module Overview
|
|
6
|
+
|
|
7
|
+
### Core Resolution Pipeline
|
|
8
|
+
|
|
9
|
+
1. **normalization/** - Text preprocessing and normalization
|
|
10
|
+
- Unicode normalization (NFKC/NFKD)
|
|
11
|
+
- Diacritic handling and case folding
|
|
12
|
+
- Transliteration support
|
|
13
|
+
|
|
14
|
+
2. **matchers/** - Candidate generation cascade
|
|
15
|
+
- Exact code matcher (ISO, DCID, etc.)
|
|
16
|
+
- Canonical name matcher
|
|
17
|
+
- Alias exact matcher
|
|
18
|
+
- FTS matcher (SQLite FTS5)
|
|
19
|
+
- Fuzzy matcher (bounded)
|
|
20
|
+
- Semantic matcher (optional)
|
|
21
|
+
|
|
22
|
+
3. **disambiguation/** - Ambiguity resolution
|
|
23
|
+
- Ambiguity detection
|
|
24
|
+
- Semantic sidecar (HNSW)
|
|
25
|
+
- Context analysis
|
|
26
|
+
- Default heuristics
|
|
27
|
+
|
|
28
|
+
4. **constraints/** - KG and temporal validation
|
|
29
|
+
- Type validation
|
|
30
|
+
- Hierarchy validation
|
|
31
|
+
- Temporal validity
|
|
32
|
+
- Membership validation
|
|
33
|
+
|
|
34
|
+
5. **calibration/** - Confidence scoring
|
|
35
|
+
- Feature extraction
|
|
36
|
+
- Calibration models
|
|
37
|
+
- Score fusion
|
|
38
|
+
|
|
39
|
+
### Data Management
|
|
40
|
+
|
|
41
|
+
6. **data/** - Data storage and access
|
|
42
|
+
- SQLite database management
|
|
43
|
+
- Schema definitions
|
|
44
|
+
- Data models and loaders
|
|
45
|
+
- Query builders
|
|
46
|
+
|
|
47
|
+
7. **overlays/** - Custom data extensions
|
|
48
|
+
- Overlay management
|
|
49
|
+
- Precedence handling
|
|
50
|
+
- Overlay writers and validators
|
|
51
|
+
|
|
52
|
+
8. **builders/** - Data pack building
|
|
53
|
+
- ETL pipelines
|
|
54
|
+
- Pack builders
|
|
55
|
+
- Calibration training
|
|
56
|
+
- Quality assurance
|
|
57
|
+
|
|
58
|
+
### Interfaces
|
|
59
|
+
|
|
60
|
+
9. **api/** - Python API
|
|
61
|
+
- Main Resolver class
|
|
62
|
+
- Resolution operations
|
|
63
|
+
- Code conversion
|
|
64
|
+
- Hierarchy navigation
|
|
65
|
+
|
|
66
|
+
10. **cli/** - Command-line interface
|
|
67
|
+
- CLI commands
|
|
68
|
+
- Output formatters
|
|
69
|
+
- Interactive prompts
|
|
70
|
+
|
|
71
|
+
### Additional Features
|
|
72
|
+
|
|
73
|
+
11. **extraction/** - Entity extraction from text
|
|
74
|
+
- Dictionary matching
|
|
75
|
+
- NER assistance
|
|
76
|
+
- Context extraction
|
|
77
|
+
|
|
78
|
+
12. **utils/** - Shared utilities
|
|
79
|
+
- Logging
|
|
80
|
+
- Validation
|
|
81
|
+
- Text utilities
|
|
82
|
+
- Caching
|
|
83
|
+
|
|
84
|
+
## Key Files
|
|
85
|
+
|
|
86
|
+
- **types.py** - Type definitions and data classes
|
|
87
|
+
- **constants.py** - Constants and configuration defaults
|
|
88
|
+
- **__init__.py** - Package initialization and exports
|
|
89
|
+
|
|
90
|
+
## Implementation Status
|
|
91
|
+
|
|
92
|
+
Current status: **Phase A - Core Resolver** (scaffolding complete)
|
|
93
|
+
|
|
94
|
+
See `implementation-plan.md` in the repository root for the full implementation roadmap.
|
|
95
|
+
|
|
96
|
+
## Development Workflow
|
|
97
|
+
|
|
98
|
+
1. Each module has its own README explaining its purpose and components
|
|
99
|
+
2. Start with Phase A modules: normalization, matchers, data, calibration, api, cli
|
|
100
|
+
3. Follow test-driven development (write tests first)
|
|
101
|
+
4. Maintain type hints and docstrings for all public APIs
|
|
102
|
+
5. Run linting and type checking before commits
|
|
103
|
+
|
|
104
|
+
## Quick Start for Developers
|
|
105
|
+
|
|
106
|
+
```bash
|
|
107
|
+
# Install dependencies
|
|
108
|
+
uv sync
|
|
109
|
+
|
|
110
|
+
# Run tests (when available)
|
|
111
|
+
uv run pytest
|
|
112
|
+
|
|
113
|
+
# Run linting
|
|
114
|
+
uv run ruff check src/resolvekit
|
|
115
|
+
|
|
116
|
+
# Run type checking
|
|
117
|
+
uv run mypy src/resolvekit
|
|
118
|
+
|
|
119
|
+
# Run CLI
|
|
120
|
+
uv run resolvekit
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
## Architecture Principles
|
|
124
|
+
|
|
125
|
+
1. **Bounded cascade**: Fail-fast with early matchers, limit expensive operations
|
|
126
|
+
2. **Offline-first**: Zero runtime network dependencies
|
|
127
|
+
3. **Explainable**: Return confidence scores and alternatives with reasoning
|
|
128
|
+
4. **Extensible**: Support overlays at data, config, and code levels
|
|
129
|
+
5. **Temporal-aware**: Handle time-varying data from day one
|
|
130
|
+
6. **Type-safe**: Full type annotations throughout
|
|
131
|
+
|
|
132
|
+
## Next Steps
|
|
133
|
+
|
|
134
|
+
See individual module READMEs for implementation details and priorities.
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
"""
|
|
2
|
+
resolvekit - An open offline resolver for places and entities.
|
|
3
|
+
|
|
4
|
+
resolvekit is a local, offline-first entity and place resolution system that maps
|
|
5
|
+
messy place/entity strings and codes to canonical entities with calibrated
|
|
6
|
+
confidence scores.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from resolvekit.api import Resolver, resolve, resolve_many
|
|
10
|
+
from resolvekit.config import ResolvekitConfig
|
|
11
|
+
from resolvekit.constants import VERSION
|
|
12
|
+
from resolvekit.types import (
|
|
13
|
+
AliasType,
|
|
14
|
+
Candidate,
|
|
15
|
+
CodeSystem,
|
|
16
|
+
Entity,
|
|
17
|
+
EntityType,
|
|
18
|
+
Explanation,
|
|
19
|
+
ExplanationMode,
|
|
20
|
+
ExtractedEntity,
|
|
21
|
+
MatchContext,
|
|
22
|
+
Membership,
|
|
23
|
+
OutputFormat,
|
|
24
|
+
Resolution,
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
__version__ = VERSION
|
|
28
|
+
|
|
29
|
+
__all__ = [
|
|
30
|
+
"VERSION",
|
|
31
|
+
"AliasType",
|
|
32
|
+
"Candidate",
|
|
33
|
+
"CodeSystem",
|
|
34
|
+
"Entity",
|
|
35
|
+
"EntityType",
|
|
36
|
+
"Explanation",
|
|
37
|
+
"ExplanationMode",
|
|
38
|
+
"ExtractedEntity",
|
|
39
|
+
"MatchContext",
|
|
40
|
+
"Membership",
|
|
41
|
+
"ResolvekitConfig",
|
|
42
|
+
"OutputFormat",
|
|
43
|
+
"Resolution",
|
|
44
|
+
"Resolver",
|
|
45
|
+
"__version__",
|
|
46
|
+
"resolve",
|
|
47
|
+
"resolve_many",
|
|
48
|
+
]
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def main() -> None:
    """Run the resolvekit command-line interface.

    This is the callable referenced by the ``resolvekit`` console script.
    It takes no arguments and returns nothing; all work is delegated to
    the ``cli`` callable defined in ``resolvekit.cli.main``.
    """
    # Deferred import: loading the CLI module at package import time
    # would create a circular import.
    from resolvekit.cli.main import cli as run_cli

    run_cli()
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
# Package metadata
|
|
60
|
+
__author__ = "Jorge Rivera"
|
|
61
|
+
__email__ = "jorge.rivera@one.org"
|
|
62
|
+
__license__ = "MIT"
|
|
63
|
+
__description__ = (
|
|
64
|
+
"A local, offline-first entity and place resolution system that maps "
|
|
65
|
+
"messy place/entity strings and codes to canonical entities with "
|
|
66
|
+
"calibrated confidence scores"
|
|
67
|
+
)
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
# API Module
|
|
2
|
+
|
|
3
|
+
## Purpose
|
|
4
|
+
|
|
5
|
+
The API module provides the primary Python programmatic interface for entity resolution, code conversion, and related operations.
|
|
6
|
+
|
|
7
|
+
## Components
|
|
8
|
+
|
|
9
|
+
### Core API Classes
|
|
10
|
+
|
|
11
|
+
1. **Resolver** (`resolver.py`)
|
|
12
|
+
- Main entry point for all resolution operations
|
|
13
|
+
- Manages configuration and data pack loading
|
|
14
|
+
- Orchestrates matchers, constraints, and calibration
|
|
15
|
+
|
|
16
|
+
2. **Resolution** (`resolution.py`)
|
|
17
|
+
- Data class representing resolution results
|
|
18
|
+
- Contains matched entity, alternatives, confidence, explanation
|
|
19
|
+
- Supports JSON serialization
|
|
20
|
+
|
|
21
|
+
3. **Config** (`config.py`)
|
|
22
|
+
- Configuration management
|
|
23
|
+
- User-provided settings (thresholds, data paths, etc.)
|
|
24
|
+
- Environment variable support
|
|
25
|
+
|
|
26
|
+
### API Operations
|
|
27
|
+
|
|
28
|
+
- `resolve.py`: Single entity resolution
|
|
29
|
+
- `batch.py`: Batch resolution operations
|
|
30
|
+
- `convert.py`: Code system conversion
|
|
31
|
+
- `hierarchy.py`: Hierarchy navigation
|
|
32
|
+
- `extract.py`: Entity extraction from text (Phase F)
|
|
33
|
+
- `membership.py`: Group membership queries
|
|
34
|
+
|
|
35
|
+
## Primary API: Resolver Class
|
|
36
|
+
|
|
37
|
+
```python
|
|
38
|
+
from resolvekit.api import Resolver
|
|
39
|
+
from datetime import date
|
|
40
|
+
|
|
41
|
+
# Initialize resolver
|
|
42
|
+
resolver = Resolver(
|
|
43
|
+
data_path="/path/to/data/packs",
|
|
44
|
+
custom_overlays=["custom_aliases.yaml"],
|
|
45
|
+
min_confidence=0.7
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
# Single entity resolution
|
|
49
|
+
result = resolver.resolve(
|
|
50
|
+
"Cote d Ivoire",
|
|
51
|
+
entity_type=None,
|
|
52
|
+
parent=None,
|
|
53
|
+
at=None,
|
|
54
|
+
context=None,
|
|
55
|
+
return_alternates=5,
|
|
56
|
+
explain=False
|
|
57
|
+
)
|
|
58
|
+
|
|
59
|
+
# Access results
|
|
60
|
+
print(result.entity.dcid) # "country/CIV"
|
|
61
|
+
print(result.entity.canonical_name) # "Côte d'Ivoire"
|
|
62
|
+
print(result.confidence) # 0.95
|
|
63
|
+
print(result.alternatives) # [Entity, Entity, ...]
|
|
64
|
+
|
|
65
|
+
# Code conversion
|
|
66
|
+
dcid = resolver.code_to_dcid("FRA", code_type="iso3")
|
|
67
|
+
codes = resolver.dcid_to_codes("country/FRA")
|
|
68
|
+
|
|
69
|
+
# Batch operations
|
|
70
|
+
import pandas as pd
|
|
71
|
+
df = pd.DataFrame({"location": ["France", "UK", "Türkiye"]})
|
|
72
|
+
results = resolver.resolve_many(df["location"].tolist())
|
|
73
|
+
|
|
74
|
+
# Temporal queries
|
|
75
|
+
eu_2004 = resolver.get_group_members("EU", as_of=date(2004, 1, 1))
|
|
76
|
+
was_member = resolver.check_membership(
|
|
77
|
+
"country/POL",
|
|
78
|
+
group="EU",
|
|
79
|
+
as_of=date(2003, 12, 31)
|
|
80
|
+
)
|
|
81
|
+
|
|
82
|
+
# Hierarchy navigation
|
|
83
|
+
children = resolver.get_children("country/FRA", admin_level=1)
|
|
84
|
+
parent = resolver.get_parent("geoId/06")
|
|
85
|
+
path = resolver.get_hierarchy_path("some/admin3/entity")
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
## Resolution Result Structure
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
@dataclass
|
|
92
|
+
class Resolution:
|
|
93
|
+
"""Resolution result."""
|
|
94
|
+
|
|
95
|
+
entity: Entity | None # Primary match
|
|
96
|
+
confidence: float # Calibrated probability
|
|
97
|
+
alternatives: list[Entity] # Alternative candidates
|
|
98
|
+
explanation: Explanation | None # Why this match (if explain=True)
|
|
99
|
+
|
|
100
|
+
def to_dict(self) -> dict:
|
|
101
|
+
"""Convert to dictionary."""
|
|
102
|
+
...
|
|
103
|
+
|
|
104
|
+
def to_json(self) -> str:
|
|
105
|
+
"""Serialize to JSON."""
|
|
106
|
+
...
|
|
107
|
+
|
|
108
|
+
@dataclass
|
|
109
|
+
class Entity:
|
|
110
|
+
"""Resolved entity."""
|
|
111
|
+
|
|
112
|
+
dcid: str
|
|
113
|
+
canonical_name: str
|
|
114
|
+
entity_type: str
|
|
115
|
+
codes: dict[str, str] # {system: code}
|
|
116
|
+
parent_dcid: str | None
|
|
117
|
+
valid_from: date | None
|
|
118
|
+
valid_until: date | None
|
|
119
|
+
|
|
120
|
+
@dataclass
|
|
121
|
+
class Explanation:
|
|
122
|
+
"""Resolution explanation."""
|
|
123
|
+
|
|
124
|
+
stages: list[str] # Stages executed
|
|
125
|
+
candidates: list[Candidate] # All candidates with features
|
|
126
|
+
rules_applied: list[str] # Disambiguation rules
|
|
127
|
+
calibration: dict # Model details
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
## Design Principles
|
|
131
|
+
|
|
132
|
+
1. **Pythonic**: snake_case naming, context managers, type hints
|
|
133
|
+
2. **Sensible defaults**: Works out of the box without configuration
|
|
134
|
+
3. **Progressive disclosure**: Simple for basic use, powerful for advanced
|
|
135
|
+
4. **Type safety**: Full type annotations for IDE support
|
|
136
|
+
5. **Pandas integration**: Native support for DataFrame operations
|
|
137
|
+
|
|
138
|
+
## Error Handling
|
|
139
|
+
|
|
140
|
+
```python
|
|
141
|
+
from resolvekit.api import Resolver, ResolutionError, ConfigError
|
|
142
|
+
|
|
143
|
+
try:
|
|
144
|
+
resolver = Resolver()
|
|
145
|
+
except ConfigError as e:
|
|
146
|
+
# Handle configuration errors
|
|
147
|
+
print(f"Config error: {e}")
|
|
148
|
+
|
|
149
|
+
try:
|
|
150
|
+
result = resolver.resolve("invalid input")
|
|
151
|
+
except ResolutionError as e:
|
|
152
|
+
# Handle resolution errors
|
|
153
|
+
print(f"Resolution failed: {e}")
|
|
154
|
+
|
|
155
|
+
# No match returns a result whose entity is None, not an exception
|
|
156
|
+
result = resolver.resolve("zzzzzz")
|
|
157
|
+
if result.entity is None:
|
|
158
|
+
print("No match found")
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
## Implementation Priority
|
|
162
|
+
|
|
163
|
+
**Phase A** - Core resolver (Resolver class, resolve, batch)
|
|
164
|
+
**Phase B** - Code conversion and hierarchy APIs
|
|
165
|
+
**Phase F** - Entity extraction
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
"""Convenience functions for quick one-off queries."""
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from resolvekit.api.resolver import Resolver
|
|
6
|
+
from resolvekit.types import MatchContext, Resolution
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def resolve(
    query: str,
    context: MatchContext | None = None,
    **resolver_kwargs: Any,
) -> Resolution:
    """
    Resolve one query string using a throwaway Resolver.

    A new Resolver is built from ``resolver_kwargs``, entered as a context
    manager, asked to resolve ``query``, and then exited. Because a fresh
    instance is built on every call, prefer a long-lived Resolver when
    issuing many queries.

    Args:
        query: Entity string to resolve
        context: Optional match context, forwarded to ``Resolver.resolve``
        **resolver_kwargs: Passed unchanged to the Resolver() constructor
            (min_confidence, explanation_mode, etc.)

    Returns:
        The Resolution produced by ``Resolver.resolve``
    """
    one_off = Resolver(**resolver_kwargs)
    # Return from inside the ``with`` so the resolver is exited on the way out.
    with one_off as session:
        return session.resolve(query, context=context)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def resolve_many(
    queries: list[str],
    context: MatchContext | list[MatchContext | None] | None = None,
    **resolver_kwargs: Any,
) -> list[Resolution]:
    """
    Resolve a batch of query strings using a throwaway Resolver.

    A new Resolver is built from ``resolver_kwargs``, entered as a context
    manager, asked to resolve all ``queries`` in one call, and then exited.
    Prefer a long-lived Resolver when calling this repeatedly.

    Args:
        queries: List of query strings
        context: Optional context, either a single value shared by all
            queries or a per-query list; forwarded to
            ``Resolver.resolve_many``
        **resolver_kwargs: Passed unchanged to the Resolver() constructor

    Returns:
        The list of Resolution objects produced by ``Resolver.resolve_many``
    """
    one_off = Resolver(**resolver_kwargs)
    # Return from inside the ``with`` so the resolver is exited on the way out.
    with one_off as session:
        return session.resolve_many(queries, context=context)
|