resolvekit 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- resolvekit/README.md +134 -0
- resolvekit/__init__.py +67 -0
- resolvekit/api/README.md +165 -0
- resolvekit/api/__init__.py +10 -0
- resolvekit/api/convenience.py +53 -0
- resolvekit/api/resolver.py +457 -0
- resolvekit/builders/README.md +173 -0
- resolvekit/builders/__init__.py +0 -0
- resolvekit/calibration/README.md +351 -0
- resolvekit/calibration/__init__.py +12 -0
- resolvekit/calibration/calibrator.py +184 -0
- resolvekit/calibration/features.py +139 -0
- resolvekit/calibration/models.py +78 -0
- resolvekit/cli/README.md +215 -0
- resolvekit/cli/__init__.py +0 -0
- resolvekit/cli/main.py +18 -0
- resolvekit/config.py +128 -0
- resolvekit/constants.py +252 -0
- resolvekit/constraints/README.md +102 -0
- resolvekit/constraints/__init__.py +17 -0
- resolvekit/constraints/constraint_engine.py +111 -0
- resolvekit/constraints/hierarchy_validator.py +148 -0
- resolvekit/constraints/membership_validator.py +60 -0
- resolvekit/constraints/protocols.py +33 -0
- resolvekit/constraints/temporal_validator.py +43 -0
- resolvekit/constraints/type_validator.py +42 -0
- resolvekit/data/README.md +165 -0
- resolvekit/data/__init__.py +14 -0
- resolvekit/data/alias_repository.py +206 -0
- resolvekit/data/code_repository.py +85 -0
- resolvekit/data/context_filters.py +49 -0
- resolvekit/data/db_manager.py +196 -0
- resolvekit/data/entity_repository.py +466 -0
- resolvekit/data/membership_repository.py +107 -0
- resolvekit/data/query_builder.py +177 -0
- resolvekit/data/schema.py +122 -0
- resolvekit/disambiguation/README.md +72 -0
- resolvekit/disambiguation/__init__.py +0 -0
- resolvekit/extraction/README.md +204 -0
- resolvekit/extraction/__init__.py +0 -0
- resolvekit/matchers/README.md +77 -0
- resolvekit/matchers/__init__.py +65 -0
- resolvekit/matchers/alias_exact.py +65 -0
- resolvekit/matchers/canonical_name.py +62 -0
- resolvekit/matchers/cascade.py +127 -0
- resolvekit/matchers/code_validators.py +250 -0
- resolvekit/matchers/exact_code.py +177 -0
- resolvekit/matchers/fts_matcher.py +106 -0
- resolvekit/matchers/fuzzy_matcher.py +142 -0
- resolvekit/matchers/priorities.py +174 -0
- resolvekit/matchers/protocols.py +75 -0
- resolvekit/normalization/README.md +192 -0
- resolvekit/normalization/__init__.py +8 -0
- resolvekit/normalization/normalizer.py +164 -0
- resolvekit/overlays/README.md +226 -0
- resolvekit/overlays/__init__.py +0 -0
- resolvekit/types.py +534 -0
- resolvekit/utils/README.md +188 -0
- resolvekit/utils/__init__.py +48 -0
- resolvekit/utils/cache.py +109 -0
- resolvekit/utils/dates.py +339 -0
- resolvekit/utils/errors.py +145 -0
- resolvekit/utils/files.py +366 -0
- resolvekit/utils/logging.py +219 -0
- resolvekit/utils/text.py +475 -0
- resolvekit/utils/validation.py +301 -0
- resolvekit-0.0.1.dist-info/METADATA +36 -0
- resolvekit-0.0.1.dist-info/RECORD +70 -0
- resolvekit-0.0.1.dist-info/WHEEL +4 -0
- resolvekit-0.0.1.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
"""Calibration model data structures."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from datetime import date
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Literal
|
|
7
|
+
|
|
8
|
+
from pydantic import BaseModel, ConfigDict, Field, ValidationError, model_validator
|
|
9
|
+
|
|
10
|
+
from resolvekit.utils.logging import get_logger
|
|
11
|
+
|
|
12
|
+
logger = get_logger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class CalibrationModel(BaseModel):
    """
    Immutable description of a calibration model shipped in a data pack.

    Parsed from a JSON document. Holds the logistic-regression parameters
    used to convert features into calibrated probabilities.

    Attributes:
        type: Model family; only "logistic" is accepted.
        features: Feature names, in order.
        weights: Logistic-regression weights, one per feature.
        bias: Logistic-regression bias term.
        ece: Expected Calibration Error in [0, 1] (optional).
        trained_on: Date the model was trained (optional).
        notes: Human-readable notes (optional).
    """

    # frozen=True: attribute assignment on a validated instance is rejected.
    model_config = ConfigDict(frozen=True)

    type: Literal["logistic"] = Field(
        default="logistic", description="Calibration model type"
    )
    features: list[str] = Field(..., min_length=1, description="Feature names in order")
    weights: list[float] = Field(..., min_length=1, description="Logistic weights")
    bias: float = Field(..., description="Logistic bias term")
    ece: float | None = Field(
        None, ge=0.0, le=1.0, description="Expected Calibration Error"
    )
    trained_on: date | None = Field(None, description="Training date")
    notes: str | None = Field(None, description="Human-readable notes")

    @model_validator(mode="after")
    def validate_weights_match_features(self) -> "CalibrationModel":
        """Reject models whose weight and feature lists differ in length."""
        n_weights = len(self.weights)
        n_features = len(self.features)
        if n_weights == n_features:
            return self
        raise ValueError(
            f"Weights count ({n_weights}) must match features count ({n_features})"
        )
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def load_calibration_model(path: Path) -> "CalibrationModel | None":
    """
    Load a calibration model from a JSON file.

    Args:
        path: Path to the calibration_model.json file.

    Returns:
        The parsed CalibrationModel, or None when the file is missing,
        cannot be read, is not valid JSON, or fails schema validation.
        A missing file is not logged; every other failure is logged as
        a warning.
    """
    # Read first and react to the error rather than checking exists() up
    # front: this avoids a race with concurrent deletion and also covers
    # paths that exist but cannot be read (directories, permissions).
    try:
        # Explicit UTF-8 so parsing does not depend on the platform locale.
        raw = path.read_text(encoding="utf-8")
    except FileNotFoundError:
        return None
    except (OSError, UnicodeDecodeError) as e:
        logger.warning(f"Could not read calibration model at {path}: {e}")
        return None

    try:
        return CalibrationModel.model_validate(json.loads(raw))
    except json.JSONDecodeError as e:
        logger.warning(f"Invalid JSON in calibration model at {path}: {e}")
    except ValidationError as e:
        logger.warning(f"Invalid calibration model schema at {path}: {e}")
    return None
|
resolvekit/cli/README.md
ADDED
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
# CLI Module
|
|
2
|
+
|
|
3
|
+
## Purpose
|
|
4
|
+
|
|
5
|
+
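The CLI module provides a command-line interface for interactive entity resolution, batch processing, and data pack management. As of version 0.0.1 the commands described below are planned; `main.py` is a placeholder that only prints the planned command list.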
|
|
6
|
+
|
|
7
|
+
## Components
|
|
8
|
+
|
|
9
|
+
### Command Modules
|
|
10
|
+
|
|
11
|
+
1. **Main CLI** (`main.py`)
|
|
12
|
+
- Entry point for CLI application
|
|
13
|
+
- Uses `click` or `typer` for argument parsing
|
|
14
|
+
- Global options and configuration
|
|
15
|
+
|
|
16
|
+
2. **Query Command** (`query.py`)
|
|
17
|
+
- Single entity resolution from command line
|
|
18
|
+
- Interactive disambiguation prompts
|
|
19
|
+
- Human-readable output with colors
|
|
20
|
+
|
|
21
|
+
3. **Convert Command** (`convert.py`)
|
|
22
|
+
- Code system conversion
|
|
23
|
+
- Supports single codes or batch files
|
|
24
|
+
|
|
25
|
+
4. **Batch Command** (`batch.py`)
|
|
26
|
+
- Batch processing of CSV/JSON files
|
|
27
|
+
- Progress bars for large files
|
|
28
|
+
- Error reporting and partial results
|
|
29
|
+
|
|
30
|
+
5. **Extract Command** (`extract.py`)
|
|
31
|
+
- Entity extraction from text files
|
|
32
|
+
- Supports various input formats
|
|
33
|
+
- (Phase F)
|
|
34
|
+
|
|
35
|
+
6. **Info Command** (`info.py`)
|
|
36
|
+
- Display detailed entity information
|
|
37
|
+
- Show codes, hierarchy, memberships
|
|
38
|
+
|
|
39
|
+
7. **Members Command** (`members.py`)
|
|
40
|
+
- List group members
|
|
41
|
+
- Temporal support (as_of date)
|
|
42
|
+
|
|
43
|
+
8. **Update Command** (`update.py`)
|
|
44
|
+
- Check for data pack updates
|
|
45
|
+
- Download and install new versions
|
|
46
|
+
|
|
47
|
+
9. **Version Command** (`version.py`)
|
|
48
|
+
- Show tool and data pack versions
|
|
49
|
+
- Component information
|
|
50
|
+
|
|
51
|
+
### Output Formatters
|
|
52
|
+
|
|
53
|
+
- `formatters.py`: Output formatting (table, JSON, CSV)
|
|
54
|
+
- `colors.py`: Terminal color support
|
|
55
|
+
- `progress.py`: Progress bars for long operations
|
|
56
|
+
|
|
57
|
+
## CLI Commands
|
|
58
|
+
|
|
59
|
+
### Query
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
# Simple query
|
|
63
|
+
resolvekit query "Georgia"
|
|
64
|
+
|
|
65
|
+
# Output:
|
|
66
|
+
# ⚠️ Ambiguous input. Top matches:
|
|
67
|
+
#
|
|
68
|
+
# 1. Georgia (Country) [dcid: country/GEO]
|
|
69
|
+
# - Confidence: 0.60
|
|
70
|
+
# - Codes: ISO2=GE, ISO3=GEO, M49=268
|
|
71
|
+
#
|
|
72
|
+
# 2. Georgia (US State) [dcid: geoId/13]
|
|
73
|
+
# - Confidence: 0.40
|
|
74
|
+
# - Codes: FIPS=13
|
|
75
|
+
|
|
76
|
+
# With context
|
|
77
|
+
resolvekit query "Georgia" --context "Tbilisi"
|
|
78
|
+
|
|
79
|
+
# With constraints
|
|
80
|
+
resolvekit query "Paris" --entity-type city
|
|
81
|
+
resolvekit query "Ontario" --parent country/CAN
|
|
82
|
+
|
|
83
|
+
# JSON output
|
|
84
|
+
resolvekit query "France" --format json
|
|
85
|
+
|
|
86
|
+
# With explanation
|
|
87
|
+
resolvekit query "Türkiye" --explain
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
### Convert
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
# Single code conversion
|
|
94
|
+
resolvekit convert FRA --from iso3 --to dcid
|
|
95
|
+
# Output: country/FRA
|
|
96
|
+
|
|
97
|
+
# Get all codes
|
|
98
|
+
resolvekit convert country/FRA --from dcid
|
|
99
|
+
# Output:
|
|
100
|
+
# iso2: FR
|
|
101
|
+
# iso3: FRA
|
|
102
|
+
# m49: 250
|
|
103
|
+
# nuts: FR
|
|
104
|
+
# wikidata: Q142
|
|
105
|
+
|
|
106
|
+
# Batch file
|
|
107
|
+
resolvekit convert codes.csv --from iso3 --to dcid --output results.csv
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
### Batch
|
|
111
|
+
|
|
112
|
+
```bash
|
|
113
|
+
# Process CSV file
|
|
114
|
+
resolvekit batch input.csv \
|
|
115
|
+
--column location \
|
|
116
|
+
--output enriched.csv \
|
|
117
|
+
--min-confidence 0.8
|
|
118
|
+
|
|
119
|
+
# With additional context columns
|
|
120
|
+
resolvekit batch data.csv \
|
|
121
|
+
--column country_name \
|
|
122
|
+
--context-column region \
|
|
123
|
+
--output results.csv
|
|
124
|
+
|
|
125
|
+
# JSON input/output
|
|
126
|
+
resolvekit batch input.json --format json --output results.json
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
### Info
|
|
130
|
+
|
|
131
|
+
```bash
|
|
132
|
+
# Entity details
|
|
133
|
+
resolvekit info country/FRA
|
|
134
|
+
|
|
135
|
+
# Output:
|
|
136
|
+
# Entity: France
|
|
137
|
+
# DCID: country/FRA
|
|
138
|
+
# Type: country
|
|
139
|
+
# Codes:
|
|
140
|
+
# ISO2: FR
|
|
141
|
+
# ISO3: FRA
|
|
142
|
+
# M49: 250
|
|
143
|
+
# Parent: None
|
|
144
|
+
# Children (ADM1): 18 regions
|
|
145
|
+
# Memberships: EU, OECD, G7, G20, ...
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
### Members
|
|
149
|
+
|
|
150
|
+
```bash
|
|
151
|
+
# Current members
|
|
152
|
+
resolvekit members EU
|
|
153
|
+
|
|
154
|
+
# Historical members
|
|
155
|
+
resolvekit members EU --as-of 2004-01-01
|
|
156
|
+
|
|
157
|
+
# Output as list of DCIDs
|
|
158
|
+
resolvekit members OECD --format dcids
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
### Update
|
|
162
|
+
|
|
163
|
+
```bash
|
|
164
|
+
# Check for updates
|
|
165
|
+
resolvekit update --check
|
|
166
|
+
|
|
167
|
+
# Download and install latest
|
|
168
|
+
resolvekit update --install
|
|
169
|
+
|
|
170
|
+
# List available versions
|
|
171
|
+
resolvekit update --list
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
### Version
|
|
175
|
+
|
|
176
|
+
```bash
|
|
177
|
+
# Show versions
|
|
178
|
+
resolvekit version
|
|
179
|
+
|
|
180
|
+
# Output:
|
|
181
|
+
# resolvekit: 0.0.1
|
|
182
|
+
# Data pack: 1.2.0 (2025-10-20)
|
|
183
|
+
# Python: 3.13
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
## Output Formats
|
|
187
|
+
|
|
188
|
+
### Human-readable (default)
|
|
189
|
+
- Colored terminal output
|
|
190
|
+
- Formatted tables
|
|
191
|
+
- Progress indicators
|
|
192
|
+
|
|
193
|
+
### JSON
|
|
194
|
+
- Machine-readable
|
|
195
|
+
- Full result structure
|
|
196
|
+
- Suitable for piping
|
|
197
|
+
|
|
198
|
+
### CSV
|
|
199
|
+
- Batch results
|
|
200
|
+
- Compatible with spreadsheets
|
|
201
|
+
- Configurable columns
|
|
202
|
+
|
|
203
|
+
## Design Principles
|
|
204
|
+
|
|
205
|
+
1. **Unix-friendly**: Composable with pipes, appropriate exit codes
|
|
206
|
+
2. **Interactive**: Clear prompts for ambiguous cases
|
|
207
|
+
3. **Informative**: Helpful error messages with suggestions
|
|
208
|
+
4. **Fast feedback**: Show progress for long operations
|
|
209
|
+
|
|
210
|
+
## Implementation Priority
|
|
211
|
+
|
|
212
|
+
**Phase A** - Core resolver (query, batch, version)
|
|
213
|
+
**Phase B** - Code conversion (convert, info, members)
|
|
214
|
+
**Phase D** - Package updates (update)
|
|
215
|
+
**Phase F** - Entity extraction (extract)
|
|
File without changes
|
resolvekit/cli/main.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""CLI entry point for resolvekit."""
|
|
2
|
+
|
|
3
|
+
import sys
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def cli() -> None:
    """Print the placeholder banner and planned command list, then exit with status 0."""
    banner_lines = (
        "resolvekit CLI - Coming soon!",
        "",
        "Available commands (to be implemented):",
        " resolvekit query <text> - Resolve entity from text",
        " resolvekit convert <code> - Convert between code systems",
        " resolvekit batch <file> - Batch process CSV/JSON file",
        " resolvekit info <dcid> - Show entity information",
        " resolvekit members <group> - List group members",
        " resolvekit update - Update data packs",
        " resolvekit version - Show version information",
    )
    # One print() per entry; the empty entry yields the blank separator line.
    for line in banner_lines:
        print(line)
    sys.exit(0)
|
resolvekit/config.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
"""Configuration management for resolvekit using Pydantic Settings."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from pydantic import Field
|
|
6
|
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class ResolvekitConfig(BaseSettings):
    """
    Settings for the resolvekit resolver.

    Built on Pydantic Settings, so each field can be supplied by:
    1. An environment variable prefixed with RESOLVEKIT_
    2. A .env file (if present)
    3. The default declared on the field

    Example:
        # Set via environment
        export RESOLVEKIT_DATA_DIR="/path/to/data"
        export RESOLVEKIT_MIN_CONFIDENCE=0.8

        # Or in .env file
        RESOLVEKIT_DATA_DIR=/path/to/data
        RESOLVEKIT_MIN_CONFIDENCE=0.8

        # Use in code
        config = ResolvekitConfig()
        print(config.data_dir)  # Path('/path/to/data')
    """

    model_config = SettingsConfigDict(
        env_prefix="RESOLVEKIT_",
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
    )

    # --- Filesystem locations ---
    data_dir: Path = Field(
        default=Path.home() / ".resolvekit" / "data",
        description="Directory containing data packs",
    )
    cache_dir: Path = Field(
        default=Path.home() / ".resolvekit" / "cache",
        description="Cache directory",
    )

    # --- Resolution behaviour ---
    min_confidence: float = Field(
        default=0.7,
        ge=0.0,
        le=1.0,
        description="Minimum confidence threshold for matches",
    )
    max_candidates: int = Field(
        default=50,
        gt=0,
        le=1000,
        description="Maximum number of candidates to consider",
    )
    max_alternates: int = Field(
        default=5,
        gt=0,
        le=100,
        description="Maximum number of alternative matches to return",
    )

    # --- Full-text and fuzzy search ---
    fts_top_k: int = Field(
        default=50,
        gt=0,
        le=1000,
        description="Top K results from FTS",
    )
    fuzzy_top_k: int = Field(
        default=12,
        gt=0,
        le=100,
        description="Top K results after fuzzy matching",
    )

    # --- Ambiguity detection ---
    margin_threshold: float = Field(
        default=0.15,
        ge=0.0,
        le=1.0,
        description="Score margin threshold for ambiguity detection",
    )

    # --- Logging ---
    # Both patterns are anchored, so only the exact listed values pass.
    log_level: str = Field(
        default="INFO",
        pattern=r"^(DEBUG|INFO|WARNING|ERROR|CRITICAL)$",
        description="Logging level",
    )
    log_format: str = Field(
        default="human",
        pattern=r"^(human|json)$",
        description="Log format (human or json)",
    )
    redact_queries: bool = Field(
        default=True,
        description="Redact query content in logs for privacy",
    )

    # --- Performance ---
    batch_size: int = Field(
        default=1000,
        gt=0,
        le=100000,
        description="Batch size for batch processing",
    )
    num_workers: int = Field(
        default=4,
        gt=0,
        le=64,
        description="Number of worker processes for parallel processing",
    )

    def ensure_dirs(self) -> None:
        """Create the data and cache directories (and parents) if missing."""
        for directory in (self.data_dir, self.cache_dir):
            directory.mkdir(parents=True, exist_ok=True)


# Module-level default instance, built at import time (can be overridden)
config = ResolvekitConfig()
|
resolvekit/constants.py
ADDED
|
@@ -0,0 +1,252 @@
|
|
|
1
|
+
"""Constants for resolvekit."""
|
|
2
|
+
|
|
3
|
+
from resolvekit.types import EntityType
|
|
4
|
+
|
|
5
|
+
# Package version
VERSION = "0.0.1"

# Resolver defaults
DEFAULT_MIN_CONFIDENCE = 0.7
DEFAULT_MAX_CANDIDATES = 50
DEFAULT_MAX_ALTERNATES = 5
DEFAULT_FTS_TOP_K = 50
DEFAULT_FUZZY_TOP_K = 12

# Ambiguity detection: trigger semantic if the score margin is below this
DEFAULT_MARGIN_THRESHOLD = 0.15

# Default locations (the leading "~" is stored unexpanded)
DEFAULT_DATA_DIR = "~/.resolvekit/data"
DEFAULT_CACHE_DIR = "~/.resolvekit/cache"

# Database PRAGMA settings (presumably applied to the SQLite connection)
DB_PRAGMAS = {
    "journal_mode": "OFF",
    "synchronous": "OFF",
    "temp_store": "MEMORY",
    "mmap_size": 256 * 1024 * 1024,  # ~256MB, in bytes
    "cache_size": -100 * 1000,  # ~100MB (SQLite reads a negative value as KiB)
}

# FTS5 index settings
FTS5_TOKENIZER = "unicode61 remove_diacritics 2 tokenchars '.-'"
FTS5_PREFIX = "2,3"

# Performance targets, in milliseconds
TARGET_LATENCY_P50 = 10
TARGET_LATENCY_P95 = 50
TARGET_STARTUP_TIME = 5 * 1000  # 5 seconds

# Memory targets, in MB
TARGET_MEMORY_BASE = 2 * 1024  # 2GB
TARGET_MEMORY_EMBEDDINGS = 4 * 1024  # 4GB
TARGET_MEMORY_FULL = 6 * 1024  # 6GB

# Batch processing defaults
DEFAULT_BATCH_SIZE = 1000
DEFAULT_NUM_WORKERS = 4
|
|
48
|
+
|
|
49
|
+
# Entity type hierarchy: each key maps to the list of entity types listed
# beneath it. The geographic levels form one chain, so every level maps to
# all levels that follow it in the chain.
_GEO_LEVELS = (
    EntityType.COUNTRY,
    EntityType.ADMIN1,
    EntityType.ADMIN2,
    EntityType.ADMIN3,
    EntityType.ADMIN4,
    EntityType.CITY,
)
ENTITY_TYPE_HIERARCHY = {
    # The last level (CITY) has nothing beneath it and is not a key.
    level: list(_GEO_LEVELS[depth + 1 :])
    for depth, level in enumerate(_GEO_LEVELS[:-1])
}
ENTITY_TYPE_HIERARCHY[EntityType.ORGANIZATION] = [EntityType.GROUP]
|
|
69
|
+
|
|
70
|
+
# Code system metadata, keyed by code system identifier.
# Row layout: key, name, authority, format (regex), example.
# NOTE(review): the dcid pattern rejects "geoId/GR-MAC", which appears in
# KNOWN_AMBIGUOUS in this module - confirm which of the two is intended.
_CODE_SYSTEM_ROWS = (
    ("dcid", "Data Commons ID", "Google Data Commons", r"^[a-zA-Z]+/[A-Z0-9]+$", "country/USA"),
    ("iso2", "ISO 3166-1 alpha-2", "ISO", r"^[A-Z]{2}$", "US"),
    ("iso3", "ISO 3166-1 alpha-3", "ISO", r"^[A-Z]{3}$", "USA"),
    ("iso_numeric", "ISO 3166-1 numeric", "ISO", r"^\d{3}$", "840"),
    ("m49", "UN M49", "UN Statistics Division", r"^\d{3}$", "840"),
    ("nuts", "NUTS codes", "Eurostat", r"^[A-Z]{2}[A-Z0-9]{0,3}$", "FR"),
    ("pcode", "OCHA P-codes", "UN OCHA", r"^[A-Z]{2}[A-Z0-9]+$", "FR01"),
    ("wikidata", "Wikidata QID", "Wikidata", r"^Q\d+$", "Q30"),
    ("geonames", "GeoNames ID", "GeoNames", r"^\d+$", "6252001"),
)
CODE_SYSTEMS = {
    key: {
        "name": name,
        "authority": authority,
        "format": pattern,
        "example": example,
    }
    for key, name, authority, pattern, example in _CODE_SYSTEM_ROWS
}
|
|
127
|
+
|
|
128
|
+
# Source precedence: a larger number means a higher priority.
# "un", "eurostat" and "ocha" share the same rank.
SOURCE_PRECEDENCE = {
    "user": 100,
    "custom": 90,
    "datacommons": 80,
    "iso": 70,
    "un": 60,
    "eurostat": 60,
    "ocha": 60,
    "wikidata": 50,
    "geonames": 40,
    "other": 10,
}

# Overlay precedence ranges per overlay kind
# (presumably inclusive (low, high) bounds - verify against the overlay code).
OVERLAY_PRECEDENCE = {
    "user": (100, 999),
    "org": (10, 99),
    "base": (0, 0),
}
|
|
148
|
+
|
|
149
|
+
# Calibration feature names. The list is ordered; keep the order stable
# (presumably it must line up with the weights of a calibration model -
# verify against the calibration code).
CALIBRATION_FEATURES = [
    "f_exact_code",
    "f_canonical_exact",
    "f_alias_exact",
    "f_alias_type_canonical",
    "f_alias_type_endonym",
    "f_alias_type_exonym",
    "f_alias_type_abbr",
    "f_alias_type_code",
    "f_fts_score",
    "f_fts_rank_inv",
    "f_edit_distance_norm",
    "f_trigram_jaccard",
    "f_parent_valid",
    "f_type_valid",
    "f_date_valid",
    "f_sem_used",
    "f_sem_sim",
    "f_ambiguity_flag",
    "f_region_hint_match",
]
|
|
171
|
+
|
|
172
|
+
# Known ambiguous terms, keyed by lowercase term; each value lists the
# candidate DCIDs. Partial list only - the full list is in the ambiguity
# registry.
KNOWN_AMBIGUOUS = {
    "georgia": ["country/GEO", "geoId/13"],
    "congo": ["country/COD", "country/COG"],
    "guinea": ["country/GIN", "country/GNQ", "country/GNB", "country/PNG"],
    "korea": ["country/KOR", "country/PRK"],
    "macedonia": ["country/MKD", "geoId/GR-MAC"],
    # Too many candidates to list, so the entry is left empty.
    "springfield": [],
}
|
|
181
|
+
|
|
182
|
+
# Common lowercase words that are not entities.
ENTITY_STOPWORDS = {
    # Months
    "january", "february", "march", "april", "may", "june",
    "july", "august", "september", "october", "november", "december",
    # Days of the week
    "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday",
    # Common words that look like places
    "reading", "normal", "mobile", "aurora",
    "phoenix",  # Problematic: city vs. mythical bird
    "victoria",  # Problematic: can be place
    # Directions
    "north", "south", "east", "west",
}
|
|
218
|
+
|
|
219
|
+
# Schema version of the data pack manifest
MANIFEST_SCHEMA_VERSION = "1.0.0"

# Supported languages, as ISO 639-1 codes
SUPPORTED_LANGUAGES = [
    # Latin script
    "en",  # English
    "es",  # Spanish
    "fr",  # French
    "de",  # German
    "it",  # Italian
    "pt",  # Portuguese
    "nl",  # Dutch
    # Other scripts
    "ru",  # Russian (Cyrillic script)
    "ar",  # Arabic (Arabic script)
    "zh",  # Chinese (CJK)
    "ja",  # Japanese (CJK)
    "ko",  # Korean (Hangul)
]

# Logging: %-style record format and strftime date format
LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
LOG_DATE_FORMAT = "%Y-%m-%d %H:%M:%S"

# API defaults
API_RATE_LIMIT = 100  # requests per second
API_TIMEOUT = 30  # seconds
API_MAX_BATCH_SIZE = 10000

# CLI defaults
CLI_MAX_DISPLAY_ALTERNATES = 5
CLI_COLOR_PRIMARY = "blue"
CLI_COLOR_SUCCESS = "green"
CLI_COLOR_WARNING = "yellow"
CLI_COLOR_ERROR = "red"
|