resolvekit 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. resolvekit/README.md +134 -0
  2. resolvekit/__init__.py +67 -0
  3. resolvekit/api/README.md +165 -0
  4. resolvekit/api/__init__.py +10 -0
  5. resolvekit/api/convenience.py +53 -0
  6. resolvekit/api/resolver.py +457 -0
  7. resolvekit/builders/README.md +173 -0
  8. resolvekit/builders/__init__.py +0 -0
  9. resolvekit/calibration/README.md +351 -0
  10. resolvekit/calibration/__init__.py +12 -0
  11. resolvekit/calibration/calibrator.py +184 -0
  12. resolvekit/calibration/features.py +139 -0
  13. resolvekit/calibration/models.py +78 -0
  14. resolvekit/cli/README.md +215 -0
  15. resolvekit/cli/__init__.py +0 -0
  16. resolvekit/cli/main.py +18 -0
  17. resolvekit/config.py +128 -0
  18. resolvekit/constants.py +252 -0
  19. resolvekit/constraints/README.md +102 -0
  20. resolvekit/constraints/__init__.py +17 -0
  21. resolvekit/constraints/constraint_engine.py +111 -0
  22. resolvekit/constraints/hierarchy_validator.py +148 -0
  23. resolvekit/constraints/membership_validator.py +60 -0
  24. resolvekit/constraints/protocols.py +33 -0
  25. resolvekit/constraints/temporal_validator.py +43 -0
  26. resolvekit/constraints/type_validator.py +42 -0
  27. resolvekit/data/README.md +165 -0
  28. resolvekit/data/__init__.py +14 -0
  29. resolvekit/data/alias_repository.py +206 -0
  30. resolvekit/data/code_repository.py +85 -0
  31. resolvekit/data/context_filters.py +49 -0
  32. resolvekit/data/db_manager.py +196 -0
  33. resolvekit/data/entity_repository.py +466 -0
  34. resolvekit/data/membership_repository.py +107 -0
  35. resolvekit/data/query_builder.py +177 -0
  36. resolvekit/data/schema.py +122 -0
  37. resolvekit/disambiguation/README.md +72 -0
  38. resolvekit/disambiguation/__init__.py +0 -0
  39. resolvekit/extraction/README.md +204 -0
  40. resolvekit/extraction/__init__.py +0 -0
  41. resolvekit/matchers/README.md +77 -0
  42. resolvekit/matchers/__init__.py +65 -0
  43. resolvekit/matchers/alias_exact.py +65 -0
  44. resolvekit/matchers/canonical_name.py +62 -0
  45. resolvekit/matchers/cascade.py +127 -0
  46. resolvekit/matchers/code_validators.py +250 -0
  47. resolvekit/matchers/exact_code.py +177 -0
  48. resolvekit/matchers/fts_matcher.py +106 -0
  49. resolvekit/matchers/fuzzy_matcher.py +142 -0
  50. resolvekit/matchers/priorities.py +174 -0
  51. resolvekit/matchers/protocols.py +75 -0
  52. resolvekit/normalization/README.md +192 -0
  53. resolvekit/normalization/__init__.py +8 -0
  54. resolvekit/normalization/normalizer.py +164 -0
  55. resolvekit/overlays/README.md +226 -0
  56. resolvekit/overlays/__init__.py +0 -0
  57. resolvekit/types.py +534 -0
  58. resolvekit/utils/README.md +188 -0
  59. resolvekit/utils/__init__.py +48 -0
  60. resolvekit/utils/cache.py +109 -0
  61. resolvekit/utils/dates.py +339 -0
  62. resolvekit/utils/errors.py +145 -0
  63. resolvekit/utils/files.py +366 -0
  64. resolvekit/utils/logging.py +219 -0
  65. resolvekit/utils/text.py +475 -0
  66. resolvekit/utils/validation.py +301 -0
  67. resolvekit-0.0.1.dist-info/METADATA +36 -0
  68. resolvekit-0.0.1.dist-info/RECORD +70 -0
  69. resolvekit-0.0.1.dist-info/WHEEL +4 -0
  70. resolvekit-0.0.1.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,78 @@
1
+ """Calibration model data structures."""
2
+
3
+ import json
4
+ from datetime import date
5
+ from pathlib import Path
6
+ from typing import Literal
7
+
8
+ from pydantic import BaseModel, ConfigDict, Field, ValidationError, model_validator
9
+
10
+ from resolvekit.utils.logging import get_logger
11
+
12
+ logger = get_logger(__name__)
13
+
14
+
15
class CalibrationModel(BaseModel):
    """
    Immutable calibration model definition.

    Instances are populated from JSON files shipped in data packs and carry
    the logistic regression parameters used to turn features into calibrated
    probabilities.

    Attributes:
        type: Model type (only "logistic" is accepted)
        features: Feature names, in order
        weights: Logistic regression weights, one per feature
        bias: Logistic regression bias term
        ece: Expected Calibration Error in [0, 1] (optional)
        trained_on: Date the model was trained (optional)
        notes: Free-form human-readable notes (optional)
    """

    # frozen=True: fields cannot be reassigned after validation.
    model_config = ConfigDict(frozen=True)

    type: Literal["logistic"] = Field(
        description="Calibration model type",
        default="logistic",
    )
    features: list[str] = Field(
        ...,
        description="Feature names in order",
        min_length=1,
    )
    weights: list[float] = Field(
        ...,
        description="Logistic weights",
        min_length=1,
    )
    bias: float = Field(..., description="Logistic bias term")
    ece: float | None = Field(
        default=None,
        description="Expected Calibration Error",
        ge=0.0,
        le=1.0,
    )
    trained_on: date | None = Field(default=None, description="Training date")
    notes: str | None = Field(default=None, description="Human-readable notes")

    @model_validator(mode="after")
    def validate_weights_match_features(self) -> "CalibrationModel":
        """Reject models whose weight count differs from their feature count."""
        n_weights = len(self.weights)
        n_features = len(self.features)
        if n_weights == n_features:
            return self
        raise ValueError(
            f"Weights count ({n_weights}) must match features count ({n_features})"
        )
55
+
56
+
57
def load_calibration_model(path: Path) -> CalibrationModel | None:
    """
    Load calibration model from JSON file.

    The file is decoded as UTF-8 regardless of the platform's default
    encoding. A missing file is treated as "no model available" and yields
    None without logging. Any other failure (unreadable file, bytes that are
    not valid UTF-8, malformed JSON, or JSON that does not satisfy the
    CalibrationModel schema) is logged as a warning and also yields None, so
    this function does not raise for a bad or inaccessible model file.

    Args:
        path: Path to calibration_model.json file

    Returns:
        CalibrationModel if file exists and is valid, None otherwise
    """
    # Read first and react to the failure instead of checking exists() up
    # front: the file can disappear between the check and the read.
    try:
        raw = path.read_text(encoding="utf-8")
    except (FileNotFoundError, NotADirectoryError):
        # Same silent result the previous exists() guard produced.
        return None
    except (OSError, UnicodeDecodeError) as e:
        # e.g. path is a directory, permission denied, or non-UTF-8 bytes.
        logger.warning(f"Could not read calibration model at {path}: {e}")
        return None

    try:
        return CalibrationModel.model_validate(json.loads(raw))
    except json.JSONDecodeError as e:
        logger.warning(f"Invalid JSON in calibration model at {path}: {e}")
        return None
    except ValidationError as e:
        logger.warning(f"Invalid calibration model schema at {path}: {e}")
        return None
@@ -0,0 +1,215 @@
1
+ # CLI Module
2
+
3
+ ## Purpose
4
+
5
+ The CLI module provides a command-line interface for interactive entity resolution, batch processing, and data pack management.
6
+
7
+ ## Components
8
+
9
+ ### Command Modules
10
+
11
+ 1. **Main CLI** (`main.py`)
12
+ - Entry point for CLI application
13
+ - Will use `click` or `typer` for argument parsing (library not yet chosen; `main.py` is currently a placeholder that parses no arguments)
14
+ - Global options and configuration
15
+
16
+ 2. **Query Command** (`query.py`)
17
+ - Single entity resolution from command line
18
+ - Interactive disambiguation prompts
19
+ - Human-readable output with colors
20
+
21
+ 3. **Convert Command** (`convert.py`)
22
+ - Code system conversion
23
+ - Supports single codes or batch files
24
+
25
+ 4. **Batch Command** (`batch.py`)
26
+ - Batch processing of CSV/JSON files
27
+ - Progress bars for large files
28
+ - Error reporting and partial results
29
+
30
+ 5. **Extract Command** (`extract.py`)
31
+ - Entity extraction from text files
32
+ - Supports various input formats
33
+ - (Phase F)
34
+
35
+ 6. **Info Command** (`info.py`)
36
+ - Display detailed entity information
37
+ - Show codes, hierarchy, memberships
38
+
39
+ 7. **Members Command** (`members.py`)
40
+ - List group members
41
+ - Temporal support (as_of date)
42
+
43
+ 8. **Update Command** (`update.py`)
44
+ - Check for data pack updates
45
+ - Download and install new versions
46
+
47
+ 9. **Version Command** (`version.py`)
48
+ - Show tool and data pack versions
49
+ - Component information
50
+
51
+ ### Output Formatters
52
+
53
+ - `formatters.py`: Output formatting (table, JSON, CSV)
54
+ - `colors.py`: Terminal color support
55
+ - `progress.py`: Progress bars for long operations
56
+
57
+ ## CLI Commands
58
+
59
+ ### Query
60
+
61
+ ```bash
62
+ # Simple query
63
+ resolvekit query "Georgia"
64
+
65
+ # Output:
66
+ # ⚠️ Ambiguous input. Top matches:
67
+ #
68
+ # 1. Georgia (Country) [dcid: country/GEO]
69
+ # - Confidence: 0.60
70
+ # - Codes: ISO2=GE, ISO3=GEO, M49=268
71
+ #
72
+ # 2. Georgia (US State) [dcid: geoId/13]
73
+ # - Confidence: 0.40
74
+ # - Codes: FIPS=13
75
+
76
+ # With context
77
+ resolvekit query "Georgia" --context "Tbilisi"
78
+
79
+ # With constraints
80
+ resolvekit query "Paris" --entity-type country
81
+ resolvekit query "Ontario" --parent country/CAN
82
+
83
+ # JSON output
84
+ resolvekit query "France" --format json
85
+
86
+ # With explanation
87
+ resolvekit query "Türkiye" --explain
88
+ ```
89
+
90
+ ### Convert
91
+
92
+ ```bash
93
+ # Single code conversion
94
+ resolvekit convert FRA --from iso3 --to dcid
95
+ # Output: country/FRA
96
+
97
+ # Get all codes
98
+ resolvekit convert country/FRA --from dcid
99
+ # Output:
100
+ # iso2: FR
101
+ # iso3: FRA
102
+ # m49: 250
103
+ # nuts: FR
104
+ # wikidata: Q142
105
+
106
+ # Batch file
107
+ resolvekit convert codes.csv --from iso3 --to dcid --output results.csv
108
+ ```
109
+
110
+ ### Batch
111
+
112
+ ```bash
113
+ # Process CSV file
114
+ resolvekit batch input.csv \
115
+ --column location \
116
+ --output enriched.csv \
117
+ --min-confidence 0.8
118
+
119
+ # With additional context columns
120
+ resolvekit batch data.csv \
121
+ --column country_name \
122
+ --context-column region \
123
+ --output results.csv
124
+
125
+ # JSON input/output
126
+ resolvekit batch input.json --format json --output results.json
127
+ ```
128
+
129
+ ### Info
130
+
131
+ ```bash
132
+ # Entity details
133
+ resolvekit info country/FRA
134
+
135
+ # Output:
136
+ # Entity: France
137
+ # DCID: country/FRA
138
+ # Type: country
139
+ # Codes:
140
+ # ISO2: FR
141
+ # ISO3: FRA
142
+ # M49: 250
143
+ # Parent: None
144
+ # Children (ADM1): 18 regions
145
+ # Memberships: EU, OECD, G7, G20, ...
146
+ ```
147
+
148
+ ### Members
149
+
150
+ ```bash
151
+ # Current members
152
+ resolvekit members EU
153
+
154
+ # Historical members
155
+ resolvekit members EU --as-of 2004-01-01
156
+
157
+ # Output as list of DCIDs
158
+ resolvekit members OECD --format dcids
159
+ ```
160
+
161
+ ### Update
162
+
163
+ ```bash
164
+ # Check for updates
165
+ resolvekit update --check
166
+
167
+ # Download and install latest
168
+ resolvekit update --install
169
+
170
+ # List available versions
171
+ resolvekit update --list
172
+ ```
173
+
174
+ ### Version
175
+
176
+ ```bash
177
+ # Show versions
178
+ resolvekit version
179
+
180
+ # Output:
181
+ # resolvekit: 0.0.1
182
+ # Data pack: 1.2.0 (2025-10-20)
183
+ # Python: 3.13
184
+ ```
185
+
186
+ ## Output Formats
187
+
188
+ ### Human-readable (default)
189
+ - Colored terminal output
190
+ - Formatted tables
191
+ - Progress indicators
192
+
193
+ ### JSON
194
+ - Machine-readable
195
+ - Full result structure
196
+ - Suitable for piping
197
+
198
+ ### CSV
199
+ - Batch results
200
+ - Compatible with spreadsheets
201
+ - Configurable columns
202
+
203
+ ## Design Principles
204
+
205
+ 1. **Unix-friendly**: Composable with pipes, appropriate exit codes
206
+ 2. **Interactive**: Clear prompts for ambiguous cases
207
+ 3. **Informative**: Helpful error messages with suggestions
208
+ 4. **Fast feedback**: Show progress for long operations
209
+
210
+ ## Implementation Priority
211
+
212
+ - **Phase A** - Core resolver (query, batch, version)
213
+ - **Phase B** - Code conversion (convert, info, members)
214
+ - **Phase D** - Package updates (update)
215
+ - **Phase F** - Entity extraction (extract)
File without changes
resolvekit/cli/main.py ADDED
@@ -0,0 +1,18 @@
1
+ """CLI entry point for resolvekit."""
2
+
3
+ import sys
4
+
5
+
6
def cli() -> None:
    """Print the placeholder banner and planned commands, then exit with status 0."""
    banner = (
        "resolvekit CLI - Coming soon!",
        "",
        "Available commands (to be implemented):",
        " resolvekit query <text> - Resolve entity from text",
        " resolvekit convert <code> - Convert between code systems",
        " resolvekit batch <file> - Batch process CSV/JSON file",
        " resolvekit info <dcid> - Show entity information",
        " resolvekit members <group> - List group members",
        " resolvekit update - Update data packs",
        " resolvekit version - Show version information",
    )
    for line in banner:
        print(line)
    sys.exit(0)
resolvekit/config.py ADDED
@@ -0,0 +1,128 @@
1
+ """Configuration management for resolvekit using Pydantic Settings."""
2
+
3
+ from pathlib import Path
4
+
5
+ from pydantic import Field
6
+ from pydantic_settings import BaseSettings, SettingsConfigDict
7
+
8
+
9
class ResolvekitConfig(BaseSettings):
    """
    Settings for the resolvekit resolver, built on Pydantic Settings.

    Configuration is read from:
        1. Environment variables (prefixed with RESOLVEKIT_)
        2. .env file (if present)
        3. Default values

    Example:
        # Via the environment
        export RESOLVEKIT_DATA_DIR="/path/to/data"
        export RESOLVEKIT_MIN_CONFIDENCE=0.8

        # Via a .env file
        RESOLVEKIT_DATA_DIR=/path/to/data
        RESOLVEKIT_MIN_CONFIDENCE=0.8

        # In code
        config = ResolvekitConfig()
        print(config.data_dir)  # Path('/path/to/data')
    """

    model_config = SettingsConfigDict(
        env_prefix="RESOLVEKIT_",
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
    )

    # --- Paths (defaults live under ~/.resolvekit) ---
    data_dir: Path = Field(
        description="Directory containing data packs",
        default=Path.home().joinpath(".resolvekit", "data"),
    )
    cache_dir: Path = Field(
        description="Cache directory",
        default=Path.home().joinpath(".resolvekit", "cache"),
    )

    # --- Resolution settings ---
    min_confidence: float = Field(
        description="Minimum confidence threshold for matches",
        default=0.7,
        ge=0.0,
        le=1.0,
    )
    max_candidates: int = Field(
        description="Maximum number of candidates to consider",
        default=50,
        gt=0,
        le=1000,
    )
    max_alternates: int = Field(
        description="Maximum number of alternative matches to return",
        default=5,
        gt=0,
        le=100,
    )

    # --- FTS settings ---
    fts_top_k: int = Field(
        description="Top K results from FTS",
        default=50,
        gt=0,
        le=1000,
    )
    fuzzy_top_k: int = Field(
        description="Top K results after fuzzy matching",
        default=12,
        gt=0,
        le=100,
    )

    # --- Ambiguity detection ---
    margin_threshold: float = Field(
        description="Score margin threshold for ambiguity detection",
        default=0.15,
        ge=0.0,
        le=1.0,
    )

    # --- Logging ---
    log_level: str = Field(
        description="Logging level",
        default="INFO",
        pattern=r"^(DEBUG|INFO|WARNING|ERROR|CRITICAL)$",
    )
    log_format: str = Field(
        description="Log format (human or json)",
        default="human",
        pattern=r"^(human|json)$",
    )
    redact_queries: bool = Field(
        description="Redact query content in logs for privacy",
        default=True,
    )

    # --- Performance ---
    batch_size: int = Field(
        description="Batch size for batch processing",
        default=1000,
        gt=0,
        le=100000,
    )
    num_workers: int = Field(
        description="Number of worker processes for parallel processing",
        default=4,
        gt=0,
        le=64,
    )

    def ensure_dirs(self) -> None:
        """Create the data and cache directories, including parents, if absent."""
        for directory in (self.data_dir, self.cache_dir):
            directory.mkdir(parents=True, exist_ok=True)


# Module-level default instance, created at import time (can be overridden).
config = ResolvekitConfig()
@@ -0,0 +1,252 @@
1
+ """Constants for resolvekit."""
2
+
3
+ from resolvekit.types import EntityType
4
+
5
# Package version
VERSION = "0.0.1"

# Default resolver configuration
DEFAULT_MIN_CONFIDENCE = 0.7
DEFAULT_MAX_CANDIDATES = 50
DEFAULT_MAX_ALTERNATES = 5
DEFAULT_FTS_TOP_K = 50
DEFAULT_FUZZY_TOP_K = 12

# Ambiguity detection: trigger semantic if margin < this
DEFAULT_MARGIN_THRESHOLD = 0.15

# Default file locations
DEFAULT_DATA_DIR = "~/.resolvekit/data"
DEFAULT_CACHE_DIR = "~/.resolvekit/cache"

# Database configuration
DB_PRAGMAS = dict(
    journal_mode="OFF",
    synchronous="OFF",
    temp_store="MEMORY",
    mmap_size=256 * 1024 * 1024,  # 268435456, ~256MB
    cache_size=-100_000,  # ~100MB
)

# FTS5 configuration
FTS5_TOKENIZER = "unicode61 remove_diacritics 2 tokenchars '.-'"
FTS5_PREFIX = "2,3"

# Performance targets, in milliseconds
TARGET_LATENCY_P50 = 10
TARGET_LATENCY_P95 = 50
TARGET_STARTUP_TIME = 5 * 1000  # 5 seconds

# Memory targets, in MB
TARGET_MEMORY_BASE = 2 * 1024  # 2GB
TARGET_MEMORY_EMBEDDINGS = 4 * 1024  # 4GB
TARGET_MEMORY_FULL = 6 * 1024  # 6GB

# Batch processing defaults
DEFAULT_BATCH_SIZE = 1_000
DEFAULT_NUM_WORKERS = 4
48
+
49
# Entity type hierarchy.
# Each type in the chain below maps to the list of every type that follows
# it; the last entry (CITY) therefore has no key of its own.
_ENTITY_TYPE_CHAIN = (
    EntityType.COUNTRY,
    EntityType.ADMIN1,
    EntityType.ADMIN2,
    EntityType.ADMIN3,
    EntityType.ADMIN4,
    EntityType.CITY,
)

ENTITY_TYPE_HIERARCHY = {
    level: list(_ENTITY_TYPE_CHAIN[depth + 1 :])
    for depth, level in enumerate(_ENTITY_TYPE_CHAIN[:-1])
}
# ORGANIZATION is not part of the chain above and is added explicitly.
ENTITY_TYPE_HIERARCHY[EntityType.ORGANIZATION] = [EntityType.GROUP]
69
+
70
def _code_system(name, authority, pattern, example):
    """Build one CODE_SYSTEMS record; `pattern` is stored under the "format" key."""
    return {
        "name": name,
        "authority": authority,
        "format": pattern,
        "example": example,
    }


# Code system metadata, keyed by code system identifier
CODE_SYSTEMS = {
    "dcid": _code_system(
        "Data Commons ID", "Google Data Commons", r"^[a-zA-Z]+/[A-Z0-9]+$", "country/USA"
    ),
    "iso2": _code_system("ISO 3166-1 alpha-2", "ISO", r"^[A-Z]{2}$", "US"),
    "iso3": _code_system("ISO 3166-1 alpha-3", "ISO", r"^[A-Z]{3}$", "USA"),
    "iso_numeric": _code_system("ISO 3166-1 numeric", "ISO", r"^\d{3}$", "840"),
    "m49": _code_system("UN M49", "UN Statistics Division", r"^\d{3}$", "840"),
    "nuts": _code_system("NUTS codes", "Eurostat", r"^[A-Z]{2}[A-Z0-9]{0,3}$", "FR"),
    "pcode": _code_system("OCHA P-codes", "UN OCHA", r"^[A-Z]{2}[A-Z0-9]+$", "FR01"),
    "wikidata": _code_system("Wikidata QID", "Wikidata", r"^Q\d+$", "Q30"),
    "geonames": _code_system("GeoNames ID", "GeoNames", r"^\d+$", "6252001"),
}

# Source precedence (higher = higher priority)
SOURCE_PRECEDENCE = dict(
    user=100,
    custom=90,
    datacommons=80,
    iso=70,
    un=60,
    eurostat=60,
    ocha=60,
    wikidata=50,
    geonames=40,
    other=10,
)

# Overlay precedence ranges, one pair of bounds per overlay kind
OVERLAY_PRECEDENCE = dict(
    user=(100, 999),
    org=(10, 99),
    base=(0, 0),
)
148
+
149
# Calibration feature names, in order
CALIBRATION_FEATURES = [
    "f_exact_code", "f_canonical_exact", "f_alias_exact",
    "f_alias_type_canonical", "f_alias_type_endonym", "f_alias_type_exonym",
    "f_alias_type_abbr", "f_alias_type_code",
    "f_fts_score", "f_fts_rank_inv",
    "f_edit_distance_norm", "f_trigram_jaccard",
    "f_parent_valid", "f_type_valid", "f_date_valid",
    "f_sem_used", "f_sem_sim",
    "f_ambiguity_flag", "f_region_hint_match",
]

# Known ambiguous terms (partial list - full list in ambiguity registry)
KNOWN_AMBIGUOUS = {
    "georgia": ["country/GEO", "geoId/13"],
    "congo": ["country/COD", "country/COG"],
    "guinea": ["country/GIN", "country/GNQ", "country/GNB", "country/PNG"],
    "korea": ["country/KOR", "country/PRK"],
    "macedonia": ["country/MKD", "geoId/GR-MAC"],
    "springfield": [],  # Too many to list
}

# Common stopwords that aren't entities
ENTITY_STOPWORDS = {
    # Months
    "january", "february", "march", "april", "may", "june",
    "july", "august", "september", "october", "november", "december",
    # Days
    "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday",
    # Common words that look like places
    "reading", "normal", "mobile",
    "phoenix",  # Problematic: city vs. mythical bird
    "aurora",
    "victoria",  # Problematic: can be place
    # Directions
    "north", "south", "east", "west",
}
218
+
219
# Schema version of the data pack manifest
MANIFEST_SCHEMA_VERSION = "1.0.0"

# Supported languages (ISO 639-1 codes)
SUPPORTED_LANGUAGES = [
    # English, Spanish, French, German, Italian, Portuguese, Dutch
    "en", "es", "fr", "de", "it", "pt", "nl",
    "ru",  # Russian (Cyrillic script)
    "ar",  # Arabic (Arabic script)
    # Chinese (CJK), Japanese (CJK), Korean (Hangul)
    "zh", "ja", "ko",
]

# Logging configuration
LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
LOG_DATE_FORMAT = "%Y-%m-%d %H:%M:%S"

# API defaults
API_RATE_LIMIT = 100  # requests per second
API_TIMEOUT = 30  # seconds
API_MAX_BATCH_SIZE = 10_000

# CLI defaults
CLI_MAX_DISPLAY_ALTERNATES = 5
CLI_COLOR_PRIMARY, CLI_COLOR_SUCCESS = "blue", "green"
CLI_COLOR_WARNING, CLI_COLOR_ERROR = "yellow", "red"