resolvekit 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. resolvekit/README.md +134 -0
  2. resolvekit/__init__.py +67 -0
  3. resolvekit/api/README.md +165 -0
  4. resolvekit/api/__init__.py +10 -0
  5. resolvekit/api/convenience.py +53 -0
  6. resolvekit/api/resolver.py +457 -0
  7. resolvekit/builders/README.md +173 -0
  8. resolvekit/builders/__init__.py +0 -0
  9. resolvekit/calibration/README.md +351 -0
  10. resolvekit/calibration/__init__.py +12 -0
  11. resolvekit/calibration/calibrator.py +184 -0
  12. resolvekit/calibration/features.py +139 -0
  13. resolvekit/calibration/models.py +78 -0
  14. resolvekit/cli/README.md +215 -0
  15. resolvekit/cli/__init__.py +0 -0
  16. resolvekit/cli/main.py +18 -0
  17. resolvekit/config.py +128 -0
  18. resolvekit/constants.py +252 -0
  19. resolvekit/constraints/README.md +102 -0
  20. resolvekit/constraints/__init__.py +17 -0
  21. resolvekit/constraints/constraint_engine.py +111 -0
  22. resolvekit/constraints/hierarchy_validator.py +148 -0
  23. resolvekit/constraints/membership_validator.py +60 -0
  24. resolvekit/constraints/protocols.py +33 -0
  25. resolvekit/constraints/temporal_validator.py +43 -0
  26. resolvekit/constraints/type_validator.py +42 -0
  27. resolvekit/data/README.md +165 -0
  28. resolvekit/data/__init__.py +14 -0
  29. resolvekit/data/alias_repository.py +206 -0
  30. resolvekit/data/code_repository.py +85 -0
  31. resolvekit/data/context_filters.py +49 -0
  32. resolvekit/data/db_manager.py +196 -0
  33. resolvekit/data/entity_repository.py +466 -0
  34. resolvekit/data/membership_repository.py +107 -0
  35. resolvekit/data/query_builder.py +177 -0
  36. resolvekit/data/schema.py +122 -0
  37. resolvekit/disambiguation/README.md +72 -0
  38. resolvekit/disambiguation/__init__.py +0 -0
  39. resolvekit/extraction/README.md +204 -0
  40. resolvekit/extraction/__init__.py +0 -0
  41. resolvekit/matchers/README.md +77 -0
  42. resolvekit/matchers/__init__.py +65 -0
  43. resolvekit/matchers/alias_exact.py +65 -0
  44. resolvekit/matchers/canonical_name.py +62 -0
  45. resolvekit/matchers/cascade.py +127 -0
  46. resolvekit/matchers/code_validators.py +250 -0
  47. resolvekit/matchers/exact_code.py +177 -0
  48. resolvekit/matchers/fts_matcher.py +106 -0
  49. resolvekit/matchers/fuzzy_matcher.py +142 -0
  50. resolvekit/matchers/priorities.py +174 -0
  51. resolvekit/matchers/protocols.py +75 -0
  52. resolvekit/normalization/README.md +192 -0
  53. resolvekit/normalization/__init__.py +8 -0
  54. resolvekit/normalization/normalizer.py +164 -0
  55. resolvekit/overlays/README.md +226 -0
  56. resolvekit/overlays/__init__.py +0 -0
  57. resolvekit/types.py +534 -0
  58. resolvekit/utils/README.md +188 -0
  59. resolvekit/utils/__init__.py +48 -0
  60. resolvekit/utils/cache.py +109 -0
  61. resolvekit/utils/dates.py +339 -0
  62. resolvekit/utils/errors.py +145 -0
  63. resolvekit/utils/files.py +366 -0
  64. resolvekit/utils/logging.py +219 -0
  65. resolvekit/utils/text.py +475 -0
  66. resolvekit/utils/validation.py +301 -0
  67. resolvekit-0.0.1.dist-info/METADATA +36 -0
  68. resolvekit-0.0.1.dist-info/RECORD +70 -0
  69. resolvekit-0.0.1.dist-info/WHEEL +4 -0
  70. resolvekit-0.0.1.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,165 @@
1
+ # Data Module
2
+
3
+ ## Purpose
4
+
5
+ The data module handles all data storage, retrieval, and management including SQLite database operations, data pack loading, and overlay management.
6
+
7
+ ## Components
8
+
9
+ ### Core Components
10
+
11
+ 1. **Database Manager** (`db_manager.py`)
12
+ - SQLite connection management
13
+ - PRAGMA configuration for performance
14
+ - Database attachment (base + overlays)
15
+ - Transaction management
16
+
17
+ 2. **Schema** (`schema.py`)
18
+ - SQL schema definitions for all tables
19
+ - FTS5 virtual table configuration
20
+ - Indexes and constraints
21
+ - Migration utilities
22
+
23
+ 3. **Models** (`models.py`)
24
+ - Python data classes for entities, aliases, codes, etc.
25
+ - ORM-like interface (or raw SQL with dataclass mapping)
26
+ - Type-safe data access
27
+
28
+ 4. **Loaders** (`loaders.py`)
29
+ - Load entities, aliases, codes from SQLite
30
+ - Cache frequently accessed data
31
+ - Lazy loading for large datasets
32
+
33
+ 5. **Query Builder** (`query_builder.py`)
34
+ - Construct SQL queries for various operations
35
+ - Handle union queries across base + overlays
36
+ - FTS query construction with proper escaping
37
+
38
+ ### Data Access Layer
39
+
40
+ - `entities.py`: Entity CRUD operations
41
+ - `aliases.py`: Alias lookup and search
42
+ - `codes.py`: Code system lookups
43
+ - `hierarchies.py`: Hierarchy traversal queries
44
+ - `memberships.py`: Group membership queries
45
+ - `provenance.py`: Data source attribution
46
+
47
+ ## Database Schema
48
+
49
+ ### Main Tables
50
+
51
+ ```sql
52
+ -- Entities table
53
+ CREATE TABLE entities (
54
+ dcid TEXT PRIMARY KEY,
55
+ canonical_name TEXT NOT NULL,
56
+ entity_type TEXT NOT NULL,
57
+ parent_dcid TEXT,
58
+ centroid_lat REAL,
59
+ centroid_lon REAL,
60
+ valid_from TEXT,
61
+ valid_until TEXT,
62
+ FOREIGN KEY (parent_dcid) REFERENCES entities(dcid)
63
+ );
64
+
65
+ -- Aliases table
66
+ CREATE TABLE aliases (
67
+ alias_id INTEGER PRIMARY KEY,
68
+ entity_dcid TEXT NOT NULL,
69
+ alias_text TEXT NOT NULL,
70
+ alias_norm TEXT NOT NULL,
71
+ language TEXT,
72
+ alias_type TEXT CHECK(alias_type IN ('canonical','endonym','exonym','abbr','code')),
73
+ valid_from TEXT,
74
+ valid_until TEXT,
75
+ source TEXT,
76
+ alias_uid TEXT UNIQUE,
77
+ FOREIGN KEY (entity_dcid) REFERENCES entities(dcid)
78
+ );
79
+
80
+ -- FTS5 virtual table
81
+ CREATE VIRTUAL TABLE aliases_fts USING fts5(
82
+ alias_norm,
83
+ content='aliases',
84
+ content_rowid='alias_id',
85
+ tokenize = "unicode61 remove_diacritics 2 tokenchars '.-'",
86
+ prefix='2,3'
87
+ );
88
+
89
+ -- Codes table
90
+ CREATE TABLE codes (
91
+ entity_dcid TEXT NOT NULL,
92
+ code_system TEXT NOT NULL,
93
+ code_value TEXT NOT NULL,
94
+ valid_from TEXT,
95
+ valid_until TEXT,
96
+ source TEXT,
97
+ PRIMARY KEY (entity_dcid, code_system),
98
+ FOREIGN KEY (entity_dcid) REFERENCES entities(dcid)
99
+ );
100
+
101
+ -- Memberships table
102
+ CREATE TABLE memberships (
103
+ id INTEGER PRIMARY KEY,
104
+ entity_dcid TEXT NOT NULL,
105
+ group_dcid TEXT NOT NULL,
106
+ valid_from TEXT NOT NULL,
107
+ valid_until TEXT,
108
+ source TEXT,
109
+ FOREIGN KEY (entity_dcid) REFERENCES entities(dcid),
110
+ FOREIGN KEY (group_dcid) REFERENCES entities(dcid)
111
+ );
112
+
113
+ -- Ambiguity registry
114
+ CREATE TABLE ambiguity (
115
+ surface TEXT PRIMARY KEY,
116
+ types TEXT,
117
+ notes TEXT
118
+ );
119
+
120
+ -- Provenance table
121
+ CREATE TABLE provenance (
122
+ id INTEGER PRIMARY KEY,
123
+ entity_dcid TEXT NOT NULL,
124
+ field TEXT,
125
+ source TEXT,
126
+ license TEXT,
127
+ quality INTEGER,
128
+ last_updated TEXT,
129
+ FOREIGN KEY (entity_dcid) REFERENCES entities(dcid)
130
+ );
131
+ ```
132
+
133
+ ## SQLite Configuration
134
+
135
+ Performance pragmas applied on connection:
136
+
137
+ ```sql
138
+ PRAGMA journal_mode=OFF;
139
+ PRAGMA synchronous=OFF;
140
+ PRAGMA temp_store=MEMORY;
141
+ PRAGMA mmap_size=268435456; -- 256 MiB
142
+ PRAGMA cache_size=-100000; -- ~100 MB (a negative value is a size in KiB)
143
+ ```
144
+
145
+ ## Overlay Precedence
146
+
147
+ When querying across base + overlays:
148
+
149
+ 1. User overlays (precedence: 100+)
150
+ 2. Organization overlays (precedence: 10-99)
151
+ 3. Base pack (precedence: 0)
152
+
153
+ Queries use `UNION ALL` with deduplication by `alias_uid` or `dcid`, keeping highest precedence.
154
+
155
+ ## Design Principles
156
+
157
+ 1. **Read-optimized**: Pre-built indexes, no writes at runtime
158
+ 2. **Efficient caching**: Cache hot data (codes, popular entities)
159
+ 3. **Overlay transparency**: Queries automatically span all attached databases
160
+ 4. **Type safety**: Use dataclasses for structured data access
161
+
162
+ ## Implementation Priority
163
+
164
+ - **Phase A** - Core resolver (schema, basic loaders)
165
+ - **Phase C** - Overlay system
@@ -0,0 +1,14 @@
1
+ """Data layer for resolvekit."""
2
+
3
+ from resolvekit.data.db_manager import DatabaseManager
4
+ from resolvekit.data.entity_repository import EntityRepository
5
+ from resolvekit.data.query_builder import QueryBuilder
6
+ from resolvekit.data.schema import create_schema, get_schema_version
7
+
8
+ __all__ = [
9
+ "DatabaseManager",
10
+ "EntityRepository",
11
+ "QueryBuilder",
12
+ "create_schema",
13
+ "get_schema_version",
14
+ ]
@@ -0,0 +1,206 @@
1
+ """Repository for alias and FTS operations."""
2
+
3
+ from typing import Any, ClassVar
4
+
5
+ from resolvekit.data.context_filters import ContextFilterBuilder
6
+ from resolvekit.data.db_manager import DatabaseManager
7
+ from resolvekit.data.query_builder import QueryBuilder
8
+ from resolvekit.normalization.normalizer import TextNormalizer
9
+ from resolvekit.types import Entity, EntityRow, MatchContext
10
+
11
+
12
class AliasRepository:
    """Repository for alias and FTS operations.

    Every lookup is run against the base database (``main``) and each overlay
    listed in ``db_manager.overlays``. The combined rows are then reduced to
    one result per entity DCID in Python.
    """

    # Derive column list from EntityRow model (single source of truth)
    ENTITY_COLUMNS: ClassVar[list[str]] = list(EntityRow.model_fields.keys())

    # SQL rows fetched per requested FTS result. The headroom exists because
    # several rows can belong to the same DCID and are dropped during the
    # Python-side deduplication.
    _FTS_OVERFETCH_FACTOR: ClassVar[int] = 10

    def __init__(self, db_manager: DatabaseManager, normalizer: TextNormalizer):
        """
        Initialize repository.

        Args:
            db_manager: Database manager instance
            normalizer: Text normalizer instance
        """
        self.db = db_manager
        self.normalizer = normalizer
        self.query_builder = QueryBuilder(db_manager)

    def _query_targets(self) -> list[tuple[Any, Any]]:
        """Return (schema_name, precedence) pairs: ``main`` at 0, then overlays."""
        return [("main", 0), *self.db.overlays]

    def _entity_select_list(self) -> str:
        """Return the ``e.<column>`` select list for all entity columns."""
        return ", ".join(f"e.{col}" for col in self.ENTITY_COLUMNS)

    def find_exact_normalized(
        self, normalized: str, context: MatchContext | None = None
    ) -> list[tuple[Entity, str]]:
        """
        Find entities with exact normalized alias match.

        Performance:
        - UNION ALL for overlays (no deduplication in SQL)
        - Dedupe in Python by DCID (keep highest precedence)

        Args:
            normalized: Normalized alias string to match
            context: Optional filtering context

        Returns:
            List of (entity, matched_alias) tuples, one per DCID
        """
        params: dict[str, Any] = {"normalized": normalized}

        # Build context filters using shared utility (also fills ``params``)
        temporal_filter, type_filter, parent_filter = (
            ContextFilterBuilder.build_filters(context, params, table_prefix="e")
        )
        entity_cols = self._entity_select_list()

        # One SELECT per schema: main database first, then every overlay
        selects = [
            f"""
            SELECT {entity_cols}, a.alias_text, {precedence} AS precedence
            FROM {schema_name}.aliases a
            JOIN {schema_name}.entities e ON a.entity_dcid = e.dcid
            WHERE a.alias_norm = :normalized
            {temporal_filter}
            {type_filter}
            {parent_filter}
            """
            for schema_name, precedence in self._query_targets()
        ]

        # Order by precedence to prioritize overlays
        sql = "\nUNION ALL\n".join(selects) + "\nORDER BY precedence DESC"

        result = self.db.execute(sql, params)

        # Dedupe by DCID (keep first occurrence = highest precedence)
        seen_dcids: set[Any] = set()
        matches: list[tuple[Entity, str]] = []

        for row in result:
            entity = self._row_to_entity(row)
            if entity.dcid in seen_dcids:
                continue
            seen_dcids.add(entity.dcid)
            matches.append((entity, row.alias_text))

        return matches

    def search_fts(
        self, query: str, limit: int = 50, context: MatchContext | None = None
    ) -> list[tuple[Entity, float, int]]:
        """
        FTS5 search with BM25 ranking.

        Performance:
        - LIMIT pushed to SQL (bound as a parameter, with headroom for dedupe)
        - UNION ALL for overlays
        - ORDER BY rank LIMIT in SQL

        Args:
            query: Query string for FTS search
            limit: Maximum results to return; a value <= 0 yields an empty list
            context: Optional filtering context

        Returns:
            List of (entity, bm25_score, rank) tuples, where ``rank`` is the
            1-based position among the deduplicated results
        """
        max_results = int(limit)
        if max_results <= 0:
            # A negative LIMIT means "no limit" in SQLite, so never send one.
            return []

        params: dict[str, Any] = {
            "query": query,
            # Generous SQL LIMIT to reduce memory/IO while leaving headroom
            # for deduplication.
            "sql_limit": max_results * self._FTS_OVERFETCH_FACTOR,
        }

        # Build context filters using shared utility (also fills ``params``)
        temporal_filter, type_filter, parent_filter = (
            ContextFilterBuilder.build_filters(context, params, table_prefix="e")
        )
        entity_cols = self._entity_select_list()

        # Note: FTS5 rank is negative (closer to 0 = better)
        # fts.rank is equivalent to bm25(aliases_fts) in SQLite 3.20.0+
        selects = [
            f"""
            SELECT {entity_cols}, fts.rank as rank, {precedence} AS precedence
            FROM {schema_name}.aliases_fts fts
            JOIN {schema_name}.aliases a ON fts.rowid = a.alias_id
            JOIN {schema_name}.entities e ON a.entity_dcid = e.dcid
            WHERE fts.alias_norm MATCH :query
            {temporal_filter}
            {type_filter}
            {parent_filter}
            """
            for schema_name, precedence in self._query_targets()
        ]

        # Order by rank (ascending, since negative), then precedence
        sql = "\nUNION ALL\n".join(selects) + """
            ORDER BY rank ASC, precedence DESC
            LIMIT :sql_limit
        """

        result = self.db.execute(sql, params)

        # Dedupe by DCID, keeping the first occurrence in SQL order.
        # NOTE(review): because rank sorts before precedence, a base row with
        # a better rank wins over an overlay row for the same DCID; precedence
        # only breaks ties on rank. Confirm this is the intended behaviour.
        seen_dcids: set[Any] = set()
        matches: list[tuple[Entity, float, int]] = []

        for row in result:
            entity = self._row_to_entity(row)
            if entity.dcid in seen_dcids:
                continue
            seen_dcids.add(entity.dcid)

            # FTS rank is negative, convert to positive score
            matches.append((entity, abs(row.rank), len(matches) + 1))

            # Stop once we have enough unique results
            if len(matches) >= max_results:
                break

        return matches

    def _row_to_entity(self, row: Any) -> Entity:
        """
        Convert database row to Entity model via Pydantic validation.

        Args:
            row: Database row object exposing each entity column as an attribute

        Returns:
            Validated Entity instance with empty ``codes`` and ``provenance``
        """
        # Read only the entity columns; extra columns such as rank or
        # precedence are ignored here.
        row_dict = {col: getattr(row, col) for col in self.ENTITY_COLUMNS}

        # Validate through EntityRow model
        entity_row = EntityRow.model_validate(row_dict)

        # Convert EntityRow to Entity; codes and provenance are left empty
        return Entity(
            **entity_row.model_dump(),
            codes={},
            provenance={},
        )
@@ -0,0 +1,85 @@
1
+ """Repository for code validation and lookup."""
2
+
3
+ from resolvekit.data.db_manager import DatabaseManager
4
+ from resolvekit.data.entity_repository import EntityRepository
5
+ from resolvekit.matchers.code_validators import get_validator
6
+ from resolvekit.types import CodeSystem, Entity, MatchContext
7
+
8
+
9
class CodeRepository:
    """Repository for code validation and lookup."""

    def __init__(self, db_manager: DatabaseManager, entity_repo: EntityRepository):
        """
        Initialize repository.

        Args:
            db_manager: Database manager instance
            entity_repo: Entity repository for lookups
        """
        self.db = db_manager
        self.entity_repo = entity_repo

    def validate_code(self, system: CodeSystem, value: str) -> tuple[bool, str | None]:
        """
        Validate code format using registered validator.

        Args:
            system: Code system
            value: Code value to validate

        Returns:
            Tuple of (is_valid, error_message). When no validator is
            registered for ``system`` the result is
            ``(False, "Unsupported code system: ...")``.
        """
        # Only the registry lookup is guarded, so a KeyError raised inside a
        # validator is not misreported as an unsupported code system.
        try:
            validator = get_validator(system)
        except KeyError:
            return False, f"Unsupported code system: {system}"
        return validator.validate(value)

    def find_by_code(
        self, system: CodeSystem, value: str, context: MatchContext | None = None
    ) -> Entity | None:
        """
        Validate, normalize, then lookup entity by code.

        Args:
            system: Code system
            value: Code value
            context: Optional filtering context

        Returns:
            Entity if found, None otherwise (including when ``value`` fails
            validation or the entity does not satisfy ``context``)
        """
        # Validate format; the error message is not needed here
        is_valid, _ = self.validate_code(system, value)
        if not is_valid:
            return None

        # Normalize code
        normalized_value = get_validator(system).normalize(value)

        if system != CodeSystem.DCID:
            # Lookup via EntityRepository.find_by_code (with context for
            # SQL-level filtering)
            return self.entity_repo.find_by_code(
                system.value, normalized_value, context
            )

        # Special case: DCID is the entity primary key, not stored in codes table
        entity = self.entity_repo.find_by_dcid(
            dcid=normalized_value,
            as_of=context.as_of if context else None,
        )
        if not entity or not context:
            return entity

        # find_by_dcid only receives as_of, so the remaining context filters
        # are applied here.
        if context.entity_type and entity.entity_type != context.entity_type:
            return None
        if context.parent_dcid and entity.parent_dcid != context.parent_dcid:
            return None

        return entity
@@ -0,0 +1,49 @@
1
+ """Shared utilities for building SQL context filters."""
2
+
3
+ from typing import Any
4
+
5
+ from resolvekit.types import MatchContext
6
+
7
+
8
class ContextFilterBuilder:
    """Turns a match context into SQL ``AND ...`` filter fragments."""

    @staticmethod
    def build_filters(
        context: MatchContext | None,
        params: dict[str, Any],
        table_prefix: str = "",
    ) -> tuple[str, str, str]:
        """
        Produce the temporal, type and parent filter clauses for a query.

        Each clause is an empty string when the matching context attribute is
        unset; otherwise it starts with ``AND`` and refers to a named bind
        parameter whose value is written into ``params``.

        Args:
            context: Optional filtering context
            params: Bind-parameter dict, updated in place with ``as_of``,
                ``entity_type`` and ``parent_dcid`` as needed
            table_prefix: Optional table alias (e.g., "e"); a non-empty value
                is applied to column names as "e."

        Returns:
            Tuple of (temporal_filter, type_filter, parent_filter) SQL clauses
        """
        # Nothing to filter on: all three clauses stay empty.
        if not context:
            return "", "", ""

        column = f"{table_prefix}." if table_prefix else ""
        temporal_clause = type_clause = parent_clause = ""

        as_of = context.as_of
        if as_of:
            # A NULL bound is treated as open-ended on that side.
            temporal_clause = f"""
                AND ({column}valid_from IS NULL OR {column}valid_from <= :as_of)
                AND ({column}valid_until IS NULL OR {column}valid_until >= :as_of)
            """
            params["as_of"] = as_of.isoformat()

        entity_type = context.entity_type
        if entity_type:
            type_clause = f"AND {column}entity_type = :entity_type"
            params["entity_type"] = entity_type.value

        parent_dcid = context.parent_dcid
        if parent_dcid:
            parent_clause = f"AND {column}parent_dcid = :parent_dcid"
            params["parent_dcid"] = parent_dcid

        return temporal_clause, type_clause, parent_clause