resolvekit 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- resolvekit/README.md +134 -0
- resolvekit/__init__.py +67 -0
- resolvekit/api/README.md +165 -0
- resolvekit/api/__init__.py +10 -0
- resolvekit/api/convenience.py +53 -0
- resolvekit/api/resolver.py +457 -0
- resolvekit/builders/README.md +173 -0
- resolvekit/builders/__init__.py +0 -0
- resolvekit/calibration/README.md +351 -0
- resolvekit/calibration/__init__.py +12 -0
- resolvekit/calibration/calibrator.py +184 -0
- resolvekit/calibration/features.py +139 -0
- resolvekit/calibration/models.py +78 -0
- resolvekit/cli/README.md +215 -0
- resolvekit/cli/__init__.py +0 -0
- resolvekit/cli/main.py +18 -0
- resolvekit/config.py +128 -0
- resolvekit/constants.py +252 -0
- resolvekit/constraints/README.md +102 -0
- resolvekit/constraints/__init__.py +17 -0
- resolvekit/constraints/constraint_engine.py +111 -0
- resolvekit/constraints/hierarchy_validator.py +148 -0
- resolvekit/constraints/membership_validator.py +60 -0
- resolvekit/constraints/protocols.py +33 -0
- resolvekit/constraints/temporal_validator.py +43 -0
- resolvekit/constraints/type_validator.py +42 -0
- resolvekit/data/README.md +165 -0
- resolvekit/data/__init__.py +14 -0
- resolvekit/data/alias_repository.py +206 -0
- resolvekit/data/code_repository.py +85 -0
- resolvekit/data/context_filters.py +49 -0
- resolvekit/data/db_manager.py +196 -0
- resolvekit/data/entity_repository.py +466 -0
- resolvekit/data/membership_repository.py +107 -0
- resolvekit/data/query_builder.py +177 -0
- resolvekit/data/schema.py +122 -0
- resolvekit/disambiguation/README.md +72 -0
- resolvekit/disambiguation/__init__.py +0 -0
- resolvekit/extraction/README.md +204 -0
- resolvekit/extraction/__init__.py +0 -0
- resolvekit/matchers/README.md +77 -0
- resolvekit/matchers/__init__.py +65 -0
- resolvekit/matchers/alias_exact.py +65 -0
- resolvekit/matchers/canonical_name.py +62 -0
- resolvekit/matchers/cascade.py +127 -0
- resolvekit/matchers/code_validators.py +250 -0
- resolvekit/matchers/exact_code.py +177 -0
- resolvekit/matchers/fts_matcher.py +106 -0
- resolvekit/matchers/fuzzy_matcher.py +142 -0
- resolvekit/matchers/priorities.py +174 -0
- resolvekit/matchers/protocols.py +75 -0
- resolvekit/normalization/README.md +192 -0
- resolvekit/normalization/__init__.py +8 -0
- resolvekit/normalization/normalizer.py +164 -0
- resolvekit/overlays/README.md +226 -0
- resolvekit/overlays/__init__.py +0 -0
- resolvekit/types.py +534 -0
- resolvekit/utils/README.md +188 -0
- resolvekit/utils/__init__.py +48 -0
- resolvekit/utils/cache.py +109 -0
- resolvekit/utils/dates.py +339 -0
- resolvekit/utils/errors.py +145 -0
- resolvekit/utils/files.py +366 -0
- resolvekit/utils/logging.py +219 -0
- resolvekit/utils/text.py +475 -0
- resolvekit/utils/validation.py +301 -0
- resolvekit-0.0.1.dist-info/METADATA +36 -0
- resolvekit-0.0.1.dist-info/RECORD +70 -0
- resolvekit-0.0.1.dist-info/WHEEL +4 -0
- resolvekit-0.0.1.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
"""Repository for group membership queries."""
|
|
2
|
+
|
|
3
|
+
from datetime import date
|
|
4
|
+
|
|
5
|
+
from sqlalchemy import text
|
|
6
|
+
from sqlmodel import Session
|
|
7
|
+
|
|
8
|
+
from resolvekit.data.db_manager import DatabaseManager
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class MembershipRepository:
    """Repository for querying group memberships with temporal support.

    Rows are read from ``main.memberships`` and from the ``memberships``
    table of every overlay schema listed in ``db_manager.overlays``.
    """

    def __init__(self, db_manager: DatabaseManager):
        """
        Initialize repository.

        Args:
            db_manager: Database manager instance. This class reads its
                ``engine`` attribute and iterates its ``overlays`` attribute
                as ``(schema_name, precedence)`` pairs.
        """
        self.db = db_manager

    def check_memberships_batch(
        self, entity_dcids: list[str], group_dcid: str, as_of: date
    ) -> set[str]:
        """
        Check which entities are members of a group at a given date.

        Uses a single batch query for efficiency. Overlay precedence is
        applied before the temporal filter: for each (entity, group) pair only
        the rows from the highest-precedence source that mentions the pair are
        considered, so an overlay can revoke or re-date a membership defined
        in the base database. The entity is a member when any of those
        retained rows satisfies ``valid_from <= as_of`` and
        ``valid_until IS NULL OR valid_until > as_of`` (end date exclusive).

        Args:
            entity_dcids: List of entity DCIDs to check
            group_dcid: Group DCID to check membership in
            as_of: Date to check membership at

        Returns:
            Set of entity DCIDs that are members of the group at the given date

        Raises:
            RuntimeError: If database is not connected
        """
        if not self.db.engine:
            raise RuntimeError("Database not connected. Call connect() first.")

        if not entity_dcids:
            return set()

        # Bind each distinct DCID once; duplicates would only inflate the
        # number of bound parameters without changing the result set.
        unique_dcids = list(dict.fromkeys(entity_dcids))
        placeholders = ", ".join(f":dcid_{i}" for i in range(len(unique_dcids)))

        # One SELECT per source (main + overlays); the main database has
        # precedence 0.
        all_sources = [
            f"""
            SELECT entity_dcid, group_dcid, valid_from, valid_until, 0 AS precedence
            FROM main.memberships
            WHERE entity_dcid IN ({placeholders})
              AND group_dcid = :group_dcid
            """
        ]

        # NOTE: schema_name and precedence are interpolated into the SQL text,
        # so they must come from trusted configuration, never from user input.
        for schema_name, precedence in self.db.overlays:
            all_sources.append(f"""
            SELECT entity_dcid, group_dcid, valid_from, valid_until, {precedence} AS precedence
            FROM {schema_name}.memberships
            WHERE entity_dcid IN ({placeholders})
              AND group_dcid = :group_dcid
            """)

        # Keep the rows of the highest-precedence source per
        # (entity_dcid, group_dcid), THEN apply the temporal filter, so overlay
        # rows override base rows. RANK() rather than ROW_NUMBER(): the
        # winning source may hold several validity periods for the same pair
        # (or two sources may share a precedence); ROW_NUMBER() would keep one
        # of them arbitrarily and make the answer nondeterministic.
        query = text(f"""
            WITH best_memberships AS (
                SELECT entity_dcid, valid_from, valid_until
                FROM (
                    SELECT entity_dcid, group_dcid, valid_from, valid_until,
                           RANK() OVER (
                               PARTITION BY entity_dcid, group_dcid
                               ORDER BY precedence DESC
                           ) AS rnk
                    FROM (
                        {" UNION ALL ".join(all_sources)}
                    ) all_memberships
                ) ranked
                WHERE rnk = 1
            )
            SELECT DISTINCT entity_dcid
            FROM best_memberships
            WHERE valid_from <= :as_of
              AND (valid_until IS NULL OR valid_until > :as_of)
        """)

        # Dates are bound as ISO-8601 strings.
        params = {
            "group_dcid": group_dcid,
            "as_of": as_of.isoformat(),
        }
        for i, dcid in enumerate(unique_dcids):
            params[f"dcid_{i}"] = dcid

        with Session(self.db.engine) as session:
            result = session.execute(query, params)
            members = {row[0] for row in result.fetchall()}

        return members
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
"""Query builder for overlay precedence."""
|
|
2
|
+
|
|
3
|
+
from datetime import date
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class QueryBuilder:
|
|
8
|
+
"""
|
|
9
|
+
Builds SQL queries with overlay precedence support.
|
|
10
|
+
|
|
11
|
+
Constructs UNION queries across main + overlay databases,
|
|
12
|
+
with deduplication keeping highest precedence.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
def __init__(self, db_manager: Any):
|
|
16
|
+
"""
|
|
17
|
+
Initialize query builder.
|
|
18
|
+
|
|
19
|
+
Args:
|
|
20
|
+
db_manager: DatabaseManager instance
|
|
21
|
+
"""
|
|
22
|
+
self.db = db_manager
|
|
23
|
+
self.overlays = db_manager.overlays
|
|
24
|
+
|
|
25
|
+
def build_union_query(
|
|
26
|
+
self,
|
|
27
|
+
table: str,
|
|
28
|
+
columns: list[str],
|
|
29
|
+
where_clause: str,
|
|
30
|
+
params: dict[str, Any],
|
|
31
|
+
unique_key: str,
|
|
32
|
+
as_of: date | None = None,
|
|
33
|
+
) -> tuple[str, dict[str, Any]]:
|
|
34
|
+
"""
|
|
35
|
+
Build UNION query across all databases with deduplication.
|
|
36
|
+
|
|
37
|
+
Args:
|
|
38
|
+
table: Table name
|
|
39
|
+
columns: Column names to select
|
|
40
|
+
where_clause: WHERE clause (using named parameters)
|
|
41
|
+
params: Query parameters
|
|
42
|
+
unique_key: Column for deduplication (e.g., "dcid", "alias_uid")
|
|
43
|
+
as_of: Optional date for temporal filtering
|
|
44
|
+
|
|
45
|
+
Returns:
|
|
46
|
+
Tuple of (SQL query, parameters dict)
|
|
47
|
+
"""
|
|
48
|
+
queries = []
|
|
49
|
+
col_list = ", ".join(columns)
|
|
50
|
+
|
|
51
|
+
# Add temporal filtering to where clause if as_of is provided
|
|
52
|
+
full_where = where_clause
|
|
53
|
+
if as_of is not None:
|
|
54
|
+
full_where += " AND (valid_from IS NULL OR valid_from <= :as_of)"
|
|
55
|
+
full_where += " AND (valid_until IS NULL OR valid_until >= :as_of)"
|
|
56
|
+
params = {**params, "as_of": as_of.isoformat()}
|
|
57
|
+
|
|
58
|
+
# Main database (precedence 0)
|
|
59
|
+
queries.append(f"""
|
|
60
|
+
SELECT {col_list}, 0 AS precedence, 'main' AS source_db
|
|
61
|
+
FROM main.{table}
|
|
62
|
+
WHERE {full_where}
|
|
63
|
+
""")
|
|
64
|
+
|
|
65
|
+
# Overlay databases with their precedence
|
|
66
|
+
for schema_name, precedence in self.overlays:
|
|
67
|
+
queries.append(f"""
|
|
68
|
+
SELECT {col_list}, {precedence} AS precedence, '{schema_name}' AS source_db
|
|
69
|
+
FROM {schema_name}.{table}
|
|
70
|
+
WHERE {full_where}
|
|
71
|
+
""")
|
|
72
|
+
|
|
73
|
+
# Union all results
|
|
74
|
+
union_query = " UNION ALL ".join(queries)
|
|
75
|
+
|
|
76
|
+
# Deduplicate by unique_key keeping highest precedence using window functions
|
|
77
|
+
final_query = f"""
|
|
78
|
+
WITH all_results AS ({union_query}),
|
|
79
|
+
ranked AS (
|
|
80
|
+
SELECT *, ROW_NUMBER() OVER (PARTITION BY {unique_key} ORDER BY precedence DESC) as rn
|
|
81
|
+
FROM all_results
|
|
82
|
+
)
|
|
83
|
+
SELECT * FROM ranked WHERE rn = 1
|
|
84
|
+
ORDER BY precedence DESC
|
|
85
|
+
"""
|
|
86
|
+
|
|
87
|
+
return final_query, params
|
|
88
|
+
|
|
89
|
+
def build_code_lookup_union(
|
|
90
|
+
self,
|
|
91
|
+
code_system: str,
|
|
92
|
+
code_values: str | list[str],
|
|
93
|
+
entity_columns: list[str],
|
|
94
|
+
include_code_value: bool = False,
|
|
95
|
+
) -> tuple[str, dict[str, Any]]:
|
|
96
|
+
"""
|
|
97
|
+
Build UNION query for code lookups across all databases.
|
|
98
|
+
|
|
99
|
+
Args:
|
|
100
|
+
code_system: Code system to search
|
|
101
|
+
code_values: Single code value or list of code values
|
|
102
|
+
entity_columns: Entity column names to select
|
|
103
|
+
include_code_value: Whether to include code_value in results (for batch lookups)
|
|
104
|
+
|
|
105
|
+
Returns:
|
|
106
|
+
Tuple of (SQL query, parameters dict)
|
|
107
|
+
"""
|
|
108
|
+
# Build column list
|
|
109
|
+
entity_cols = ", ".join(f"e.{col}" for col in entity_columns)
|
|
110
|
+
|
|
111
|
+
# Build CTE columns and WHERE clause
|
|
112
|
+
is_batch = isinstance(code_values, list)
|
|
113
|
+
params: dict[str, Any] = {"code_system": code_system}
|
|
114
|
+
|
|
115
|
+
if is_batch:
|
|
116
|
+
placeholders = ", ".join(f":code_{i}" for i in range(len(code_values)))
|
|
117
|
+
params.update({f"code_{i}": code for i, code in enumerate(code_values)})
|
|
118
|
+
where_clause = (
|
|
119
|
+
f"code_system = :code_system AND code_value IN ({placeholders})"
|
|
120
|
+
)
|
|
121
|
+
cte_columns = (
|
|
122
|
+
"entity_dcid, code_value" if include_code_value else "entity_dcid"
|
|
123
|
+
)
|
|
124
|
+
else:
|
|
125
|
+
params["code_value"] = code_values
|
|
126
|
+
where_clause = "code_system = :code_system AND code_value = :code_value"
|
|
127
|
+
cte_columns = (
|
|
128
|
+
"entity_dcid, code_value" if include_code_value else "entity_dcid"
|
|
129
|
+
)
|
|
130
|
+
|
|
131
|
+
# Select clause for results
|
|
132
|
+
result_columns = (
|
|
133
|
+
f"{entity_cols}, c.code_value" if include_code_value else entity_cols
|
|
134
|
+
)
|
|
135
|
+
|
|
136
|
+
# Build query for main database
|
|
137
|
+
sql = f"""
|
|
138
|
+
WITH code_lookup AS (
|
|
139
|
+
SELECT {cte_columns}
|
|
140
|
+
FROM main.codes
|
|
141
|
+
WHERE {where_clause}
|
|
142
|
+
)
|
|
143
|
+
SELECT {result_columns}, 0 AS precedence, 'main' AS source_db
|
|
144
|
+
FROM main.entities e
|
|
145
|
+
INNER JOIN code_lookup c ON e.dcid = c.entity_dcid
|
|
146
|
+
"""
|
|
147
|
+
|
|
148
|
+
# Add overlays
|
|
149
|
+
for schema_name, precedence in self.overlays:
|
|
150
|
+
sql += f"""
|
|
151
|
+
UNION ALL
|
|
152
|
+
SELECT {result_columns}, {precedence} AS precedence, '{schema_name}' AS source_db
|
|
153
|
+
FROM {schema_name}.entities e
|
|
154
|
+
INNER JOIN (
|
|
155
|
+
SELECT {cte_columns}
|
|
156
|
+
FROM {schema_name}.codes
|
|
157
|
+
WHERE {where_clause}
|
|
158
|
+
) c ON e.dcid = c.entity_dcid
|
|
159
|
+
"""
|
|
160
|
+
|
|
161
|
+
return sql, params
|
|
162
|
+
|
|
163
|
+
def escape_fts_query(self, query: str) -> str:
|
|
164
|
+
"""
|
|
165
|
+
Escape FTS5 special characters.
|
|
166
|
+
|
|
167
|
+
Args:
|
|
168
|
+
query: Raw query string
|
|
169
|
+
|
|
170
|
+
Returns:
|
|
171
|
+
Escaped query string
|
|
172
|
+
"""
|
|
173
|
+
# Escape double quotes
|
|
174
|
+
escaped = query.replace('"', '""')
|
|
175
|
+
|
|
176
|
+
# Wrap in quotes for phrase search
|
|
177
|
+
return f'"{escaped}"'
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
"""Database schema definitions using SQLModel.
|
|
2
|
+
|
|
3
|
+
Schema is automatically generated from SQLModel definitions:
|
|
4
|
+
- Table definitions in types.py (EntityRow, AliasRow, CodeRow, MembershipRow)
|
|
5
|
+
- Constraints, indexes, and foreign keys defined in SQLModel
|
|
6
|
+
- FTS5 virtual table requires raw SQL (SQLAlchemy doesn't support FTS5)
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from sqlalchemy import Index, text
|
|
10
|
+
from sqlalchemy.engine import Engine
|
|
11
|
+
from sqlmodel import SQLModel
|
|
12
|
+
|
|
13
|
+
from resolvekit.types import AliasRow, CodeRow, EntityRow, MembershipRow
|
|
14
|
+
|
|
15
|
+
# Schema version: written to the schema_version table by create_schema()
# when that table is still empty.
SCHEMA_VERSION = 1

# ==============================================================================
# Indexes (additional indexes beyond those defined in SQLModel)
# ==============================================================================

# Note: Primary indexes are already defined in the SQLModel classes
# These are additional composite or coverage indexes; create_schema() creates
# each of them with checkfirst=True.

indexes = [
    # Single-column lookups on entities
    Index("idx_entities_type", EntityRow.__table__.c.entity_type),
    Index("idx_entities_parent", EntityRow.__table__.c.parent_dcid),
    # Composite index over the entity validity window
    Index(
        "idx_entities_valid",
        EntityRow.__table__.c.valid_from,
        EntityRow.__table__.c.valid_until,
    ),
    Index("idx_aliases_dcid", AliasRow.__table__.c.entity_dcid),
    # idx_aliases_norm already defined in SQLModel via index=True
    Index("idx_aliases_type", AliasRow.__table__.c.alias_type),
    # Composite index matching (code_system, code_value) lookups
    Index(
        "idx_codes_lookup",
        CodeRow.__table__.c.code_system,
        CodeRow.__table__.c.code_value,
    ),
    # Composite index: entity plus its membership validity window
    Index(
        "idx_memberships_entity",
        MembershipRow.__table__.c.entity_dcid,
        MembershipRow.__table__.c.valid_from,
        MembershipRow.__table__.c.valid_until,
    ),
    Index("idx_memberships_group", MembershipRow.__table__.c.group_dcid),
]

# ==============================================================================
# FTS Virtual Table (requires raw SQL)
# ==============================================================================

# FTS5 virtual tables must be created with raw SQL
# (SQLAlchemy doesn't have native FTS5 support)
#
# The table indexes alias_norm and is declared as an external-content table
# over `aliases` (content=..., content_rowid='alias_id'). The tokenizer is
# unicode61 with remove_diacritics=2 and '.'/'-' treated as token characters;
# prefix='2,3' adds prefix indexes for 2- and 3-character prefixes.
# NOTE(review): nothing in this module populates or syncs the FTS index from
# `aliases` - presumably handled elsewhere; confirm.
ALIASES_FTS_SQL = """
CREATE VIRTUAL TABLE IF NOT EXISTS aliases_fts USING fts5(
    alias_norm,
    content='aliases',
    content_rowid='alias_id',
    tokenize = "unicode61 remove_diacritics 2 tokenchars '.-'",
    prefix='2,3'
)
"""
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def create_schema(engine: Engine) -> None:
    """
    Create the full database schema: tables, extra indexes, FTS and version row.

    Tables come from the SQLModel metadata; the FTS5 virtual table and the
    ``schema_version`` bookkeeping table are created with raw SQL. The current
    ``SCHEMA_VERSION`` is recorded only when ``schema_version`` is empty.

    Args:
        engine: SQLAlchemy engine
    """
    # Tables declared through SQLModel
    SQLModel.metadata.create_all(engine)

    # Additional indexes declared at module level
    for extra_index in indexes:
        extra_index.create(engine, checkfirst=True)

    # Raw statements for everything SQLModel does not model
    fts_ddl = text(ALIASES_FTS_SQL)
    version_table_ddl = text("""
                CREATE TABLE IF NOT EXISTS schema_version (
                    version INTEGER NOT NULL,
                    updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
                )
            """)
    count_versions = text("SELECT COUNT(*) FROM schema_version")
    insert_version = text("INSERT INTO schema_version (version) VALUES (:version)")

    with engine.connect() as connection:
        connection.execute(fts_ddl)
        connection.execute(version_table_ddl)

        # Seed the version row on first creation only
        existing_rows = connection.execute(count_versions).fetchone()[0]
        if existing_rows == 0:
            connection.execute(insert_version, {"version": SCHEMA_VERSION})

        connection.commit()
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def get_schema_version(engine: Engine) -> int:
    """
    Get current schema version.

    Args:
        engine: SQLAlchemy engine

    Returns:
        Schema version number from the most recently updated row of
        ``schema_version``, or 0 when the table is missing or empty
        (i.e. the schema has not been created yet).
    """
    # Function-scope import: only this function needs the inspector.
    from sqlalchemy import inspect

    # A database that was never initialised has no schema_version table;
    # report "no version" instead of failing on the SELECT below.
    if not inspect(engine).has_table("schema_version"):
        return 0

    with engine.connect() as conn:
        result = conn.execute(
            text("SELECT version FROM schema_version ORDER BY updated_at DESC LIMIT 1")
        )
        row = result.fetchone()
        return row[0] if row else 0
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
# Disambiguation Module
|
|
2
|
+
|
|
3
|
+
## Purpose
|
|
4
|
+
|
|
5
|
+
The disambiguation module handles ambiguous queries where multiple entities are plausible matches, using context and semantic understanding to select the most likely entity.
|
|
6
|
+
|
|
7
|
+
## Components
|
|
8
|
+
|
|
9
|
+
### Core Components
|
|
10
|
+
|
|
11
|
+
1. **Ambiguity Detector** (`detector.py`)
|
|
12
|
+
- Computes score margin between top candidates
|
|
13
|
+
- Checks ambiguity registry for known ambiguous terms
|
|
14
|
+
- Determines when to invoke semantic disambiguation
|
|
15
|
+
|
|
16
|
+
2. **Ambiguity Registry** (`registry.py`)
|
|
17
|
+
- SQLite table of known ambiguous surface forms
|
|
18
|
+
- Maps ambiguous terms to likely entity types and notes
|
|
19
|
+
- Examples: "Georgia" → {country, state}, "Congo" → {COD, COG}
|
|
20
|
+
|
|
21
|
+
3. **Micro-Semantic Sidecar** (`semantic_sidecar.py`)
|
|
22
|
+
- HNSW ANN index for semantic similarity search
|
|
23
|
+
- Only stores embeddings for curated ambiguous aliases
|
|
24
|
+
- Uses PQ/4-bit quantization for memory efficiency
|
|
25
|
+
- Optional component (requires `resolvekit[semantic]`)
|
|
26
|
+
|
|
27
|
+
4. **Context Analyzer** (`context.py`)
|
|
28
|
+
- Extracts disambiguation hints from context
|
|
29
|
+
- Handles co-mentioned entities, parent regions, coordinates
|
|
30
|
+
- Computes context-candidate compatibility scores
|
|
31
|
+
|
|
32
|
+
5. **Default Heuristics** (`heuristics.py`)
|
|
33
|
+
- Non-semantic disambiguation rules
|
|
34
|
+
- Examples: prefer countries over subdivisions by default
|
|
35
|
+
- Global prominence scores, population-based ranking
|
|
36
|
+
|
|
37
|
+
### Key Files
|
|
38
|
+
|
|
39
|
+
- `ambiguity_model.py`: Data models for ambiguity metadata
|
|
40
|
+
- `encoder.py`: Optional tiny encoder (MiniLM-class, ~80MB)
|
|
41
|
+
- `sidecar_builder.py`: Tools to build ambiguity sidecar from registry
|
|
42
|
+
|
|
43
|
+
## Ambiguity Detection Logic
|
|
44
|
+
|
|
45
|
+
```python
|
|
46
|
+
# Compute margin between top 2 candidates
|
|
47
|
+
margin = score(top1) - score(top2)
|
|
48
|
+
|
|
49
|
+
# Trigger semantic if:
|
|
50
|
+
is_ambiguous = (
|
|
51
|
+
margin < threshold # Small margin (e.g., < 0.15)
|
|
52
|
+
or query in ambiguity_registry # Known ambiguous term
|
|
53
|
+
)
|
|
54
|
+
|
|
55
|
+
if is_ambiguous and semantic_available():
|
|
56
|
+
# Re-rank using semantic similarity
|
|
57
|
+
rerank_with_semantics(candidates, query_embedding)
|
|
58
|
+
else:
|
|
59
|
+
# Use lexicon-only heuristics
|
|
60
|
+
apply_default_heuristics(candidates)
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## Design Principles
|
|
64
|
+
|
|
65
|
+
1. **Selective invocation**: Only use expensive semantic search when needed
|
|
66
|
+
2. **Explainable defaults**: Clear rules for non-semantic disambiguation
|
|
67
|
+
3. **Graceful degradation**: Works without semantic components
|
|
68
|
+
4. **User override**: Allow explicit disambiguation via parameters
|
|
69
|
+
|
|
70
|
+
## Implementation Priority
|
|
71
|
+
|
|
72
|
+
**Phase E** - Ambiguity subsystem
|
|
File without changes
|
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
# Extraction Module
|
|
2
|
+
|
|
3
|
+
## Purpose
|
|
4
|
+
|
|
5
|
+
The extraction module identifies and extracts entities from unstructured text (sentences, paragraphs, documents), resolving them to canonical entities.
|
|
6
|
+
|
|
7
|
+
## Components
|
|
8
|
+
|
|
9
|
+
### Core Components
|
|
10
|
+
|
|
11
|
+
1. **Text Extractor** (`extractor.py`)
|
|
12
|
+
- Main extraction pipeline
|
|
13
|
+
- Orchestrates NER and resolution
|
|
14
|
+
- Returns entities with spans and confidence
|
|
15
|
+
|
|
16
|
+
2. **Dictionary Matcher** (`dictionary_matcher.py`)
|
|
17
|
+
- Aho-Corasick or FlashText for fast dictionary matching
|
|
18
|
+
- Matches known entity names and aliases in text
|
|
19
|
+
- Efficient for large dictionaries
|
|
20
|
+
|
|
21
|
+
3. **NER Assistant** (`ner.py`)
|
|
22
|
+
- Optional spaCy integration for named entity recognition
|
|
23
|
+
- Helps identify entity boundaries
|
|
24
|
+
- Filters false positives (common words)
|
|
25
|
+
|
|
26
|
+
4. **Context Extractor** (`context.py`)
|
|
27
|
+
- Extract surrounding context for disambiguation
|
|
28
|
+
- Co-mentioned entities, geographic hints
|
|
29
|
+
- Sentence/paragraph boundaries
|
|
30
|
+
|
|
31
|
+
5. **Span Manager** (`spans.py`)
|
|
32
|
+
- Handle overlapping and nested entities
|
|
33
|
+
- Deduplication (same entity mentioned multiple times)
|
|
34
|
+
- Link acronyms to full forms
|
|
35
|
+
|
|
36
|
+
### Filters and Validators
|
|
37
|
+
|
|
38
|
+
- `stoplist.py`: Common words that aren't entities ("March", "Reading")
|
|
39
|
+
- `pos_filter.py`: Part-of-speech filters to reduce false positives
|
|
40
|
+
- `confidence_thresholds.py`: Extraction-specific confidence tuning
|
|
41
|
+
|
|
42
|
+
## Extraction Pipeline
|
|
43
|
+
|
|
44
|
+
```python
|
|
45
|
+
from resolvekit.extraction import TextExtractor
|
|
46
|
+
|
|
47
|
+
extractor = TextExtractor(min_confidence=0.8)
|
|
48
|
+
|
|
49
|
+
text = """
|
|
50
|
+
The agreement was signed by representatives from France, Germany,
|
|
51
|
+
and Côte d'Ivoire. Regional coordination will be managed through
|
|
52
|
+
the West African Economic and Monetary Union (UEMOA) headquarters
|
|
53
|
+
in Burkina Faso.
|
|
54
|
+
"""
|
|
55
|
+
|
|
56
|
+
# Extract entities
|
|
57
|
+
entities = extractor.extract(text)
|
|
58
|
+
|
|
59
|
+
# Results
|
|
60
|
+
for entity in entities:
|
|
61
|
+
print(f"{entity.text} [{entity.span}]")
|
|
62
|
+
print(f" → {entity.dcid} ({entity.type})")
|
|
63
|
+
print(f" Confidence: {entity.confidence}")
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## Extracted Entity Structure
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
@dataclass
|
|
70
|
+
class ExtractedEntity:
|
|
71
|
+
"""Entity extracted from text."""
|
|
72
|
+
|
|
73
|
+
text: str # Original text span
|
|
74
|
+
span: tuple[int, int] # Character offsets (start, end)
|
|
75
|
+
dcid: str # Resolved entity DCID
|
|
76
|
+
canonical_name: str # Canonical entity name
|
|
77
|
+
entity_type: str # Entity type (country, org, etc.)
|
|
78
|
+
confidence: float # Resolution confidence
|
|
79
|
+
context: str | None # Surrounding context
|
|
80
|
+
method: str # Extraction method (dictionary, ner, etc.)
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
## Extraction Modes
|
|
84
|
+
|
|
85
|
+
### 1. Dictionary-First (Fast)
|
|
86
|
+
|
|
87
|
+
```python
|
|
88
|
+
# Use only dictionary matching
|
|
89
|
+
extractor = TextExtractor(mode="dictionary")
|
|
90
|
+
entities = extractor.extract(text)
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
- Fast: O(n) where n = text length
|
|
94
|
+
- High precision for known names
|
|
95
|
+
- May miss creative mentions
|
|
96
|
+
|
|
97
|
+
### 2. NER-Assisted (Accurate)
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
# Use spaCy NER + dictionary
|
|
101
|
+
extractor = TextExtractor(mode="ner")
|
|
102
|
+
entities = extractor.extract(text)
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
- Better entity boundary detection
|
|
106
|
+
- Catches variations not in dictionary
|
|
107
|
+
- Slower, requires spaCy model
|
|
108
|
+
|
|
109
|
+
### 3. Hybrid (Recommended)
|
|
110
|
+
|
|
111
|
+
```python
|
|
112
|
+
# Dictionary first, NER for gaps
|
|
113
|
+
extractor = TextExtractor(mode="hybrid")
|
|
114
|
+
entities = extractor.extract(text)
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
- Best of both: fast + accurate
|
|
118
|
+
- Dictionary for common entities
|
|
119
|
+
- NER for rare/variant mentions
|
|
120
|
+
|
|
121
|
+
## Context-Aware Disambiguation
|
|
122
|
+
|
|
123
|
+
```python
|
|
124
|
+
text = "Georgia joined the UN in 1992."
|
|
125
|
+
|
|
126
|
+
# Without context: ambiguous
|
|
127
|
+
entities = extractor.extract(text, use_context=False)
|
|
128
|
+
# → Georgia (country) with confidence 0.6
|
|
129
|
+
|
|
130
|
+
# With context: disambiguated
|
|
131
|
+
entities = extractor.extract(text, use_context=True)
|
|
132
|
+
# → Georgia (country) with confidence 0.95 (UN membership hint)
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
Context signals:
|
|
136
|
+
- Co-mentioned entities (nearby countries suggest geography)
|
|
137
|
+
- Temporal hints (dates in EU/UN membership context)
|
|
138
|
+
- Acronym expansions (UEMOA → West African Economic and Monetary Union)
|
|
139
|
+
- Document topic (academic paper vs. news article)
|
|
140
|
+
|
|
141
|
+
## Handling Special Cases
|
|
142
|
+
|
|
143
|
+
### Overlapping/Nested Entities
|
|
144
|
+
|
|
145
|
+
```text
|
|
146
|
+
"Paris, France"
|
|
147
|
+
^^^^^^ → France (country)
|
|
148
|
+
^^^^^ → Paris (city)
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
Strategy: Return both with relationship noted
|
|
152
|
+
|
|
153
|
+
### Acronyms and Full Forms
|
|
154
|
+
|
|
155
|
+
```text
|
|
156
|
+
"West African Economic and Monetary Union (UEMOA)"
|
|
157
|
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ → org/UEMOA (full form)
|
|
158
|
+
^^^^^^ → org/UEMOA (acronym)
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
Strategy: Link acronym to full form, return single entity
|
|
162
|
+
|
|
163
|
+
### False Positives
|
|
164
|
+
|
|
165
|
+
```text
|
|
166
|
+
"We will meet in March in Reading."
|
|
167
|
+
^^^^^ → Month, not country
|
|
168
|
+
^^^^^^^ → City, not event
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
Filtering:
|
|
172
|
+
- Stoplist (common words)
|
|
173
|
+
- POS tagging (proper nouns only)
|
|
174
|
+
- Capitalization checks
|
|
175
|
+
- Context validation
|
|
176
|
+
|
|
177
|
+
## Performance
|
|
178
|
+
|
|
179
|
+
### Dictionary Size vs. Speed
|
|
180
|
+
|
|
181
|
+
- 10K aliases: ~1ms per sentence
|
|
182
|
+
- 100K aliases: ~5ms per sentence
|
|
183
|
+
- 1M aliases: ~50ms per sentence
|
|
184
|
+
|
|
185
|
+
Use Aho-Corasick for efficient multi-pattern matching.
|
|
186
|
+
|
|
187
|
+
### Document Length
|
|
188
|
+
|
|
189
|
+
- Sentences (<100 words): <10ms
|
|
190
|
+
- Paragraphs (<500 words): <50ms
|
|
191
|
+
- Full documents (>5000 words): <1s
|
|
192
|
+
|
|
193
|
+
Batch processing recommended for large corpora.
|
|
194
|
+
|
|
195
|
+
## Design Principles
|
|
196
|
+
|
|
197
|
+
1. **Precision over recall**: Avoid false positives
|
|
198
|
+
2. **Context-aware**: Use surrounding text for disambiguation
|
|
199
|
+
3. **Configurable**: Adjustable confidence thresholds
|
|
200
|
+
4. **Efficient**: Handle large documents without timeouts
|
|
201
|
+
|
|
202
|
+
## Implementation Priority
|
|
203
|
+
|
|
204
|
+
**Phase F** - Entity extraction add-on
|
|
File without changes
|