resolvekit 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. resolvekit/README.md +134 -0
  2. resolvekit/__init__.py +67 -0
  3. resolvekit/api/README.md +165 -0
  4. resolvekit/api/__init__.py +10 -0
  5. resolvekit/api/convenience.py +53 -0
  6. resolvekit/api/resolver.py +457 -0
  7. resolvekit/builders/README.md +173 -0
  8. resolvekit/builders/__init__.py +0 -0
  9. resolvekit/calibration/README.md +351 -0
  10. resolvekit/calibration/__init__.py +12 -0
  11. resolvekit/calibration/calibrator.py +184 -0
  12. resolvekit/calibration/features.py +139 -0
  13. resolvekit/calibration/models.py +78 -0
  14. resolvekit/cli/README.md +215 -0
  15. resolvekit/cli/__init__.py +0 -0
  16. resolvekit/cli/main.py +18 -0
  17. resolvekit/config.py +128 -0
  18. resolvekit/constants.py +252 -0
  19. resolvekit/constraints/README.md +102 -0
  20. resolvekit/constraints/__init__.py +17 -0
  21. resolvekit/constraints/constraint_engine.py +111 -0
  22. resolvekit/constraints/hierarchy_validator.py +148 -0
  23. resolvekit/constraints/membership_validator.py +60 -0
  24. resolvekit/constraints/protocols.py +33 -0
  25. resolvekit/constraints/temporal_validator.py +43 -0
  26. resolvekit/constraints/type_validator.py +42 -0
  27. resolvekit/data/README.md +165 -0
  28. resolvekit/data/__init__.py +14 -0
  29. resolvekit/data/alias_repository.py +206 -0
  30. resolvekit/data/code_repository.py +85 -0
  31. resolvekit/data/context_filters.py +49 -0
  32. resolvekit/data/db_manager.py +196 -0
  33. resolvekit/data/entity_repository.py +466 -0
  34. resolvekit/data/membership_repository.py +107 -0
  35. resolvekit/data/query_builder.py +177 -0
  36. resolvekit/data/schema.py +122 -0
  37. resolvekit/disambiguation/README.md +72 -0
  38. resolvekit/disambiguation/__init__.py +0 -0
  39. resolvekit/extraction/README.md +204 -0
  40. resolvekit/extraction/__init__.py +0 -0
  41. resolvekit/matchers/README.md +77 -0
  42. resolvekit/matchers/__init__.py +65 -0
  43. resolvekit/matchers/alias_exact.py +65 -0
  44. resolvekit/matchers/canonical_name.py +62 -0
  45. resolvekit/matchers/cascade.py +127 -0
  46. resolvekit/matchers/code_validators.py +250 -0
  47. resolvekit/matchers/exact_code.py +177 -0
  48. resolvekit/matchers/fts_matcher.py +106 -0
  49. resolvekit/matchers/fuzzy_matcher.py +142 -0
  50. resolvekit/matchers/priorities.py +174 -0
  51. resolvekit/matchers/protocols.py +75 -0
  52. resolvekit/normalization/README.md +192 -0
  53. resolvekit/normalization/__init__.py +8 -0
  54. resolvekit/normalization/normalizer.py +164 -0
  55. resolvekit/overlays/README.md +226 -0
  56. resolvekit/overlays/__init__.py +0 -0
  57. resolvekit/types.py +534 -0
  58. resolvekit/utils/README.md +188 -0
  59. resolvekit/utils/__init__.py +48 -0
  60. resolvekit/utils/cache.py +109 -0
  61. resolvekit/utils/dates.py +339 -0
  62. resolvekit/utils/errors.py +145 -0
  63. resolvekit/utils/files.py +366 -0
  64. resolvekit/utils/logging.py +219 -0
  65. resolvekit/utils/text.py +475 -0
  66. resolvekit/utils/validation.py +301 -0
  67. resolvekit-0.0.1.dist-info/METADATA +36 -0
  68. resolvekit-0.0.1.dist-info/RECORD +70 -0
  69. resolvekit-0.0.1.dist-info/WHEEL +4 -0
  70. resolvekit-0.0.1.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,107 @@
1
+ """Repository for group membership queries."""
2
+
3
+ from datetime import date
4
+
5
+ from sqlalchemy import text
6
+ from sqlmodel import Session
7
+
8
+ from resolvekit.data.db_manager import DatabaseManager
9
+
10
+
11
class MembershipRepository:
    """Repository for querying group memberships with temporal support."""

    def __init__(self, db_manager: "DatabaseManager"):
        """
        Initialize repository.

        Args:
            db_manager: Database manager instance
        """
        self.db = db_manager

    @staticmethod
    def _build_membership_sql(placeholders: str, overlays) -> str:
        """
        Build the precedence-aware membership query for main plus overlays.

        Args:
            placeholders: Comma-separated named bind parameters for the
                entity DCID ``IN`` list (e.g. ``":dcid_0, :dcid_1"``)
            overlays: Iterable of ``(schema_name, precedence)`` pairs

        Returns:
            SQL string expecting ``:group_dcid``, ``:as_of`` and the DCID
            parameters named in ``placeholders``
        """
        # The base database always participates with precedence 0.
        sources = [("main", 0), *overlays]
        selects = [
            f"""
            SELECT entity_dcid, group_dcid, valid_from, valid_until, {precedence} AS precedence
            FROM {schema_name}.memberships
            WHERE entity_dcid IN ({placeholders})
              AND group_dcid = :group_dcid
            """
            for schema_name, precedence in sources
        ]

        # RANK (not ROW_NUMBER) keeps *every* row of the highest-precedence
        # source for an (entity, group) pair. An entity may have several
        # membership periods in that source; ROW_NUMBER would keep only one
        # arbitrary period and the temporal filter could then miss the period
        # that actually covers as_of.
        # The temporal filter runs after ranking so overlay rows fully
        # override base rows. valid_from is inclusive, valid_until exclusive.
        return f"""
            WITH best_memberships AS (
                SELECT entity_dcid, valid_from, valid_until
                FROM (
                    SELECT entity_dcid, group_dcid, valid_from, valid_until,
                           RANK() OVER (
                               PARTITION BY entity_dcid, group_dcid
                               ORDER BY precedence DESC
                           ) AS rnk
                    FROM (
                        {" UNION ALL ".join(selects)}
                    ) all_memberships
                ) ranked
                WHERE rnk = 1
            )
            SELECT DISTINCT entity_dcid
            FROM best_memberships
            WHERE valid_from <= :as_of
              AND (valid_until IS NULL OR valid_until > :as_of)
        """

    def check_memberships_batch(
        self, entity_dcids: list[str], group_dcid: str, as_of: date
    ) -> set[str]:
        """
        Check which entities are members of a group at a given date.

        Uses a single batch query for efficiency. Respects overlay databases:
        for each entity, the highest-precedence source (base or overlay) that
        has any row for the (entity, group) pair decides the outcome, and all
        membership periods recorded in that source are tested against the
        given date.

        Args:
            entity_dcids: List of entity DCIDs to check
            group_dcid: Group DCID to check membership in
            as_of: Date to check membership at

        Returns:
            Set of entity DCIDs that are members of the group at the given date

        Raises:
            RuntimeError: If database is not connected
        """
        if not self.db.engine:
            raise RuntimeError("Database not connected. Call connect() first.")

        if not entity_dcids:
            return set()

        # Duplicates cannot change the result set; dropping them keeps the
        # number of bound parameters as small as possible.
        unique_dcids = list(dict.fromkeys(entity_dcids))
        placeholders = ", ".join(f":dcid_{i}" for i in range(len(unique_dcids)))

        query = text(self._build_membership_sql(placeholders, self.db.overlays))

        params = {
            "group_dcid": group_dcid,
            "as_of": as_of.isoformat(),
        }
        params.update({f"dcid_{i}": dcid for i, dcid in enumerate(unique_dcids)})

        with Session(self.db.engine) as session:
            rows = session.execute(query, params).fetchall()

        return {row[0] for row in rows}
@@ -0,0 +1,177 @@
1
+ """Query builder for overlay precedence."""
2
+
3
+ from datetime import date
4
+ from typing import Any
5
+
6
+
7
+ class QueryBuilder:
8
+ """
9
+ Builds SQL queries with overlay precedence support.
10
+
11
+ Constructs UNION queries across main + overlay databases,
12
+ with deduplication keeping highest precedence.
13
+ """
14
+
15
+ def __init__(self, db_manager: Any):
16
+ """
17
+ Initialize query builder.
18
+
19
+ Args:
20
+ db_manager: DatabaseManager instance
21
+ """
22
+ self.db = db_manager
23
+ self.overlays = db_manager.overlays
24
+
25
+ def build_union_query(
26
+ self,
27
+ table: str,
28
+ columns: list[str],
29
+ where_clause: str,
30
+ params: dict[str, Any],
31
+ unique_key: str,
32
+ as_of: date | None = None,
33
+ ) -> tuple[str, dict[str, Any]]:
34
+ """
35
+ Build UNION query across all databases with deduplication.
36
+
37
+ Args:
38
+ table: Table name
39
+ columns: Column names to select
40
+ where_clause: WHERE clause (using named parameters)
41
+ params: Query parameters
42
+ unique_key: Column for deduplication (e.g., "dcid", "alias_uid")
43
+ as_of: Optional date for temporal filtering
44
+
45
+ Returns:
46
+ Tuple of (SQL query, parameters dict)
47
+ """
48
+ queries = []
49
+ col_list = ", ".join(columns)
50
+
51
+ # Add temporal filtering to where clause if as_of is provided
52
+ full_where = where_clause
53
+ if as_of is not None:
54
+ full_where += " AND (valid_from IS NULL OR valid_from <= :as_of)"
55
+ full_where += " AND (valid_until IS NULL OR valid_until >= :as_of)"
56
+ params = {**params, "as_of": as_of.isoformat()}
57
+
58
+ # Main database (precedence 0)
59
+ queries.append(f"""
60
+ SELECT {col_list}, 0 AS precedence, 'main' AS source_db
61
+ FROM main.{table}
62
+ WHERE {full_where}
63
+ """)
64
+
65
+ # Overlay databases with their precedence
66
+ for schema_name, precedence in self.overlays:
67
+ queries.append(f"""
68
+ SELECT {col_list}, {precedence} AS precedence, '{schema_name}' AS source_db
69
+ FROM {schema_name}.{table}
70
+ WHERE {full_where}
71
+ """)
72
+
73
+ # Union all results
74
+ union_query = " UNION ALL ".join(queries)
75
+
76
+ # Deduplicate by unique_key keeping highest precedence using window functions
77
+ final_query = f"""
78
+ WITH all_results AS ({union_query}),
79
+ ranked AS (
80
+ SELECT *, ROW_NUMBER() OVER (PARTITION BY {unique_key} ORDER BY precedence DESC) as rn
81
+ FROM all_results
82
+ )
83
+ SELECT * FROM ranked WHERE rn = 1
84
+ ORDER BY precedence DESC
85
+ """
86
+
87
+ return final_query, params
88
+
89
+ def build_code_lookup_union(
90
+ self,
91
+ code_system: str,
92
+ code_values: str | list[str],
93
+ entity_columns: list[str],
94
+ include_code_value: bool = False,
95
+ ) -> tuple[str, dict[str, Any]]:
96
+ """
97
+ Build UNION query for code lookups across all databases.
98
+
99
+ Args:
100
+ code_system: Code system to search
101
+ code_values: Single code value or list of code values
102
+ entity_columns: Entity column names to select
103
+ include_code_value: Whether to include code_value in results (for batch lookups)
104
+
105
+ Returns:
106
+ Tuple of (SQL query, parameters dict)
107
+ """
108
+ # Build column list
109
+ entity_cols = ", ".join(f"e.{col}" for col in entity_columns)
110
+
111
+ # Build CTE columns and WHERE clause
112
+ is_batch = isinstance(code_values, list)
113
+ params: dict[str, Any] = {"code_system": code_system}
114
+
115
+ if is_batch:
116
+ placeholders = ", ".join(f":code_{i}" for i in range(len(code_values)))
117
+ params.update({f"code_{i}": code for i, code in enumerate(code_values)})
118
+ where_clause = (
119
+ f"code_system = :code_system AND code_value IN ({placeholders})"
120
+ )
121
+ cte_columns = (
122
+ "entity_dcid, code_value" if include_code_value else "entity_dcid"
123
+ )
124
+ else:
125
+ params["code_value"] = code_values
126
+ where_clause = "code_system = :code_system AND code_value = :code_value"
127
+ cte_columns = (
128
+ "entity_dcid, code_value" if include_code_value else "entity_dcid"
129
+ )
130
+
131
+ # Select clause for results
132
+ result_columns = (
133
+ f"{entity_cols}, c.code_value" if include_code_value else entity_cols
134
+ )
135
+
136
+ # Build query for main database
137
+ sql = f"""
138
+ WITH code_lookup AS (
139
+ SELECT {cte_columns}
140
+ FROM main.codes
141
+ WHERE {where_clause}
142
+ )
143
+ SELECT {result_columns}, 0 AS precedence, 'main' AS source_db
144
+ FROM main.entities e
145
+ INNER JOIN code_lookup c ON e.dcid = c.entity_dcid
146
+ """
147
+
148
+ # Add overlays
149
+ for schema_name, precedence in self.overlays:
150
+ sql += f"""
151
+ UNION ALL
152
+ SELECT {result_columns}, {precedence} AS precedence, '{schema_name}' AS source_db
153
+ FROM {schema_name}.entities e
154
+ INNER JOIN (
155
+ SELECT {cte_columns}
156
+ FROM {schema_name}.codes
157
+ WHERE {where_clause}
158
+ ) c ON e.dcid = c.entity_dcid
159
+ """
160
+
161
+ return sql, params
162
+
163
+ def escape_fts_query(self, query: str) -> str:
164
+ """
165
+ Escape FTS5 special characters.
166
+
167
+ Args:
168
+ query: Raw query string
169
+
170
+ Returns:
171
+ Escaped query string
172
+ """
173
+ # Escape double quotes
174
+ escaped = query.replace('"', '""')
175
+
176
+ # Wrap in quotes for phrase search
177
+ return f'"{escaped}"'
@@ -0,0 +1,122 @@
1
+ """Database schema definitions using SQLModel.
2
+
3
+ Schema is automatically generated from SQLModel definitions:
4
+ - Table definitions in types.py (EntityRow, AliasRow, CodeRow, MembershipRow)
5
+ - Constraints, indexes, and foreign keys defined in SQLModel
6
+ - FTS5 virtual table requires raw SQL (SQLAlchemy doesn't support FTS5)
7
+ """
8
+
9
+ from sqlalchemy import Index, text
10
+ from sqlalchemy.engine import Engine
11
+ from sqlmodel import SQLModel
12
+
13
+ from resolvekit.types import AliasRow, CodeRow, EntityRow, MembershipRow
14
+
15
# Schema version written to the schema_version table when it is first populated
SCHEMA_VERSION = 1

# ==============================================================================
# Indexes (additional indexes beyond those defined in SQLModel)
# ==============================================================================

# Note: Primary indexes are already defined in the SQLModel classes
# These are additional composite or coverage indexes

indexes = [
    # Entity lookups by type and by parent
    Index("idx_entities_type", EntityRow.__table__.c.entity_type),
    Index("idx_entities_parent", EntityRow.__table__.c.parent_dcid),
    # Composite index over the entity validity window
    Index(
        "idx_entities_valid",
        EntityRow.__table__.c.valid_from,
        EntityRow.__table__.c.valid_until,
    ),
    Index("idx_aliases_dcid", AliasRow.__table__.c.entity_dcid),
    # idx_aliases_norm already defined in SQLModel via index=True
    Index("idx_aliases_type", AliasRow.__table__.c.alias_type),
    # Composite index matching (code_system, code_value) lookups
    Index(
        "idx_codes_lookup",
        CodeRow.__table__.c.code_system,
        CodeRow.__table__.c.code_value,
    ),
    # Composite index over an entity's membership rows and their validity window
    Index(
        "idx_memberships_entity",
        MembershipRow.__table__.c.entity_dcid,
        MembershipRow.__table__.c.valid_from,
        MembershipRow.__table__.c.valid_until,
    ),
    Index("idx_memberships_group", MembershipRow.__table__.c.group_dcid),
]

# ==============================================================================
# FTS Virtual Table (requires raw SQL)
# ==============================================================================

# FTS5 virtual tables must be created with raw SQL
# (SQLAlchemy doesn't have native FTS5 support)
# The table indexes alias_norm and is declared as an external-content table
# over `aliases`, keyed by `alias_id`; '.' and '-' are kept as token characters.
ALIASES_FTS_SQL = """
CREATE VIRTUAL TABLE IF NOT EXISTS aliases_fts USING fts5(
    alias_norm,
    content='aliases',
    content_rowid='alias_id',
    tokenize = "unicode61 remove_diacritics 2 tokenchars '.-'",
    prefix='2,3'
)
"""
65
+
66
+
67
def create_schema(engine: Engine) -> None:
    """
    Build the complete database schema on the given engine.

    Tables come from the SQLModel metadata. The additional indexes, the FTS5
    virtual table and the schema_version bookkeeping table are created
    afterwards, and the current SCHEMA_VERSION is recorded if the
    schema_version table is still empty.

    Args:
        engine: SQLAlchemy engine
    """
    # Tables declared through SQLModel
    SQLModel.metadata.create_all(engine)

    # Extra indexes; checkfirst skips any that already exist
    for extra_index in indexes:
        extra_index.create(engine, checkfirst=True)

    # schema_version is not a SQLModel, so it is created by hand
    version_table_ddl = text("""
        CREATE TABLE IF NOT EXISTS schema_version (
            version INTEGER NOT NULL,
            updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
        )
    """)
    count_versions = text("SELECT COUNT(*) FROM schema_version")
    insert_version = text("INSERT INTO schema_version (version) VALUES (:version)")

    with engine.connect() as connection:
        # FTS5 virtual table needs raw SQL
        connection.execute(text(ALIASES_FTS_SQL))
        connection.execute(version_table_ddl)

        # Record the version only once, on first initialization
        version_rows = connection.execute(count_versions).fetchone()[0]
        if version_rows == 0:
            connection.execute(insert_version, {"version": SCHEMA_VERSION})

        connection.commit()
105
+
106
+
107
def get_schema_version(engine: Engine) -> int:
    """
    Get current schema version.

    Args:
        engine: SQLAlchemy engine

    Returns:
        Schema version number, or 0 when no version has been recorded
        (the schema_version table is missing or empty)
    """
    # Function-scope import keeps this change self-contained.
    from sqlalchemy import inspect as sa_inspect

    # A database that was never initialized has no schema_version table;
    # report that as "no version" instead of failing on the SELECT.
    if not sa_inspect(engine).has_table("schema_version"):
        return 0

    with engine.connect() as conn:
        # updated_at defaults to CURRENT_TIMESTAMP (second resolution), so
        # rows written within the same second tie; version DESC breaks the
        # tie deterministically in favor of the newest version.
        result = conn.execute(
            text(
                "SELECT version FROM schema_version "
                "ORDER BY updated_at DESC, version DESC LIMIT 1"
            )
        )
        row = result.fetchone()
    return row[0] if row else 0
@@ -0,0 +1,72 @@
1
+ # Disambiguation Module
2
+
3
+ ## Purpose
4
+
5
+ The disambiguation module handles ambiguous queries where multiple entities are plausible matches, using context and semantic understanding to select the most likely entity.
6
+
7
+ ## Components
8
+
9
+ ### Core Components
10
+
11
+ 1. **Ambiguity Detector** (`detector.py`)
12
+ - Computes score margin between top candidates
13
+ - Checks ambiguity registry for known ambiguous terms
14
+ - Determines when to invoke semantic disambiguation
15
+
16
+ 2. **Ambiguity Registry** (`registry.py`)
17
+ - SQLite table of known ambiguous surface forms
18
+ - Maps ambiguous terms to likely entity types and notes
19
+ - Examples: "Georgia" → {country, state}, "Congo" → {COD, COG}
20
+
21
+ 3. **Micro-Semantic Sidecar** (`semantic_sidecar.py`)
22
+ - HNSW ANN index for semantic similarity search
23
+ - Only stores embeddings for curated ambiguous aliases
24
+ - Uses PQ/4-bit quantization for memory efficiency
25
+ - Optional component (requires `resolvekit[semantic]`)
26
+
27
+ 4. **Context Analyzer** (`context.py`)
28
+ - Extracts disambiguation hints from context
29
+ - Handles co-mentioned entities, parent regions, coordinates
30
+ - Computes context-candidate compatibility scores
31
+
32
+ 5. **Default Heuristics** (`heuristics.py`)
33
+ - Non-semantic disambiguation rules
34
+ - Examples: prefer countries over subdivisions by default
35
+ - Global prominence scores, population-based ranking
36
+
37
+ ### Key Files
38
+
39
+ - `ambiguity_model.py`: Data models for ambiguity metadata
40
+ - `encoder.py`: Optional tiny encoder (MiniLM-class, ~80MB)
41
+ - `sidecar_builder.py`: Tools to build ambiguity sidecar from registry
42
+
43
+ ## Ambiguity Detection Logic
44
+
45
+ ```python
46
+ # Compute margin between top 2 candidates
47
+ margin = score(top1) - score(top2)
48
+
49
+ # Trigger semantic if:
50
+ is_ambiguous = (
51
+ margin < threshold # Small margin (e.g., < 0.15)
52
+ or query in ambiguity_registry # Known ambiguous term
53
+ )
54
+
55
+ if is_ambiguous and semantic_available():
56
+ # Re-rank using semantic similarity
57
+ rerank_with_semantics(candidates, query_embedding)
58
+ else:
59
+ # Use lexicon-only heuristics
60
+ apply_default_heuristics(candidates)
61
+ ```
62
+
63
+ ## Design Principles
64
+
65
+ 1. **Selective invocation**: Only use expensive semantic search when needed
66
+ 2. **Explainable defaults**: Clear rules for non-semantic disambiguation
67
+ 3. **Graceful degradation**: Works without semantic components
68
+ 4. **User override**: Allow explicit disambiguation via parameters
69
+
70
+ ## Implementation Priority
71
+
72
+ **Phase E** - Ambiguity subsystem
File without changes
@@ -0,0 +1,204 @@
1
+ # Extraction Module
2
+
3
+ ## Purpose
4
+
5
+ The extraction module identifies and extracts entities from unstructured text (sentences, paragraphs, documents), resolving them to canonical entities.
6
+
7
+ ## Components
8
+
9
+ ### Core Components
10
+
11
+ 1. **Text Extractor** (`extractor.py`)
12
+ - Main extraction pipeline
13
+ - Orchestrates NER and resolution
14
+ - Returns entities with spans and confidence
15
+
16
+ 2. **Dictionary Matcher** (`dictionary_matcher.py`)
17
+ - Aho-Corasick or FlashText for fast dictionary matching
18
+ - Matches known entity names and aliases in text
19
+ - Efficient for large dictionaries
20
+
21
+ 3. **NER Assistant** (`ner.py`)
22
+ - Optional spaCy integration for named entity recognition
23
+ - Helps identify entity boundaries
24
+ - Filters false positives (common words)
25
+
26
+ 4. **Context Extractor** (`context.py`)
27
+ - Extract surrounding context for disambiguation
28
+ - Co-mentioned entities, geographic hints
29
+ - Sentence/paragraph boundaries
30
+
31
+ 5. **Span Manager** (`spans.py`)
32
+ - Handle overlapping and nested entities
33
+ - Deduplication (same entity mentioned multiple times)
34
+ - Link acronyms to full forms
35
+
36
+ ### Filters and Validators
37
+
38
+ - `stoplist.py`: Common words that aren't entities ("March", "Reading")
39
+ - `pos_filter.py`: Part-of-speech filters to reduce false positives
40
+ - `confidence_thresholds.py`: Extraction-specific confidence tuning
41
+
42
+ ## Extraction Pipeline
43
+
44
+ ```python
45
+ from resolvekit.extraction import TextExtractor
46
+
47
+ extractor = TextExtractor(min_confidence=0.8)
48
+
49
+ text = """
50
+ The agreement was signed by representatives from France, Germany,
51
+ and Côte d'Ivoire. Regional coordination will be managed through
52
+ the West African Economic and Monetary Union (UEMOA) headquarters
53
+ in Burkina Faso.
54
+ """
55
+
56
+ # Extract entities
57
+ entities = extractor.extract(text)
58
+
59
+ # Results
60
+ for entity in entities:
61
+ print(f"{entity.text} [{entity.span}]")
62
+ print(f" → {entity.dcid} ({entity.entity_type})")
63
+ print(f" Confidence: {entity.confidence}")
64
+ ```
65
+
66
+ ## Extracted Entity Structure
67
+
68
+ ```python
69
+ @dataclass
70
+ class ExtractedEntity:
71
+ """Entity extracted from text."""
72
+
73
+ text: str # Original text span
74
+ span: tuple[int, int] # Character offsets (start, end)
75
+ dcid: str # Resolved entity DCID
76
+ canonical_name: str # Canonical entity name
77
+ entity_type: str # Entity type (country, org, etc.)
78
+ confidence: float # Resolution confidence
79
+ context: str | None # Surrounding context
80
+ method: str # Extraction method (dictionary, ner, etc.)
81
+ ```
82
+
83
+ ## Extraction Modes
84
+
85
+ ### 1. Dictionary-First (Fast)
86
+
87
+ ```python
88
+ # Use only dictionary matching
89
+ extractor = TextExtractor(mode="dictionary")
90
+ entities = extractor.extract(text)
91
+ ```
92
+
93
+ - Fast: O(n) where n = text length
94
+ - High precision for known names
95
+ - May miss creative mentions
96
+
97
+ ### 2. NER-Assisted (Accurate)
98
+
99
+ ```python
100
+ # Use spaCy NER + dictionary
101
+ extractor = TextExtractor(mode="ner")
102
+ entities = extractor.extract(text)
103
+ ```
104
+
105
+ - Better entity boundary detection
106
+ - Catches variations not in dictionary
107
+ - Slower, requires spaCy model
108
+
109
+ ### 3. Hybrid (Recommended)
110
+
111
+ ```python
112
+ # Dictionary first, NER for gaps
113
+ extractor = TextExtractor(mode="hybrid")
114
+ entities = extractor.extract(text)
115
+ ```
116
+
117
+ - Best of both: fast + accurate
118
+ - Dictionary for common entities
119
+ - NER for rare/variant mentions
120
+
121
+ ## Context-Aware Disambiguation
122
+
123
+ ```python
124
+ text = "Georgia joined the UN in 1992."
125
+
126
+ # Without context: ambiguous
127
+ entities = extractor.extract(text, use_context=False)
128
+ # → Georgia (country) with confidence 0.6
129
+
130
+ # With context: disambiguated
131
+ entities = extractor.extract(text, use_context=True)
132
+ # → Georgia (country) with confidence 0.95 (UN membership hint)
133
+ ```
134
+
135
+ Context signals:
136
+ - Co-mentioned entities (nearby countries suggest geography)
137
+ - Temporal hints (dates in EU/UN membership context)
138
+ - Acronym expansions (UEMOA → West African Economic and Monetary Union)
139
+ - Document topic (academic paper vs. news article)
140
+
141
+ ## Handling Special Cases
142
+
143
+ ### Overlapping/Nested Entities
144
+
145
+ ```text
146
+ "Paris, France"
147
+ ^^^^^^ → France (country)
148
+ ^^^^^ → Paris (city)
149
+ ```
150
+
151
+ Strategy: Return both with relationship noted
152
+
153
+ ### Acronyms and Full Forms
154
+
155
+ ```text
156
+ "West African Economic and Monetary Union (UEMOA)"
157
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ → org/UEMOA (full form)
158
+ ^^^^^^ → org/UEMOA (acronym)
159
+ ```
160
+
161
+ Strategy: Link acronym to full form, return single entity
162
+
163
+ ### False Positives
164
+
165
+ ```text
166
+ "We will meet in March in Reading."
167
+ ^^^^^ → Month, not a place name
168
+ ^^^^^^^ → City, not the common word "reading"
169
+ ```
170
+
171
+ Filtering:
172
+ - Stoplist (common words)
173
+ - POS tagging (proper nouns only)
174
+ - Capitalization checks
175
+ - Context validation
176
+
177
+ ## Performance
178
+
179
+ ### Dictionary Size vs. Speed
180
+
181
+ - 10K aliases: ~1ms per sentence
182
+ - 100K aliases: ~5ms per sentence
183
+ - 1M aliases: ~50ms per sentence
184
+
185
+ Use Aho-Corasick for efficient multi-pattern matching.
186
+
187
+ ### Document Length
188
+
189
+ - Sentences (<100 words): <10ms
190
+ - Paragraphs (<500 words): <50ms
191
+ - Full documents (>5000 words): <1s
192
+
193
+ Batch processing recommended for large corpora.
194
+
195
+ ## Design Principles
196
+
197
+ 1. **Precision over recall**: Avoid false positives
198
+ 2. **Context-aware**: Use surrounding text for disambiguation
199
+ 3. **Configurable**: Adjustable confidence thresholds
200
+ 4. **Efficient**: Handle large documents without timeouts
201
+
202
+ ## Implementation Priority
203
+
204
+ **Phase F** - Entity extraction add-on
File without changes