metaspn-entities 0.1.0 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
+++ b/LICENSE
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2026 MetaSPN Contributors
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+++ b/PKG-INFO
@@ -0,0 +1,57 @@
+ Metadata-Version: 2.4
+ Name: metaspn-entities
+ Version: 0.1.0
+ Summary: Canonical entity resolution, aliasing, and merges for MetaSPN systems
+ Author: MetaSPN Contributors
+ License-Expression: MIT
+ Keywords: entity-resolution,identity,aliasing,dedupe,sqlite
+ Classifier: Development Status :: 3 - Alpha
+ Classifier: Intended Audience :: Developers
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3 :: Only
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Topic :: Software Development :: Libraries
+ Classifier: Topic :: Database
+ Requires-Python: >=3.10
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: metaspn-schemas
+ Provides-Extra: dev
+ Requires-Dist: build>=1.2.0; extra == "dev"
+ Requires-Dist: twine>=5.0.0; extra == "dev"
+ Dynamic: license-file
+
+ # metaspn-entities
+
+ Identity layer for MetaSPN systems.
+
+ ## Features
+
+ - Canonical entity IDs
+ - Deterministic identifier normalization + alias resolution
+ - Merge history and reversible soft undo
+ - SQLite backend using stdlib `sqlite3`
+ - Event emission payloads for `EntityResolved`, `EntityMerged`, `EntityAliasAdded`
+ - Optional filesystem snapshot export
+
+ ## Quick usage
+
+ ```python
+ from metaspn_entities import EntityResolver
+
+ resolver = EntityResolver()
+ resolution = resolver.resolve("twitter_handle", "@some_handle")
+ events = resolver.drain_events()
+ print(resolution.entity_id, resolution.confidence)
+ ```
+
+ ## API notes
+
+ - `resolve(identifier_type, value, context=None) -> EntityResolution`
+ - `add_alias(entity_id, identifier_type, value, ...)`
+ - `merge_entities(from_entity_id, to_entity_id, reason, ...)`
+ - `undo_merge(from_entity_id, to_entity_id, ...)` (implemented as reverse merge with redirect correction)
+ - `drain_events() -> list[EmittedEvent]`
+ - `export_snapshot(output_path)` to inspect SQLite state as JSON
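The event payloads named in the features list are plain dictionaries carried on `EmittedEvent`. A minimal sketch of what draining looks like after a single resolution (the handle is illustrative; entity ids are generated at runtime):

```python
from metaspn_entities import EntityResolver

resolver = EntityResolver()
resolver.resolve("github_handle", "@SomeUser")

# The buffer is cleared once read. A brand-new entity produces two events:
# EntityAliasAdded (carrying the normalized value), then EntityResolved.
for event in resolver.drain_events():
    print(event.event_type, event.payload)
```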
+++ b/README.md
@@ -0,0 +1,32 @@
+ # metaspn-entities
+
+ Identity layer for MetaSPN systems.
+
+ ## Features
+
+ - Canonical entity IDs
+ - Deterministic identifier normalization + alias resolution
+ - Merge history and reversible soft undo
+ - SQLite backend using stdlib `sqlite3`
+ - Event emission payloads for `EntityResolved`, `EntityMerged`, `EntityAliasAdded`
+ - Optional filesystem snapshot export
+
+ ## Quick usage
+
+ ```python
+ from metaspn_entities import EntityResolver
+
+ resolver = EntityResolver()
+ resolution = resolver.resolve("twitter_handle", "@some_handle")
+ events = resolver.drain_events()
+ print(resolution.entity_id, resolution.confidence)
+ ```
+
+ ## API notes
+
+ - `resolve(identifier_type, value, context=None) -> EntityResolution`
+ - `add_alias(entity_id, identifier_type, value, ...)`
+ - `merge_entities(from_entity_id, to_entity_id, reason, ...)`
+ - `undo_merge(from_entity_id, to_entity_id, ...)` (implemented as reverse merge with redirect correction)
+ - `drain_events() -> list[EmittedEvent]`
+ - `export_snapshot(output_path)` to inspect SQLite state as JSON
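The merge and undo entries in the API notes compose as follows; a minimal sketch against the public API (handles and reason strings are illustrative):

```python
from metaspn_entities import EntityResolver

resolver = EntityResolver()
a = resolver.resolve("twitter_handle", "person_a")
b = resolver.resolve("twitter_handle", "person_b")

# Merge a into b: person_a now resolves to b's canonical entity.
resolver.merge_entities(a.entity_id, b.entity_id, reason="manual dedupe")
assert resolver.resolve("twitter_handle", "person_a").entity_id == b.entity_id

# Undo is a reverse merge: both handles end up back on a's entity.
resolver.undo_merge(a.entity_id, b.entity_id)
assert resolver.resolve("twitter_handle", "person_b").entity_id == a.entity_id
```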
+++ b/metaspn_entities/__init__.py
@@ -0,0 +1,11 @@
+ from .events import EmittedEvent
+ from .models import EntityResolution
+ from .resolver import EntityResolver
+ from .sqlite_backend import SQLiteEntityStore
+
+ __all__ = [
+     "EntityResolver",
+     "EntityResolution",
+     "EmittedEvent",
+     "SQLiteEntityStore",
+ ]
+++ b/metaspn_entities/events.py
@@ -0,0 +1,49 @@
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ from typing import Any, Dict
+
+
+ @dataclass(frozen=True)
+ class EmittedEvent:
+     event_type: str
+     payload: Dict[str, Any]
+
+
+ class EventFactory:
+     @staticmethod
+     def entity_resolved(entity_id: str, identifier_type: str, value: str, confidence: float, created_new_entity: bool) -> EmittedEvent:
+         return EmittedEvent(
+             event_type="EntityResolved",
+             payload={
+                 "entity_id": entity_id,
+                 "identifier_type": identifier_type,
+                 "value": value,
+                 "confidence": confidence,
+                 "created_new_entity": created_new_entity,
+             },
+         )
+
+     @staticmethod
+     def entity_merged(from_entity_id: str, to_entity_id: str, reason: str, caused_by: str) -> EmittedEvent:
+         return EmittedEvent(
+             event_type="EntityMerged",
+             payload={
+                 "from_entity_id": from_entity_id,
+                 "to_entity_id": to_entity_id,
+                 "reason": reason,
+                 "caused_by": caused_by,
+             },
+         )
+
+     @staticmethod
+     def entity_alias_added(entity_id: str, identifier_type: str, normalized_value: str, caused_by: str) -> EmittedEvent:
+         return EmittedEvent(
+             event_type="EntityAliasAdded",
+             payload={
+                 "entity_id": entity_id,
+                 "identifier_type": identifier_type,
+                 "normalized_value": normalized_value,
+                 "caused_by": caused_by,
+             },
+         )
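`EventFactory` is not re-exported from the package root, but it can be imported from the submodule to build payloads directly, e.g. when testing a downstream consumer; `ent_a`/`ent_b` are made-up ids:

```python
import json

from metaspn_entities.events import EventFactory

event = EventFactory.entity_merged("ent_a", "ent_b", reason="dedupe", caused_by="reviewer")
print(event.event_type)           # EntityMerged
print(json.dumps(event.payload))  # payload is a plain dict, so it serializes directly

# EmittedEvent is a frozen dataclass: reassigning event.payload raises
# dataclasses.FrozenInstanceError, so drained events are safe to fan out.
```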
+++ b/metaspn_entities/models.py
@@ -0,0 +1,73 @@
+ from __future__ import annotations
+
+ from dataclasses import dataclass, field
+ from datetime import datetime, timezone
+ from typing import Any, Dict, List, Optional
+
+
+ DEFAULT_MATCH_CONFIDENCE = 0.95
+ DEFAULT_NEW_ENTITY_CONFIDENCE = 0.6
+
+
+ class EntityStatus:
+     ACTIVE = "active"
+     MERGED = "merged"
+
+
+ class EntityType:
+     PERSON = "person"
+     ORG = "org"
+     PROJECT = "project"
+
+
+ @dataclass(frozen=True)
+ class Entity:
+     entity_id: str
+     entity_type: str
+     created_at: str
+     status: str
+
+
+ @dataclass(frozen=True)
+ class Identifier:
+     identifier_type: str
+     value: str
+     normalized_value: str
+     confidence: float
+     first_seen_at: str
+     last_seen_at: str
+     provenance: Optional[str] = None
+
+
+ @dataclass(frozen=True)
+ class Alias:
+     identifier_type: str
+     normalized_value: str
+     entity_id: str
+     confidence: float
+     created_at: str
+     caused_by: str
+     provenance: Optional[str] = None
+
+
+ @dataclass(frozen=True)
+ class MergeRecord:
+     merge_id: int
+     from_entity_id: str
+     to_entity_id: str
+     reason: str
+     timestamp: str
+     caused_by: str
+
+
+ @dataclass(frozen=True)
+ class EntityResolution:
+     entity_id: str
+     confidence: float
+     created_new_entity: bool
+     matched_identifiers: List[Dict[str, Any]] = field(default_factory=list)
+
+
+
+ def utcnow_iso() -> str:
+     return datetime.now(timezone.utc).replace(microsecond=0).isoformat()
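Every timestamp written by the store comes from `utcnow_iso`; a quick sketch of the defaults (`ent_example` is a made-up id, and the printed timestamp depends on the clock):

```python
from metaspn_entities.models import (
    DEFAULT_MATCH_CONFIDENCE,
    EntityResolution,
    utcnow_iso,
)

# Second-resolution, timezone-aware ISO 8601, e.g. "2026-02-14T09:30:00+00:00".
print(utcnow_iso())

# EntityResolution is frozen; matched_identifiers defaults to a fresh list.
res = EntityResolution(entity_id="ent_example", confidence=DEFAULT_MATCH_CONFIDENCE, created_new_entity=False)
print(res.matched_identifiers)  # []
```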
+++ b/metaspn_entities/normalize.py
@@ -0,0 +1,36 @@
+ from __future__ import annotations
+
+ from urllib.parse import urlparse
+
+
+ def normalize_identifier(identifier_type: str, value: str) -> str:
+     identifier_type = identifier_type.strip().lower()
+     value = value.strip()
+
+     if identifier_type in {"twitter_handle", "github_handle", "handle"}:
+         return value.lstrip("@").lower()
+
+     if identifier_type == "email":
+         return value.lower()
+
+     if identifier_type == "domain":
+         cleaned = value.lower()
+         if cleaned.startswith("http://") or cleaned.startswith("https://"):
+             cleaned = urlparse(cleaned).netloc or cleaned
+         return cleaned.removeprefix("www.")
+
+     if identifier_type in {"linkedin_url", "url", "canonical_url"}:
+         parsed = urlparse(value)
+         if parsed.scheme:
+             host = parsed.netloc.lower().removeprefix("www.")
+             path = parsed.path.rstrip("/")
+             return f"{host}{path}".lower()
+         return value.lower().rstrip("/")
+
+     if identifier_type == "name":
+         return " ".join(value.lower().split())
+
+     return value.lower()
+
+
+ AUTO_MERGE_IDENTIFIER_TYPES = {"email", "canonical_url", "url"}
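A few concrete input/output pairs for `normalize_identifier`, one per branch (the values are illustrative):

```python
from metaspn_entities.normalize import normalize_identifier

# Handles: strip the leading "@" and lowercase.
assert normalize_identifier("twitter_handle", "@Some_Handle") == "some_handle"

# Domains: drop the scheme and a leading "www." prefix.
assert normalize_identifier("domain", "https://www.Example.com") == "example.com"

# URLs: host + path, lowercased, trailing slash removed.
assert normalize_identifier("url", "https://Example.com/Profile/") == "example.com/profile"

# Names: collapse internal whitespace.
assert normalize_identifier("name", "  Ada   Lovelace ") == "ada lovelace"
```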
+++ b/metaspn_entities/resolver.py
@@ -0,0 +1,149 @@
+ from __future__ import annotations
+
+ from typing import Any, Dict, List, Optional
+
+ from .events import EmittedEvent, EventFactory
+ from .models import (
+     DEFAULT_MATCH_CONFIDENCE,
+     DEFAULT_NEW_ENTITY_CONFIDENCE,
+     EntityResolution,
+     EntityStatus,
+     EntityType,
+ )
+ from .normalize import AUTO_MERGE_IDENTIFIER_TYPES, normalize_identifier
+ from .sqlite_backend import SQLiteEntityStore
+
+
+ class EntityResolver:
+     def __init__(self, store: Optional[SQLiteEntityStore] = None) -> None:
+         self.store = store or SQLiteEntityStore()
+         self._event_buffer: List[EmittedEvent] = []
+
+     def resolve(self, identifier_type: str, value: str, context: Optional[Dict[str, Any]] = None) -> EntityResolution:
+         context = context or {}
+         confidence = float(context.get("confidence", DEFAULT_MATCH_CONFIDENCE))
+         provenance = context.get("provenance")
+         entity_type = context.get("entity_type", EntityType.PERSON)
+         caused_by = context.get("caused_by", "resolver")
+
+         normalized = normalize_identifier(identifier_type, value)
+         self.store.upsert_identifier(identifier_type, value, normalized, confidence, provenance)
+
+         existing_alias = self.store.find_alias(identifier_type, normalized)
+         if existing_alias:
+             entity_id = self.store.canonical_entity_id(existing_alias["entity_id"])
+             matched_identifiers = list(self.store.iter_identifiers_for_entity(entity_id))
+             resolution = EntityResolution(
+                 entity_id=entity_id,
+                 confidence=max(float(existing_alias["confidence"]), confidence),
+                 created_new_entity=False,
+                 matched_identifiers=matched_identifiers,
+             )
+             self._event_buffer.append(
+                 EventFactory.entity_resolved(entity_id, identifier_type, value, resolution.confidence, False)
+             )
+             return resolution
+
+         entity_id = self.store.create_entity(entity_type)
+         added, conflicting_entity_id = self.store.add_alias(
+             identifier_type=identifier_type,
+             normalized_value=normalized,
+             entity_id=entity_id,
+             confidence=confidence,
+             caused_by=caused_by,
+             provenance=provenance,
+         )
+
+         if conflicting_entity_id and identifier_type in AUTO_MERGE_IDENTIFIER_TYPES:
+             merge_reason = f"auto-merge on {identifier_type}:{normalized}"
+             self.store.merge_entities(entity_id, conflicting_entity_id, merge_reason, "auto-merge")
+             self._event_buffer.append(
+                 EventFactory.entity_merged(entity_id, conflicting_entity_id, merge_reason, "auto-merge")
+             )
+             entity_id = self.store.canonical_entity_id(conflicting_entity_id)
+
+         matched_identifiers = list(self.store.iter_identifiers_for_entity(entity_id))
+         resolution = EntityResolution(
+             entity_id=entity_id,
+             confidence=confidence if added else DEFAULT_NEW_ENTITY_CONFIDENCE,
+             created_new_entity=True,
+             matched_identifiers=matched_identifiers,
+         )
+         if added:
+             self._event_buffer.append(EventFactory.entity_alias_added(entity_id, identifier_type, normalized, caused_by))
+         self._event_buffer.append(
+             EventFactory.entity_resolved(entity_id, identifier_type, value, resolution.confidence, True)
+         )
+         return resolution
+
+     def add_alias(
+         self,
+         entity_id: str,
+         identifier_type: str,
+         value: str,
+         confidence: float = DEFAULT_MATCH_CONFIDENCE,
+         caused_by: str = "manual",
+         provenance: Optional[str] = None,
+     ) -> List[EmittedEvent]:
+         self.store.ensure_entity(entity_id)
+         canonical_entity_id = self.store.canonical_entity_id(entity_id)
+         normalized = normalize_identifier(identifier_type, value)
+         self.store.upsert_identifier(identifier_type, value, normalized, confidence, provenance)
+
+         added, conflicting_entity_id = self.store.add_alias(
+             identifier_type=identifier_type,
+             normalized_value=normalized,
+             entity_id=canonical_entity_id,
+             confidence=confidence,
+             caused_by=caused_by,
+             provenance=provenance,
+         )
+         if conflicting_entity_id and conflicting_entity_id != canonical_entity_id:
+             if identifier_type in AUTO_MERGE_IDENTIFIER_TYPES:
+                 reason = f"auto-merge on {identifier_type}:{normalized}"
+                 self.store.merge_entities(canonical_entity_id, conflicting_entity_id, reason, "auto-merge")
+                 event = EventFactory.entity_merged(canonical_entity_id, conflicting_entity_id, reason, "auto-merge")
+                 self._event_buffer.append(event)
+                 return [event]
+             raise ValueError(
+                 f"Alias already mapped to another entity: {identifier_type}:{normalized} -> {conflicting_entity_id}"
+             )
+
+         if not added:
+             return []
+
+         event = EventFactory.entity_alias_added(canonical_entity_id, identifier_type, normalized, caused_by)
+         self._event_buffer.append(event)
+         return [event]
+
+     def merge_entities(self, from_entity_id: str, to_entity_id: str, reason: str, caused_by: str = "manual") -> EmittedEvent:
+         self.store.ensure_entity(from_entity_id)
+         self.store.ensure_entity(to_entity_id)
+         self.store.merge_entities(from_entity_id, to_entity_id, reason, caused_by)
+         event = EventFactory.entity_merged(from_entity_id, to_entity_id, reason, caused_by)
+         self._event_buffer.append(event)
+         return event
+
+     def undo_merge(self, from_entity_id: str, to_entity_id: str, caused_by: str = "manual") -> EmittedEvent:
+         reason = f"undo merge {from_entity_id}->{to_entity_id}"
+         if self.store.get_redirect_target(from_entity_id) == to_entity_id:
+             self.store.remove_redirect(from_entity_id)
+         self.store.set_entity_status(from_entity_id, EntityStatus.ACTIVE)
+         self.store.merge_entities(to_entity_id, from_entity_id, reason, caused_by)
+         event = EventFactory.entity_merged(to_entity_id, from_entity_id, reason, caused_by)
+         self._event_buffer.append(event)
+         return event
+
+     def merge_history(self) -> List[Dict[str, Any]]:
+         return self.store.list_merge_history()
+
+     def aliases_for_entity(self, entity_id: str) -> List[Dict[str, Any]]:
+         return self.store.list_aliases_for_entity(entity_id)
+
+     def export_snapshot(self, output_path: str) -> None:
+         self.store.export_snapshot(output_path)
+
+     def drain_events(self) -> List[EmittedEvent]:
+         events = list(self._event_buffer)
+         self._event_buffer.clear()
+         return events
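The auto-merge branch is easiest to see through `add_alias`; this sketch mirrors the shared-email behavior exercised in the package's tests (handles and the email address are illustrative):

```python
from metaspn_entities import EntityResolver

resolver = EntityResolver()
a = resolver.resolve("twitter_handle", "owner_a")
b = resolver.resolve("twitter_handle", "owner_b")

# "email" is in AUTO_MERGE_IDENTIFIER_TYPES, so attaching the same address
# to both entities merges them instead of raising ValueError.
resolver.add_alias(a.entity_id, "email", "shared@example.com")
events = resolver.add_alias(b.entity_id, "email", "shared@example.com")
assert events[0].event_type == "EntityMerged"

# Both handles now resolve to one canonical entity.
one = resolver.resolve("twitter_handle", "owner_a")
two = resolver.resolve("twitter_handle", "owner_b")
assert one.entity_id == two.entity_id
```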
+++ b/metaspn_entities/sqlite_backend.py
@@ -0,0 +1,278 @@
+ from __future__ import annotations
+
+ import json
+ import sqlite3
+ import uuid
+ from pathlib import Path
+ from typing import Any, Dict, Iterable, List, Optional, Tuple
+
+ from .models import EntityStatus, utcnow_iso
+
+
+ SCHEMA_SQL = """
+ CREATE TABLE IF NOT EXISTS entities (
+     entity_id TEXT PRIMARY KEY,
+     entity_type TEXT NOT NULL,
+     created_at TEXT NOT NULL,
+     status TEXT NOT NULL
+ );
+
+ CREATE TABLE IF NOT EXISTS identifiers (
+     identifier_type TEXT NOT NULL,
+     value TEXT NOT NULL,
+     normalized_value TEXT NOT NULL,
+     confidence REAL NOT NULL,
+     first_seen_at TEXT NOT NULL,
+     last_seen_at TEXT NOT NULL,
+     provenance TEXT,
+     UNIQUE(identifier_type, normalized_value)
+ );
+
+ CREATE TABLE IF NOT EXISTS aliases (
+     identifier_type TEXT NOT NULL,
+     normalized_value TEXT NOT NULL,
+     entity_id TEXT NOT NULL,
+     confidence REAL NOT NULL,
+     created_at TEXT NOT NULL,
+     caused_by TEXT NOT NULL,
+     provenance TEXT,
+     UNIQUE(identifier_type, normalized_value)
+ );
+
+ CREATE TABLE IF NOT EXISTS merge_records (
+     merge_id INTEGER PRIMARY KEY AUTOINCREMENT,
+     from_entity_id TEXT NOT NULL,
+     to_entity_id TEXT NOT NULL,
+     reason TEXT NOT NULL,
+     timestamp TEXT NOT NULL,
+     caused_by TEXT NOT NULL
+ );
+
+ CREATE TABLE IF NOT EXISTS entity_redirects (
+     from_entity_id TEXT PRIMARY KEY,
+     to_entity_id TEXT NOT NULL,
+     timestamp TEXT NOT NULL,
+     reason TEXT NOT NULL,
+     caused_by TEXT NOT NULL
+ );
+ """
+
+
+ class SQLiteEntityStore:
+     def __init__(self, db_path: str = ":memory:") -> None:
+         self.db_path = db_path
+         self.conn = sqlite3.connect(db_path)
+         self.conn.row_factory = sqlite3.Row
+         self.conn.executescript(SCHEMA_SQL)
+         self.conn.commit()
+
+     def close(self) -> None:
+         self.conn.close()
+
+     def create_entity(self, entity_type: str) -> str:
+         entity_id = f"ent_{uuid.uuid4().hex}"
+         now = utcnow_iso()
+         self.conn.execute(
+             "INSERT INTO entities(entity_id, entity_type, created_at, status) VALUES (?, ?, ?, ?)",
+             (entity_id, entity_type, now, EntityStatus.ACTIVE),
+         )
+         self.conn.commit()
+         return entity_id
+
+     def get_entity(self, entity_id: str) -> Optional[sqlite3.Row]:
+         row = self.conn.execute("SELECT * FROM entities WHERE entity_id = ?", (entity_id,)).fetchone()
+         return row
+
+     def canonical_entity_id(self, entity_id: str) -> str:
+         current = entity_id
+         visited = set()
+         while True:
+             if current in visited:
+                 raise ValueError(f"Cycle detected in merge redirects for {entity_id}")
+             visited.add(current)
+             row = self.conn.execute(
+                 "SELECT to_entity_id FROM entity_redirects WHERE from_entity_id = ?", (current,)
+             ).fetchone()
+             if not row:
+                 return current
+             current = row["to_entity_id"]
+
+     def find_alias(self, identifier_type: str, normalized_value: str) -> Optional[sqlite3.Row]:
+         return self.conn.execute(
+             "SELECT * FROM aliases WHERE identifier_type = ? AND normalized_value = ?",
+             (identifier_type, normalized_value),
+         ).fetchone()
+
+     def upsert_identifier(
+         self,
+         identifier_type: str,
+         value: str,
+         normalized_value: str,
+         confidence: float,
+         provenance: Optional[str],
+     ) -> None:
+         now = utcnow_iso()
+         existing = self.conn.execute(
+             "SELECT * FROM identifiers WHERE identifier_type = ? AND normalized_value = ?",
+             (identifier_type, normalized_value),
+         ).fetchone()
+         if existing:
+             self.conn.execute(
+                 "UPDATE identifiers SET value = ?, confidence = ?, last_seen_at = ?, provenance = ? WHERE identifier_type = ? AND normalized_value = ?",
+                 (
+                     value,
+                     max(confidence, existing["confidence"]),
+                     now,
+                     provenance or existing["provenance"],
+                     identifier_type,
+                     normalized_value,
+                 ),
+             )
+         else:
+             self.conn.execute(
+                 "INSERT INTO identifiers(identifier_type, value, normalized_value, confidence, first_seen_at, last_seen_at, provenance) VALUES (?, ?, ?, ?, ?, ?, ?)",
+                 (identifier_type, value, normalized_value, confidence, now, now, provenance),
+             )
+         self.conn.commit()
+
+     def add_alias(
+         self,
+         identifier_type: str,
+         normalized_value: str,
+         entity_id: str,
+         confidence: float,
+         caused_by: str,
+         provenance: Optional[str] = None,
+     ) -> Tuple[bool, Optional[str]]:
+         now = utcnow_iso()
+         existing = self.find_alias(identifier_type, normalized_value)
+         canonical_target = self.canonical_entity_id(entity_id)
+
+         if existing:
+             existing_entity = self.canonical_entity_id(existing["entity_id"])
+             if existing_entity == canonical_target:
+                 self.conn.execute(
+                     "UPDATE aliases SET confidence = ?, provenance = ? WHERE identifier_type = ? AND normalized_value = ?",
+                     (
+                         max(confidence, existing["confidence"]),
+                         provenance or existing["provenance"],
+                         identifier_type,
+                         normalized_value,
+                     ),
+                 )
+                 self.conn.commit()
+                 return False, None
+             return False, existing_entity
+
+         self.conn.execute(
+             "INSERT INTO aliases(identifier_type, normalized_value, entity_id, confidence, created_at, caused_by, provenance) VALUES (?, ?, ?, ?, ?, ?, ?)",
+             (identifier_type, normalized_value, canonical_target, confidence, now, caused_by, provenance),
+         )
+         self.conn.commit()
+         return True, None
+
+     def reassign_aliases(self, from_entity_id: str, to_entity_id: str) -> None:
+         self.conn.execute(
+             "UPDATE aliases SET entity_id = ? WHERE entity_id = ?",
+             (to_entity_id, from_entity_id),
+         )
+
+     def get_redirect_target(self, from_entity_id: str) -> Optional[str]:
+         row = self.conn.execute(
+             "SELECT to_entity_id FROM entity_redirects WHERE from_entity_id = ?",
+             (from_entity_id,),
+         ).fetchone()
+         if not row:
+             return None
+         return str(row["to_entity_id"])
+
+     def remove_redirect(self, from_entity_id: str) -> None:
+         self.conn.execute("DELETE FROM entity_redirects WHERE from_entity_id = ?", (from_entity_id,))
+         self.conn.commit()
+
+     def set_entity_status(self, entity_id: str, status: str) -> None:
+         self.conn.execute("UPDATE entities SET status = ? WHERE entity_id = ?", (status, entity_id))
+         self.conn.commit()
+
+     def merge_entities(self, from_entity_id: str, to_entity_id: str, reason: str, caused_by: str) -> int:
+         from_canonical = self.canonical_entity_id(from_entity_id)
+         to_canonical = self.canonical_entity_id(to_entity_id)
+
+         if from_canonical == to_canonical:
+             raise ValueError("Entities are already merged")
+
+         timestamp = utcnow_iso()
+         self.conn.execute(
+             "INSERT OR REPLACE INTO entity_redirects(from_entity_id, to_entity_id, timestamp, reason, caused_by) VALUES (?, ?, ?, ?, ?)",
+             (from_canonical, to_canonical, timestamp, reason, caused_by),
+         )
+         self.conn.execute(
+             "UPDATE entities SET status = ? WHERE entity_id = ?",
+             (EntityStatus.MERGED, from_canonical),
+         )
+         self.conn.execute(
+             "UPDATE entities SET status = ? WHERE entity_id = ?",
+             (EntityStatus.ACTIVE, to_canonical),
+         )
+         cursor = self.conn.execute(
+             "INSERT INTO merge_records(from_entity_id, to_entity_id, reason, timestamp, caused_by) VALUES (?, ?, ?, ?, ?)",
+             (from_canonical, to_canonical, reason, timestamp, caused_by),
+         )
+         self.conn.commit()
+         return int(cursor.lastrowid)
+
+     def list_aliases_for_entity(self, entity_id: str) -> List[Dict[str, Any]]:
+         target = self.canonical_entity_id(entity_id)
+         rows = self.conn.execute(
+             "SELECT identifier_type, normalized_value, entity_id, confidence FROM aliases ORDER BY identifier_type, normalized_value"
+         ).fetchall()
+         return [
+             {
+                 "identifier_type": row["identifier_type"],
+                 "normalized_value": row["normalized_value"],
+                 "entity_id": row["entity_id"],
+                 "confidence": row["confidence"],
+             }
+             for row in rows
+             if self.canonical_entity_id(row["entity_id"]) == target
+         ]
+
+     def list_merge_history(self) -> List[Dict[str, Any]]:
+         rows = self.conn.execute(
+             "SELECT merge_id, from_entity_id, to_entity_id, reason, timestamp, caused_by FROM merge_records ORDER BY merge_id"
+         ).fetchall()
+         return [dict(row) for row in rows]
+
+     def export_snapshot(self, output_path: str) -> None:
+         payload: Dict[str, Any] = {}
+         for table in ["entities", "identifiers", "aliases", "merge_records", "entity_redirects"]:
+             rows = self.conn.execute(f"SELECT * FROM {table}").fetchall()
+             payload[table] = [dict(row) for row in rows]
+
+         path = Path(output_path)
+         path.parent.mkdir(parents=True, exist_ok=True)
+         path.write_text(json.dumps(payload, indent=2, sort_keys=True), encoding="utf-8")
+
+     def ensure_entity(self, entity_id: str) -> None:
+         row = self.get_entity(entity_id)
+         if not row:
+             raise ValueError(f"Unknown entity_id: {entity_id}")
+
+     def iter_identifiers_for_entity(self, entity_id: str) -> Iterable[Dict[str, Any]]:
+         target = self.canonical_entity_id(entity_id)
+         rows = self.conn.execute(
+             """
+             SELECT a.entity_id, i.identifier_type, i.value, i.normalized_value, i.confidence
+             FROM aliases a
+             JOIN identifiers i ON a.identifier_type = i.identifier_type AND a.normalized_value = i.normalized_value
+             ORDER BY i.identifier_type, i.normalized_value
+             """
+         ).fetchall()
+         for row in rows:
+             if self.canonical_entity_id(row["entity_id"]) == target:
+                 yield {
+                     "identifier_type": row["identifier_type"],
+                     "value": row["value"],
+                     "normalized_value": row["normalized_value"],
+                     "confidence": row["confidence"],
+                 }
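`SQLiteEntityStore` can also be driven without the resolver; a minimal sketch (the file and snapshot paths are illustrative, and the email value is passed already in normalized form):

```python
from metaspn_entities import SQLiteEntityStore

store = SQLiteEntityStore("entities.db")  # or the default ":memory:"
ent = store.create_entity("person")
store.add_alias("email", "ada@example.com", ent, confidence=0.95, caused_by="import")

# Redirect chains collapse to one canonical id, even after chained merges.
other = store.create_entity("person")
store.merge_entities(other, ent, reason="dedupe", caused_by="reviewer")
assert store.canonical_entity_id(other) == ent

# Dump all five tables as JSON for inspection.
store.export_snapshot("snapshots/entities.json")
store.close()
```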
+++ b/metaspn_entities.egg-info/PKG-INFO
@@ -0,0 +1,57 @@
+ Metadata-Version: 2.4
+ Name: metaspn-entities
+ Version: 0.1.0
+ Summary: Canonical entity resolution, aliasing, and merges for MetaSPN systems
+ Author: MetaSPN Contributors
+ License-Expression: MIT
+ Keywords: entity-resolution,identity,aliasing,dedupe,sqlite
+ Classifier: Development Status :: 3 - Alpha
+ Classifier: Intended Audience :: Developers
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3 :: Only
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Topic :: Software Development :: Libraries
+ Classifier: Topic :: Database
+ Requires-Python: >=3.10
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: metaspn-schemas
+ Provides-Extra: dev
+ Requires-Dist: build>=1.2.0; extra == "dev"
+ Requires-Dist: twine>=5.0.0; extra == "dev"
+ Dynamic: license-file
+
+ # metaspn-entities
+
+ Identity layer for MetaSPN systems.
+
+ ## Features
+
+ - Canonical entity IDs
+ - Deterministic identifier normalization + alias resolution
+ - Merge history and reversible soft undo
+ - SQLite backend using stdlib `sqlite3`
+ - Event emission payloads for `EntityResolved`, `EntityMerged`, `EntityAliasAdded`
+ - Optional filesystem snapshot export
+
+ ## Quick usage
+
+ ```python
+ from metaspn_entities import EntityResolver
+
+ resolver = EntityResolver()
+ resolution = resolver.resolve("twitter_handle", "@some_handle")
+ events = resolver.drain_events()
+ print(resolution.entity_id, resolution.confidence)
+ ```
+
+ ## API notes
+
+ - `resolve(identifier_type, value, context=None) -> EntityResolution`
+ - `add_alias(entity_id, identifier_type, value, ...)`
+ - `merge_entities(from_entity_id, to_entity_id, reason, ...)`
+ - `undo_merge(from_entity_id, to_entity_id, ...)` (implemented as reverse merge with redirect correction)
+ - `drain_events() -> list[EmittedEvent]`
+ - `export_snapshot(output_path)` to inspect SQLite state as JSON
+++ b/metaspn_entities.egg-info/SOURCES.txt
@@ -0,0 +1,15 @@
+ LICENSE
+ README.md
+ pyproject.toml
+ metaspn_entities/__init__.py
+ metaspn_entities/events.py
+ metaspn_entities/models.py
+ metaspn_entities/normalize.py
+ metaspn_entities/resolver.py
+ metaspn_entities/sqlite_backend.py
+ metaspn_entities.egg-info/PKG-INFO
+ metaspn_entities.egg-info/SOURCES.txt
+ metaspn_entities.egg-info/dependency_links.txt
+ metaspn_entities.egg-info/requires.txt
+ metaspn_entities.egg-info/top_level.txt
+ tests/test_resolver.py
+++ b/metaspn_entities.egg-info/requires.txt
@@ -0,0 +1,5 @@
+ metaspn-schemas
+
+ [dev]
+ build>=1.2.0
+ twine>=5.0.0
+++ b/metaspn_entities.egg-info/top_level.txt
@@ -0,0 +1 @@
+ metaspn_entities
+++ b/pyproject.toml
@@ -0,0 +1,38 @@
+ [build-system]
+ requires = ["setuptools>=68", "wheel"]
+ build-backend = "setuptools.build_meta"
+
+ [project]
+ name = "metaspn-entities"
+ version = "0.1.0"
+ description = "Canonical entity resolution, aliasing, and merges for MetaSPN systems"
+ readme = "README.md"
+ requires-python = ">=3.10"
+ license = "MIT"
+ authors = [
+     {name = "MetaSPN Contributors"}
+ ]
+ keywords = ["entity-resolution", "identity", "aliasing", "dedupe", "sqlite"]
+ classifiers = [
+     "Development Status :: 3 - Alpha",
+     "Intended Audience :: Developers",
+     "Programming Language :: Python :: 3",
+     "Programming Language :: Python :: 3 :: Only",
+     "Programming Language :: Python :: 3.10",
+     "Programming Language :: Python :: 3.11",
+     "Programming Language :: Python :: 3.12",
+     "Topic :: Software Development :: Libraries",
+     "Topic :: Database",
+ ]
+ dependencies = [
+     "metaspn-schemas"
+ ]
+
+ [project.optional-dependencies]
+ dev = [
+     "build>=1.2.0",
+     "twine>=5.0.0"
+ ]
+
+ [tool.setuptools]
+ packages = ["metaspn_entities"]
+++ b/setup.cfg
@@ -0,0 +1,4 @@
+ [egg_info]
+ tag_build =
+ tag_date = 0
+
+++ b/tests/test_resolver.py
@@ -0,0 +1,87 @@
+ import tempfile
+ import unittest
+ from pathlib import Path
+
+ from metaspn_entities import EntityResolver, SQLiteEntityStore
+
+
+ class ResolverTests(unittest.TestCase):
+     def setUp(self) -> None:
+         self.tempdir = tempfile.TemporaryDirectory()
+         self.db_path = str(Path(self.tempdir.name) / "entities.db")
+         self.store = SQLiteEntityStore(self.db_path)
+         self.resolver = EntityResolver(self.store)
+
+     def tearDown(self) -> None:
+         self.store.close()
+         self.tempdir.cleanup()
+
+     def test_exact_match_resolution(self) -> None:
+         first = self.resolver.resolve("twitter_handle", "@same")
+         second = self.resolver.resolve("twitter_handle", "same")
+         self.assertEqual(first.entity_id, second.entity_id)
+         self.assertFalse(second.created_new_entity)
+
+     def test_alias_addition(self) -> None:
+         created = self.resolver.resolve("twitter_handle", "alpha")
+         events = self.resolver.add_alias(created.entity_id, "email", "alpha@example.com", caused_by="manual")
+         self.assertEqual(len(events), 1)
+
+         again = self.resolver.resolve("email", "ALPHA@example.com")
+         self.assertEqual(again.entity_id, created.entity_id)
+
+     def test_merge_correctness(self) -> None:
+         a = self.resolver.resolve("twitter_handle", "person_a")
+         b = self.resolver.resolve("twitter_handle", "person_b")
+         self.resolver.merge_entities(a.entity_id, b.entity_id, reason="manual dedupe", caused_by="reviewer")
+
+         merged = self.resolver.resolve("twitter_handle", "person_a")
+         self.assertEqual(merged.entity_id, b.entity_id)
+
+     def test_merge_history(self) -> None:
+         a = self.resolver.resolve("twitter_handle", "x_a")
+         b = self.resolver.resolve("twitter_handle", "x_b")
+         self.resolver.merge_entities(a.entity_id, b.entity_id, reason="dedupe", caused_by="reviewer")
+         history = self.resolver.merge_history()
+         self.assertEqual(len(history), 1)
+         self.assertEqual(history[0]["from_entity_id"], a.entity_id)
+         self.assertEqual(history[0]["to_entity_id"], b.entity_id)
+
+     def test_merge_undo_via_reverse_merge(self) -> None:
+         a = self.resolver.resolve("twitter_handle", "undo_a")
+         b = self.resolver.resolve("twitter_handle", "undo_b")
+         self.resolver.merge_entities(a.entity_id, b.entity_id, reason="dedupe", caused_by="reviewer")
+         merged = self.resolver.resolve("twitter_handle", "undo_a")
+         self.assertEqual(merged.entity_id, b.entity_id)
+         self.resolver.undo_merge(a.entity_id, b.entity_id, caused_by="reviewer")
+
+         current_a = self.resolver.resolve("twitter_handle", "undo_a")
+         current_b = self.resolver.resolve("twitter_handle", "undo_b")
+         self.assertEqual(current_a.entity_id, a.entity_id)
+         self.assertEqual(current_b.entity_id, a.entity_id)
+
+     def test_confidence_behavior(self) -> None:
+         first = self.resolver.resolve("email", "test@example.com", context={"confidence": 0.7})
+         second = self.resolver.resolve("email", "test@example.com", context={"confidence": 0.4})
+         self.assertEqual(first.entity_id, second.entity_id)
+         self.assertGreaterEqual(second.confidence, 0.7)
+
+     def test_auto_merge_on_email(self) -> None:
+         a = self.resolver.resolve("twitter_handle", "owner_a")
+         b = self.resolver.resolve("twitter_handle", "owner_b")
+
+         self.resolver.add_alias(a.entity_id, "email", "shared@example.com")
+         events = self.resolver.add_alias(b.entity_id, "email", "shared@example.com")
+         self.assertEqual(len(events), 1)
+         self.assertEqual(events[0].event_type, "EntityMerged")
+
+         # resolving either alias should route to the same canonical entity
+         one = self.resolver.resolve("twitter_handle", "owner_a")
+         two = self.resolver.resolve("twitter_handle", "owner_b")
+         self.assertEqual(one.entity_id, two.entity_id)
+         shared = self.resolver.resolve("email", "shared@example.com")
+         self.assertEqual(one.entity_id, shared.entity_id)
+
+
+ if __name__ == "__main__":
+     unittest.main()