metaspn-entities 0.1.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metaspn_entities-0.1.0/LICENSE +21 -0
- metaspn_entities-0.1.0/PKG-INFO +57 -0
- metaspn_entities-0.1.0/README.md +32 -0
- metaspn_entities-0.1.0/metaspn_entities/__init__.py +11 -0
- metaspn_entities-0.1.0/metaspn_entities/events.py +49 -0
- metaspn_entities-0.1.0/metaspn_entities/models.py +73 -0
- metaspn_entities-0.1.0/metaspn_entities/normalize.py +36 -0
- metaspn_entities-0.1.0/metaspn_entities/resolver.py +149 -0
- metaspn_entities-0.1.0/metaspn_entities/sqlite_backend.py +278 -0
- metaspn_entities-0.1.0/metaspn_entities.egg-info/PKG-INFO +57 -0
- metaspn_entities-0.1.0/metaspn_entities.egg-info/SOURCES.txt +15 -0
- metaspn_entities-0.1.0/metaspn_entities.egg-info/dependency_links.txt +1 -0
- metaspn_entities-0.1.0/metaspn_entities.egg-info/requires.txt +5 -0
- metaspn_entities-0.1.0/metaspn_entities.egg-info/top_level.txt +1 -0
- metaspn_entities-0.1.0/pyproject.toml +38 -0
- metaspn_entities-0.1.0/setup.cfg +4 -0
- metaspn_entities-0.1.0/tests/test_resolver.py +87 -0
--- /dev/null
+++ metaspn_entities-0.1.0/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2026 MetaSPN Contributors
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- /dev/null
+++ metaspn_entities-0.1.0/PKG-INFO
@@ -0,0 +1,57 @@
+Metadata-Version: 2.4
+Name: metaspn-entities
+Version: 0.1.0
+Summary: Canonical entity resolution, aliasing, and merges for MetaSPN systems
+Author: MetaSPN Contributors
+License-Expression: MIT
+Keywords: entity-resolution,identity,aliasing,dedupe,sqlite
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3 :: Only
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Software Development :: Libraries
+Classifier: Topic :: Database
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: metaspn-schemas
+Provides-Extra: dev
+Requires-Dist: build>=1.2.0; extra == "dev"
+Requires-Dist: twine>=5.0.0; extra == "dev"
+Dynamic: license-file
+
+# metaspn-entities
+
+Identity layer for MetaSPN systems.
+
+## Features
+
+- Canonical entity IDs
+- Deterministic identifier normalization + alias resolution
+- Merge history and reversible soft undo
+- SQLite backend using stdlib `sqlite3`
+- Event emission payloads for `EntityResolved`, `EntityMerged`, `EntityAliasAdded`
+- Optional filesystem snapshot export
+
+## Quick usage
+
+```python
+from metaspn_entities import EntityResolver
+
+resolver = EntityResolver()
+resolution = resolver.resolve("twitter_handle", "@some_handle")
+events = resolver.drain_events()
+print(resolution.entity_id, resolution.confidence)
+```
+
+## API notes
+
+- `resolve(identifier_type, value, context=None) -> EntityResolution`
+- `add_alias(entity_id, identifier_type, value, ...)`
+- `merge_entities(from_entity_id, to_entity_id, reason, ...)`
+- `undo_merge(from_entity_id, to_entity_id, ...)` (implemented as reverse merge with redirect correction)
+- `drain_events() -> list[EmittedEvent]`
+- `export_snapshot(output_path)` to inspect SQLite state as JSON
--- /dev/null
+++ metaspn_entities-0.1.0/README.md
@@ -0,0 +1,32 @@
+# metaspn-entities
+
+Identity layer for MetaSPN systems.
+
+## Features
+
+- Canonical entity IDs
+- Deterministic identifier normalization + alias resolution
+- Merge history and reversible soft undo
+- SQLite backend using stdlib `sqlite3`
+- Event emission payloads for `EntityResolved`, `EntityMerged`, `EntityAliasAdded`
+- Optional filesystem snapshot export
+
+## Quick usage
+
+```python
+from metaspn_entities import EntityResolver
+
+resolver = EntityResolver()
+resolution = resolver.resolve("twitter_handle", "@some_handle")
+events = resolver.drain_events()
+print(resolution.entity_id, resolution.confidence)
+```
+
+## API notes
+
+- `resolve(identifier_type, value, context=None) -> EntityResolution`
+- `add_alias(entity_id, identifier_type, value, ...)`
+- `merge_entities(from_entity_id, to_entity_id, reason, ...)`
+- `undo_merge(from_entity_id, to_entity_id, ...)` (implemented as reverse merge with redirect correction)
+- `drain_events() -> list[EmittedEvent]`
+- `export_snapshot(output_path)` to inspect SQLite state as JSON
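
The API notes in the README compress the whole merge lifecycle into one list. A minimal sketch of that flow, using only the public API shown in this diff (the handles and reason strings are illustrative):

```python
from metaspn_entities import EntityResolver

resolver = EntityResolver()  # defaults to an in-memory SQLite store

# Two resolutions create two distinct entities.
a = resolver.resolve("twitter_handle", "@alice")
b = resolver.resolve("twitter_handle", "@alicia")

# Manual dedupe: fold `a` into `b`; later resolutions follow the redirect.
resolver.merge_entities(a.entity_id, b.entity_id, reason="same person", caused_by="reviewer")
assert resolver.resolve("twitter_handle", "@alice").entity_id == b.entity_id

# Soft undo is itself a reverse merge, so it also lands in merge_history().
resolver.undo_merge(a.entity_id, b.entity_id, caused_by="reviewer")

for event in resolver.drain_events():
    print(event.event_type, event.payload)
```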
--- /dev/null
+++ metaspn_entities-0.1.0/metaspn_entities/__init__.py
@@ -0,0 +1,11 @@
+from .events import EmittedEvent
+from .models import EntityResolution
+from .resolver import EntityResolver
+from .sqlite_backend import SQLiteEntityStore
+
+__all__ = [
+    "EntityResolver",
+    "EntityResolution",
+    "EmittedEvent",
+    "SQLiteEntityStore",
+]
--- /dev/null
+++ metaspn_entities-0.1.0/metaspn_entities/events.py
@@ -0,0 +1,49 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any, Dict
+
+
+@dataclass(frozen=True)
+class EmittedEvent:
+    event_type: str
+    payload: Dict[str, Any]
+
+
+class EventFactory:
+    @staticmethod
+    def entity_resolved(entity_id: str, identifier_type: str, value: str, confidence: float, created_new_entity: bool) -> EmittedEvent:
+        return EmittedEvent(
+            event_type="EntityResolved",
+            payload={
+                "entity_id": entity_id,
+                "identifier_type": identifier_type,
+                "value": value,
+                "confidence": confidence,
+                "created_new_entity": created_new_entity,
+            },
+        )
+
+    @staticmethod
+    def entity_merged(from_entity_id: str, to_entity_id: str, reason: str, caused_by: str) -> EmittedEvent:
+        return EmittedEvent(
+            event_type="EntityMerged",
+            payload={
+                "from_entity_id": from_entity_id,
+                "to_entity_id": to_entity_id,
+                "reason": reason,
+                "caused_by": caused_by,
+            },
+        )
+
+    @staticmethod
+    def entity_alias_added(entity_id: str, identifier_type: str, normalized_value: str, caused_by: str) -> EmittedEvent:
+        return EmittedEvent(
+            event_type="EntityAliasAdded",
+            payload={
+                "entity_id": entity_id,
+                "identifier_type": identifier_type,
+                "normalized_value": normalized_value,
+                "caused_by": caused_by,
+            },
+        )
--- /dev/null
+++ metaspn_entities-0.1.0/metaspn_entities/models.py
@@ -0,0 +1,73 @@
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from typing import Any, Dict, List, Optional
+
+
+DEFAULT_MATCH_CONFIDENCE = 0.95
+DEFAULT_NEW_ENTITY_CONFIDENCE = 0.6
+
+
+class EntityStatus:
+    ACTIVE = "active"
+    MERGED = "merged"
+
+
+class EntityType:
+    PERSON = "person"
+    ORG = "org"
+    PROJECT = "project"
+
+
+@dataclass(frozen=True)
+class Entity:
+    entity_id: str
+    entity_type: str
+    created_at: str
+    status: str
+
+
+@dataclass(frozen=True)
+class Identifier:
+    identifier_type: str
+    value: str
+    normalized_value: str
+    confidence: float
+    first_seen_at: str
+    last_seen_at: str
+    provenance: Optional[str] = None
+
+
+@dataclass(frozen=True)
+class Alias:
+    identifier_type: str
+    normalized_value: str
+    entity_id: str
+    confidence: float
+    created_at: str
+    caused_by: str
+    provenance: Optional[str] = None
+
+
+@dataclass(frozen=True)
+class MergeRecord:
+    merge_id: int
+    from_entity_id: str
+    to_entity_id: str
+    reason: str
+    timestamp: str
+    caused_by: str
+
+
+@dataclass(frozen=True)
+class EntityResolution:
+    entity_id: str
+    confidence: float
+    created_new_entity: bool
+    matched_identifiers: List[Dict[str, Any]] = field(default_factory=list)
+
+
+
+def utcnow_iso() -> str:
+    return datetime.now(timezone.utc).replace(microsecond=0).isoformat()
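
Two details of `models.py` worth noting: `EntityResolution` is frozen but gets a per-instance `matched_identifiers` default via `field(default_factory=list)`, and `utcnow_iso()` returns second-resolution, timezone-aware ISO-8601 strings, which is what the TEXT timestamp columns in the SQLite backend store. A quick illustration (the entity id is made up):

```python
from metaspn_entities.models import EntityResolution, utcnow_iso

res = EntityResolution(entity_id="ent_0000", confidence=0.95, created_new_entity=True)
print(res.matched_identifiers)  # [] -- a fresh list per instance, not a shared default

print(utcnow_iso())  # e.g. "2026-01-15T12:34:56+00:00" (microseconds dropped)
```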
--- /dev/null
+++ metaspn_entities-0.1.0/metaspn_entities/normalize.py
@@ -0,0 +1,36 @@
+from __future__ import annotations
+
+from urllib.parse import urlparse
+
+
+def normalize_identifier(identifier_type: str, value: str) -> str:
+    identifier_type = identifier_type.strip().lower()
+    value = value.strip()
+
+    if identifier_type in {"twitter_handle", "github_handle", "handle"}:
+        return value.lstrip("@").lower()
+
+    if identifier_type == "email":
+        return value.lower()
+
+    if identifier_type == "domain":
+        cleaned = value.lower()
+        if cleaned.startswith("http://") or cleaned.startswith("https://"):
+            cleaned = urlparse(cleaned).netloc or cleaned
+        return cleaned.removeprefix("www.")  # removeprefix, not lstrip: lstrip("www.") strips characters, not a prefix
+
+    if identifier_type in {"linkedin_url", "url", "canonical_url"}:
+        parsed = urlparse(value)
+        if parsed.scheme:
+            host = parsed.netloc.lower().removeprefix("www.")
+            path = parsed.path.rstrip("/")
+            return f"{host}{path}".lower()
+        return value.lower().rstrip("/")
+
+    if identifier_type == "name":
+        return " ".join(value.lower().split())
+
+    return value.lower()
+
+
+AUTO_MERGE_IDENTIFIER_TYPES = {"email", "canonical_url", "url"}
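
Since the alias table is keyed on these normalized strings, a few concrete mappings help. The following pairs follow directly from the branches above (all inputs are invented):

```python
from metaspn_entities.normalize import normalize_identifier

assert normalize_identifier("twitter_handle", " @Some_Handle ") == "some_handle"
assert normalize_identifier("email", "Alice@Example.COM") == "alice@example.com"
assert normalize_identifier("domain", "https://www.Example.com") == "example.com"
assert normalize_identifier("url", "https://www.Example.com/People/Alice/") == "example.com/people/alice"
assert normalize_identifier("name", "  Alice   Liddell ") == "alice liddell"
assert normalize_identifier("custom_type", "MiXeD") == "mixed"  # fallback branch
```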
--- /dev/null
+++ metaspn_entities-0.1.0/metaspn_entities/resolver.py
@@ -0,0 +1,149 @@
+from __future__ import annotations
+
+from typing import Any, Dict, List, Optional
+
+from .events import EmittedEvent, EventFactory
+from .models import (
+    DEFAULT_MATCH_CONFIDENCE,
+    DEFAULT_NEW_ENTITY_CONFIDENCE,
+    EntityResolution,
+    EntityStatus,
+    EntityType,
+)
+from .normalize import AUTO_MERGE_IDENTIFIER_TYPES, normalize_identifier
+from .sqlite_backend import SQLiteEntityStore
+
+
+class EntityResolver:
+    def __init__(self, store: Optional[SQLiteEntityStore] = None) -> None:
+        self.store = store or SQLiteEntityStore()
+        self._event_buffer: List[EmittedEvent] = []
+
+    def resolve(self, identifier_type: str, value: str, context: Optional[Dict[str, Any]] = None) -> EntityResolution:
+        context = context or {}
+        confidence = float(context.get("confidence", DEFAULT_MATCH_CONFIDENCE))
+        provenance = context.get("provenance")
+        entity_type = context.get("entity_type", EntityType.PERSON)
+        caused_by = context.get("caused_by", "resolver")
+
+        normalized = normalize_identifier(identifier_type, value)
+        self.store.upsert_identifier(identifier_type, value, normalized, confidence, provenance)
+
+        existing_alias = self.store.find_alias(identifier_type, normalized)
+        if existing_alias:
+            entity_id = self.store.canonical_entity_id(existing_alias["entity_id"])
+            matched_identifiers = list(self.store.iter_identifiers_for_entity(entity_id))
+            resolution = EntityResolution(
+                entity_id=entity_id,
+                confidence=max(float(existing_alias["confidence"]), confidence),
+                created_new_entity=False,
+                matched_identifiers=matched_identifiers,
+            )
+            self._event_buffer.append(
+                EventFactory.entity_resolved(entity_id, identifier_type, value, resolution.confidence, False)
+            )
+            return resolution
+
+        entity_id = self.store.create_entity(entity_type)
+        added, conflicting_entity_id = self.store.add_alias(
+            identifier_type=identifier_type,
+            normalized_value=normalized,
+            entity_id=entity_id,
+            confidence=confidence,
+            caused_by=caused_by,
+            provenance=provenance,
+        )
+
+        if conflicting_entity_id and identifier_type in AUTO_MERGE_IDENTIFIER_TYPES:
+            merge_reason = f"auto-merge on {identifier_type}:{normalized}"
+            self.store.merge_entities(entity_id, conflicting_entity_id, merge_reason, "auto-merge")
+            self._event_buffer.append(
+                EventFactory.entity_merged(entity_id, conflicting_entity_id, merge_reason, "auto-merge")
+            )
+            entity_id = self.store.canonical_entity_id(conflicting_entity_id)  # follow the redirect only after emitting
+
+        matched_identifiers = list(self.store.iter_identifiers_for_entity(entity_id))
+        resolution = EntityResolution(
+            entity_id=entity_id,
+            confidence=confidence if added else DEFAULT_NEW_ENTITY_CONFIDENCE,
+            created_new_entity=True,
+            matched_identifiers=matched_identifiers,
+        )
+        if added:
+            self._event_buffer.append(EventFactory.entity_alias_added(entity_id, identifier_type, normalized, caused_by))
+        self._event_buffer.append(
+            EventFactory.entity_resolved(entity_id, identifier_type, value, resolution.confidence, True)
+        )
+        return resolution
+
+    def add_alias(
+        self,
+        entity_id: str,
+        identifier_type: str,
+        value: str,
+        confidence: float = DEFAULT_MATCH_CONFIDENCE,
+        caused_by: str = "manual",
+        provenance: Optional[str] = None,
+    ) -> List[EmittedEvent]:
+        self.store.ensure_entity(entity_id)
+        canonical_entity_id = self.store.canonical_entity_id(entity_id)
+        normalized = normalize_identifier(identifier_type, value)
+        self.store.upsert_identifier(identifier_type, value, normalized, confidence, provenance)
+
+        added, conflicting_entity_id = self.store.add_alias(
+            identifier_type=identifier_type,
+            normalized_value=normalized,
+            entity_id=canonical_entity_id,
+            confidence=confidence,
+            caused_by=caused_by,
+            provenance=provenance,
+        )
+        if conflicting_entity_id and conflicting_entity_id != canonical_entity_id:
+            if identifier_type in AUTO_MERGE_IDENTIFIER_TYPES:
+                reason = f"auto-merge on {identifier_type}:{normalized}"
+                self.store.merge_entities(canonical_entity_id, conflicting_entity_id, reason, "auto-merge")
+                event = EventFactory.entity_merged(canonical_entity_id, conflicting_entity_id, reason, "auto-merge")
+                self._event_buffer.append(event)
+                return [event]
+            raise ValueError(
+                f"Alias already mapped to another entity: {identifier_type}:{normalized} -> {conflicting_entity_id}"
+            )
+
+        if not added:
+            return []
+
+        event = EventFactory.entity_alias_added(canonical_entity_id, identifier_type, normalized, caused_by)
+        self._event_buffer.append(event)
+        return [event]
+
+    def merge_entities(self, from_entity_id: str, to_entity_id: str, reason: str, caused_by: str = "manual") -> EmittedEvent:
+        self.store.ensure_entity(from_entity_id)
+        self.store.ensure_entity(to_entity_id)
+        self.store.merge_entities(from_entity_id, to_entity_id, reason, caused_by)
+        event = EventFactory.entity_merged(from_entity_id, to_entity_id, reason, caused_by)
+        self._event_buffer.append(event)
+        return event
+
+    def undo_merge(self, from_entity_id: str, to_entity_id: str, caused_by: str = "manual") -> EmittedEvent:
+        reason = f"undo merge {from_entity_id}->{to_entity_id}"
+        if self.store.get_redirect_target(from_entity_id) == to_entity_id:
+            self.store.remove_redirect(from_entity_id)
+            self.store.set_entity_status(from_entity_id, EntityStatus.ACTIVE)
+        self.store.merge_entities(to_entity_id, from_entity_id, reason, caused_by)
+        event = EventFactory.entity_merged(to_entity_id, from_entity_id, reason, caused_by)
+        self._event_buffer.append(event)
+        return event
+
+    def merge_history(self) -> List[Dict[str, Any]]:
+        return self.store.list_merge_history()
+
+    def aliases_for_entity(self, entity_id: str) -> List[Dict[str, Any]]:
+        return self.store.list_aliases_for_entity(entity_id)
+
+    def export_snapshot(self, output_path: str) -> None:
+        self.store.export_snapshot(output_path)
+
+    def drain_events(self) -> List[EmittedEvent]:
+        events = list(self._event_buffer)
+        self._event_buffer.clear()
+        return events
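
One behavior in `resolve()`/`add_alias()` that is easy to miss: identifier types in `AUTO_MERGE_IDENTIFIER_TYPES` (email and URL types) fold conflicting entities together instead of raising. A sketch, mirroring `test_auto_merge_on_email` further down in this diff:

```python
from metaspn_entities import EntityResolver

resolver = EntityResolver()
a = resolver.resolve("twitter_handle", "owner_a")
b = resolver.resolve("twitter_handle", "owner_b")

# "email" is an auto-merge type, so attaching the same address to both
# entities merges them rather than raising ValueError.
resolver.add_alias(a.entity_id, "email", "shared@example.com")
events = resolver.add_alias(b.entity_id, "email", "shared@example.com")
assert events[0].event_type == "EntityMerged"

# Both handles now resolve to the same canonical entity.
assert (resolver.resolve("twitter_handle", "owner_a").entity_id
        == resolver.resolve("twitter_handle", "owner_b").entity_id)
```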
--- /dev/null
+++ metaspn_entities-0.1.0/metaspn_entities/sqlite_backend.py
@@ -0,0 +1,278 @@
+from __future__ import annotations
+
+import json
+import sqlite3
+import uuid
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, Optional, Tuple
+
+from .models import EntityStatus, utcnow_iso
+
+
+SCHEMA_SQL = """
+CREATE TABLE IF NOT EXISTS entities (
+    entity_id TEXT PRIMARY KEY,
+    entity_type TEXT NOT NULL,
+    created_at TEXT NOT NULL,
+    status TEXT NOT NULL
+);
+
+CREATE TABLE IF NOT EXISTS identifiers (
+    identifier_type TEXT NOT NULL,
+    value TEXT NOT NULL,
+    normalized_value TEXT NOT NULL,
+    confidence REAL NOT NULL,
+    first_seen_at TEXT NOT NULL,
+    last_seen_at TEXT NOT NULL,
+    provenance TEXT,
+    UNIQUE(identifier_type, normalized_value)
+);
+
+CREATE TABLE IF NOT EXISTS aliases (
+    identifier_type TEXT NOT NULL,
+    normalized_value TEXT NOT NULL,
+    entity_id TEXT NOT NULL,
+    confidence REAL NOT NULL,
+    created_at TEXT NOT NULL,
+    caused_by TEXT NOT NULL,
+    provenance TEXT,
+    UNIQUE(identifier_type, normalized_value)
+);
+
+CREATE TABLE IF NOT EXISTS merge_records (
+    merge_id INTEGER PRIMARY KEY AUTOINCREMENT,
+    from_entity_id TEXT NOT NULL,
+    to_entity_id TEXT NOT NULL,
+    reason TEXT NOT NULL,
+    timestamp TEXT NOT NULL,
+    caused_by TEXT NOT NULL
+);
+
+CREATE TABLE IF NOT EXISTS entity_redirects (
+    from_entity_id TEXT PRIMARY KEY,
+    to_entity_id TEXT NOT NULL,
+    timestamp TEXT NOT NULL,
+    reason TEXT NOT NULL,
+    caused_by TEXT NOT NULL
+);
+"""
+
+
+class SQLiteEntityStore:
+    def __init__(self, db_path: str = ":memory:") -> None:
+        self.db_path = db_path
+        self.conn = sqlite3.connect(db_path)
+        self.conn.row_factory = sqlite3.Row
+        self.conn.executescript(SCHEMA_SQL)
+        self.conn.commit()
+
+    def close(self) -> None:
+        self.conn.close()
+
+    def create_entity(self, entity_type: str) -> str:
+        entity_id = f"ent_{uuid.uuid4().hex}"
+        now = utcnow_iso()
+        self.conn.execute(
+            "INSERT INTO entities(entity_id, entity_type, created_at, status) VALUES (?, ?, ?, ?)",
+            (entity_id, entity_type, now, EntityStatus.ACTIVE),
+        )
+        self.conn.commit()
+        return entity_id
+
+    def get_entity(self, entity_id: str) -> Optional[sqlite3.Row]:
+        row = self.conn.execute("SELECT * FROM entities WHERE entity_id = ?", (entity_id,)).fetchone()
+        return row
+
+    def canonical_entity_id(self, entity_id: str) -> str:
+        current = entity_id
+        visited = set()
+        while True:
+            if current in visited:
+                raise ValueError(f"Cycle detected in merge redirects for {entity_id}")
+            visited.add(current)
+            row = self.conn.execute(
+                "SELECT to_entity_id FROM entity_redirects WHERE from_entity_id = ?", (current,)
+            ).fetchone()
+            if not row:
+                return current
+            current = row["to_entity_id"]
+
+    def find_alias(self, identifier_type: str, normalized_value: str) -> Optional[sqlite3.Row]:
+        return self.conn.execute(
+            "SELECT * FROM aliases WHERE identifier_type = ? AND normalized_value = ?",
+            (identifier_type, normalized_value),
+        ).fetchone()
+
+    def upsert_identifier(
+        self,
+        identifier_type: str,
+        value: str,
+        normalized_value: str,
+        confidence: float,
+        provenance: Optional[str],
+    ) -> None:
+        now = utcnow_iso()
+        existing = self.conn.execute(
+            "SELECT * FROM identifiers WHERE identifier_type = ? AND normalized_value = ?",
+            (identifier_type, normalized_value),
+        ).fetchone()
+        if existing:
+            self.conn.execute(
+                "UPDATE identifiers SET value = ?, confidence = ?, last_seen_at = ?, provenance = ? WHERE identifier_type = ? AND normalized_value = ?",
+                (
+                    value,
+                    max(confidence, existing["confidence"]),
+                    now,
+                    provenance or existing["provenance"],
+                    identifier_type,
+                    normalized_value,
+                ),
+            )
+        else:
+            self.conn.execute(
+                "INSERT INTO identifiers(identifier_type, value, normalized_value, confidence, first_seen_at, last_seen_at, provenance) VALUES (?, ?, ?, ?, ?, ?, ?)",
+                (identifier_type, value, normalized_value, confidence, now, now, provenance),
+            )
+        self.conn.commit()
+
+    def add_alias(
+        self,
+        identifier_type: str,
+        normalized_value: str,
+        entity_id: str,
+        confidence: float,
+        caused_by: str,
+        provenance: Optional[str] = None,
+    ) -> Tuple[bool, Optional[str]]:
+        now = utcnow_iso()
+        existing = self.find_alias(identifier_type, normalized_value)
+        canonical_target = self.canonical_entity_id(entity_id)
+
+        if existing:
+            existing_entity = self.canonical_entity_id(existing["entity_id"])
+            if existing_entity == canonical_target:
+                self.conn.execute(
+                    "UPDATE aliases SET confidence = ?, provenance = ? WHERE identifier_type = ? AND normalized_value = ?",
+                    (
+                        max(confidence, existing["confidence"]),
+                        provenance or existing["provenance"],
+                        identifier_type,
+                        normalized_value,
+                    ),
+                )
+                self.conn.commit()
+                return False, None
+            return False, existing_entity
+
+        self.conn.execute(
+            "INSERT INTO aliases(identifier_type, normalized_value, entity_id, confidence, created_at, caused_by, provenance) VALUES (?, ?, ?, ?, ?, ?, ?)",
+            (identifier_type, normalized_value, canonical_target, confidence, now, caused_by, provenance),
+        )
+        self.conn.commit()
+        return True, None
+
+    def reassign_aliases(self, from_entity_id: str, to_entity_id: str) -> None:
+        self.conn.execute(
+            "UPDATE aliases SET entity_id = ? WHERE entity_id = ?",
+            (to_entity_id, from_entity_id),
+        )
+
+    def get_redirect_target(self, from_entity_id: str) -> Optional[str]:
+        row = self.conn.execute(
+            "SELECT to_entity_id FROM entity_redirects WHERE from_entity_id = ?",
+            (from_entity_id,),
+        ).fetchone()
+        if not row:
+            return None
+        return str(row["to_entity_id"])
+
+    def remove_redirect(self, from_entity_id: str) -> None:
+        self.conn.execute("DELETE FROM entity_redirects WHERE from_entity_id = ?", (from_entity_id,))
+        self.conn.commit()
+
+    def set_entity_status(self, entity_id: str, status: str) -> None:
+        self.conn.execute("UPDATE entities SET status = ? WHERE entity_id = ?", (status, entity_id))
+        self.conn.commit()
+
+    def merge_entities(self, from_entity_id: str, to_entity_id: str, reason: str, caused_by: str) -> int:
+        from_canonical = self.canonical_entity_id(from_entity_id)
+        to_canonical = self.canonical_entity_id(to_entity_id)
+
+        if from_canonical == to_canonical:
+            raise ValueError("Entities are already merged")
+
+        timestamp = utcnow_iso()
+        self.conn.execute(
+            "INSERT OR REPLACE INTO entity_redirects(from_entity_id, to_entity_id, timestamp, reason, caused_by) VALUES (?, ?, ?, ?, ?)",
+            (from_canonical, to_canonical, timestamp, reason, caused_by),
+        )
+        self.conn.execute(
+            "UPDATE entities SET status = ? WHERE entity_id = ?",
+            (EntityStatus.MERGED, from_canonical),
+        )
+        self.conn.execute(
+            "UPDATE entities SET status = ? WHERE entity_id = ?",
+            (EntityStatus.ACTIVE, to_canonical),
+        )
+        cursor = self.conn.execute(
+            "INSERT INTO merge_records(from_entity_id, to_entity_id, reason, timestamp, caused_by) VALUES (?, ?, ?, ?, ?)",
+            (from_canonical, to_canonical, reason, timestamp, caused_by),
+        )
+        self.conn.commit()
+        return int(cursor.lastrowid)
+
+    def list_aliases_for_entity(self, entity_id: str) -> List[Dict[str, Any]]:
+        target = self.canonical_entity_id(entity_id)
+        rows = self.conn.execute(
+            "SELECT identifier_type, normalized_value, entity_id, confidence FROM aliases ORDER BY identifier_type, normalized_value"
+        ).fetchall()
+        return [
+            {
+                "identifier_type": row["identifier_type"],
+                "normalized_value": row["normalized_value"],
+                "entity_id": row["entity_id"],
+                "confidence": row["confidence"],
+            }
+            for row in rows
+            if self.canonical_entity_id(row["entity_id"]) == target
+        ]
+
+    def list_merge_history(self) -> List[Dict[str, Any]]:
+        rows = self.conn.execute(
+            "SELECT merge_id, from_entity_id, to_entity_id, reason, timestamp, caused_by FROM merge_records ORDER BY merge_id"
+        ).fetchall()
+        return [dict(row) for row in rows]
+
+    def export_snapshot(self, output_path: str) -> None:
+        payload: Dict[str, Any] = {}
+        for table in ["entities", "identifiers", "aliases", "merge_records", "entity_redirects"]:
+            rows = self.conn.execute(f"SELECT * FROM {table}").fetchall()
+            payload[table] = [dict(row) for row in rows]
+
+        path = Path(output_path)
+        path.parent.mkdir(parents=True, exist_ok=True)
+        path.write_text(json.dumps(payload, indent=2, sort_keys=True), encoding="utf-8")
+
+    def ensure_entity(self, entity_id: str) -> None:
+        row = self.get_entity(entity_id)
+        if not row:
+            raise ValueError(f"Unknown entity_id: {entity_id}")
+
+    def iter_identifiers_for_entity(self, entity_id: str) -> Iterable[Dict[str, Any]]:
+        target = self.canonical_entity_id(entity_id)
+        rows = self.conn.execute(
+            """
+            SELECT a.entity_id, i.identifier_type, i.value, i.normalized_value, i.confidence
+            FROM aliases a
+            JOIN identifiers i ON a.identifier_type = i.identifier_type AND a.normalized_value = i.normalized_value
+            ORDER BY i.identifier_type, i.normalized_value
+            """
+        ).fetchall()
+        for row in rows:
+            if self.canonical_entity_id(row["entity_id"]) == target:
+                yield {
+                    "identifier_type": row["identifier_type"],
+                    "value": row["value"],
+                    "normalized_value": row["normalized_value"],
+                    "confidence": row["confidence"],
+                }
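
Redirects are the backbone of the merge model above: each merge writes one `entity_redirects` row, and `canonical_entity_id()` walks the chain (with cycle detection) rather than rewriting old rows. A sketch of chain-following against the store directly:

```python
from metaspn_entities import SQLiteEntityStore

store = SQLiteEntityStore()  # ":memory:" by default
x = store.create_entity("person")
y = store.create_entity("person")
z = store.create_entity("person")

# Two merges leave a redirect chain x -> y -> z; nothing rewrites the x row.
store.merge_entities(x, y, reason="dup", caused_by="demo")
store.merge_entities(y, z, reason="dup", caused_by="demo")

assert store.canonical_entity_id(x) == z
assert store.get_entity(x)["status"] == "merged"
assert store.get_entity(z)["status"] == "active"
```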
--- /dev/null
+++ metaspn_entities-0.1.0/metaspn_entities.egg-info/PKG-INFO
@@ -0,0 +1,57 @@
+Metadata-Version: 2.4
+Name: metaspn-entities
+Version: 0.1.0
+Summary: Canonical entity resolution, aliasing, and merges for MetaSPN systems
+Author: MetaSPN Contributors
+License-Expression: MIT
+Keywords: entity-resolution,identity,aliasing,dedupe,sqlite
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3 :: Only
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Software Development :: Libraries
+Classifier: Topic :: Database
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: metaspn-schemas
+Provides-Extra: dev
+Requires-Dist: build>=1.2.0; extra == "dev"
+Requires-Dist: twine>=5.0.0; extra == "dev"
+Dynamic: license-file
+
+# metaspn-entities
+
+Identity layer for MetaSPN systems.
+
+## Features
+
+- Canonical entity IDs
+- Deterministic identifier normalization + alias resolution
+- Merge history and reversible soft undo
+- SQLite backend using stdlib `sqlite3`
+- Event emission payloads for `EntityResolved`, `EntityMerged`, `EntityAliasAdded`
+- Optional filesystem snapshot export
+
+## Quick usage
+
+```python
+from metaspn_entities import EntityResolver
+
+resolver = EntityResolver()
+resolution = resolver.resolve("twitter_handle", "@some_handle")
+events = resolver.drain_events()
+print(resolution.entity_id, resolution.confidence)
+```
+
+## API notes
+
+- `resolve(identifier_type, value, context=None) -> EntityResolution`
+- `add_alias(entity_id, identifier_type, value, ...)`
+- `merge_entities(from_entity_id, to_entity_id, reason, ...)`
+- `undo_merge(from_entity_id, to_entity_id, ...)` (implemented as reverse merge with redirect correction)
+- `drain_events() -> list[EmittedEvent]`
+- `export_snapshot(output_path)` to inspect SQLite state as JSON
--- /dev/null
+++ metaspn_entities-0.1.0/metaspn_entities.egg-info/SOURCES.txt
@@ -0,0 +1,15 @@
+LICENSE
+README.md
+pyproject.toml
+metaspn_entities/__init__.py
+metaspn_entities/events.py
+metaspn_entities/models.py
+metaspn_entities/normalize.py
+metaspn_entities/resolver.py
+metaspn_entities/sqlite_backend.py
+metaspn_entities.egg-info/PKG-INFO
+metaspn_entities.egg-info/SOURCES.txt
+metaspn_entities.egg-info/dependency_links.txt
+metaspn_entities.egg-info/requires.txt
+metaspn_entities.egg-info/top_level.txt
+tests/test_resolver.py
--- /dev/null
+++ metaspn_entities-0.1.0/metaspn_entities.egg-info/dependency_links.txt
@@ -0,0 +1 @@
+
--- /dev/null
+++ metaspn_entities-0.1.0/metaspn_entities.egg-info/top_level.txt
@@ -0,0 +1 @@
+metaspn_entities
--- /dev/null
+++ metaspn_entities-0.1.0/pyproject.toml
@@ -0,0 +1,38 @@
+[build-system]
+requires = ["setuptools>=68", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "metaspn-entities"
+version = "0.1.0"
+description = "Canonical entity resolution, aliasing, and merges for MetaSPN systems"
+readme = "README.md"
+requires-python = ">=3.10"
+license = "MIT"
+authors = [
+    {name = "MetaSPN Contributors"}
+]
+keywords = ["entity-resolution", "identity", "aliasing", "dedupe", "sqlite"]
+classifiers = [
+    "Development Status :: 3 - Alpha",
+    "Intended Audience :: Developers",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3 :: Only",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Topic :: Software Development :: Libraries",
+    "Topic :: Database",
+]
+dependencies = [
+    "metaspn-schemas"
+]
+
+[project.optional-dependencies]
+dev = [
+    "build>=1.2.0",
+    "twine>=5.0.0"
+]
+
+[tool.setuptools]
+packages = ["metaspn_entities"]
--- /dev/null
+++ metaspn_entities-0.1.0/tests/test_resolver.py
@@ -0,0 +1,87 @@
+import tempfile
+import unittest
+from pathlib import Path
+
+from metaspn_entities import EntityResolver, SQLiteEntityStore
+
+
+class ResolverTests(unittest.TestCase):
+    def setUp(self) -> None:
+        self.tempdir = tempfile.TemporaryDirectory()
+        self.db_path = str(Path(self.tempdir.name) / "entities.db")
+        self.store = SQLiteEntityStore(self.db_path)
+        self.resolver = EntityResolver(self.store)
+
+    def tearDown(self) -> None:
+        self.store.close()
+        self.tempdir.cleanup()
+
+    def test_exact_match_resolution(self) -> None:
+        first = self.resolver.resolve("twitter_handle", "@same")
+        second = self.resolver.resolve("twitter_handle", "same")
+        self.assertEqual(first.entity_id, second.entity_id)
+        self.assertFalse(second.created_new_entity)
+
+    def test_alias_addition(self) -> None:
+        created = self.resolver.resolve("twitter_handle", "alpha")
+        events = self.resolver.add_alias(created.entity_id, "email", "alpha@example.com", caused_by="manual")
+        self.assertEqual(len(events), 1)
+
+        again = self.resolver.resolve("email", "ALPHA@example.com")
+        self.assertEqual(again.entity_id, created.entity_id)
+
+    def test_merge_correctness(self) -> None:
+        a = self.resolver.resolve("twitter_handle", "person_a")
+        b = self.resolver.resolve("twitter_handle", "person_b")
+        self.resolver.merge_entities(a.entity_id, b.entity_id, reason="manual dedupe", caused_by="reviewer")
+
+        merged = self.resolver.resolve("twitter_handle", "person_a")
+        self.assertEqual(merged.entity_id, b.entity_id)
+
+    def test_merge_history(self) -> None:
+        a = self.resolver.resolve("twitter_handle", "x_a")
+        b = self.resolver.resolve("twitter_handle", "x_b")
+        self.resolver.merge_entities(a.entity_id, b.entity_id, reason="dedupe", caused_by="reviewer")
+        history = self.resolver.merge_history()
+        self.assertEqual(len(history), 1)
+        self.assertEqual(history[0]["from_entity_id"], a.entity_id)
+        self.assertEqual(history[0]["to_entity_id"], b.entity_id)
+
+    def test_merge_undo_via_reverse_merge(self) -> None:
+        a = self.resolver.resolve("twitter_handle", "undo_a")
+        b = self.resolver.resolve("twitter_handle", "undo_b")
+        self.resolver.merge_entities(a.entity_id, b.entity_id, reason="dedupe", caused_by="reviewer")
+        merged = self.resolver.resolve("twitter_handle", "undo_a")
+        self.assertEqual(merged.entity_id, b.entity_id)
+        self.resolver.undo_merge(a.entity_id, b.entity_id, caused_by="reviewer")
+
+        current_a = self.resolver.resolve("twitter_handle", "undo_a")
+        current_b = self.resolver.resolve("twitter_handle", "undo_b")
+        self.assertEqual(current_a.entity_id, a.entity_id)
+        self.assertEqual(current_b.entity_id, a.entity_id)
+
+    def test_confidence_behavior(self) -> None:
+        first = self.resolver.resolve("email", "test@example.com", context={"confidence": 0.7})
+        second = self.resolver.resolve("email", "test@example.com", context={"confidence": 0.4})
+        self.assertEqual(first.entity_id, second.entity_id)
+        self.assertGreaterEqual(second.confidence, 0.7)
+
+    def test_auto_merge_on_email(self) -> None:
+        a = self.resolver.resolve("twitter_handle", "owner_a")
+        b = self.resolver.resolve("twitter_handle", "owner_b")
+
+        self.resolver.add_alias(a.entity_id, "email", "shared@example.com")
+        events = self.resolver.add_alias(b.entity_id, "email", "shared@example.com")
+        self.assertEqual(len(events), 1)
+        self.assertEqual(events[0].event_type, "EntityMerged")
+
+        # resolving either alias should route to the same canonical entity
+        one = self.resolver.resolve("twitter_handle", "owner_a")
+        two = self.resolver.resolve("twitter_handle", "owner_b")
+        self.assertEqual(one.entity_id, two.entity_id)
+        shared = self.resolver.resolve("email", "shared@example.com")
+        self.assertEqual(one.entity_id, shared.entity_id)
+
+
+if __name__ == "__main__":
+    unittest.main()