metaspn-entities 0.1.0__tar.gz → 0.1.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {metaspn_entities-0.1.0 → metaspn_entities-0.1.4}/PKG-INFO +45 -1
- metaspn_entities-0.1.4/README.md +76 -0
- {metaspn_entities-0.1.0 → metaspn_entities-0.1.4}/metaspn_entities/__init__.py +3 -0
- metaspn_entities-0.1.4/metaspn_entities/adapter.py +131 -0
- metaspn_entities-0.1.4/metaspn_entities/events.py +65 -0
- {metaspn_entities-0.1.0 → metaspn_entities-0.1.4}/metaspn_entities/resolver.py +9 -14
- {metaspn_entities-0.1.0 → metaspn_entities-0.1.4}/metaspn_entities.egg-info/PKG-INFO +45 -1
- {metaspn_entities-0.1.0 → metaspn_entities-0.1.4}/metaspn_entities.egg-info/SOURCES.txt +3 -0
- {metaspn_entities-0.1.0 → metaspn_entities-0.1.4}/pyproject.toml +1 -1
- metaspn_entities-0.1.4/tests/test_adapter.py +136 -0
- metaspn_entities-0.1.4/tests/test_event_contract.py +98 -0
- metaspn_entities-0.1.0/README.md +0 -32
- metaspn_entities-0.1.0/metaspn_entities/events.py +0 -49
- {metaspn_entities-0.1.0 → metaspn_entities-0.1.4}/LICENSE +0 -0
- {metaspn_entities-0.1.0 → metaspn_entities-0.1.4}/metaspn_entities/models.py +0 -0
- {metaspn_entities-0.1.0 → metaspn_entities-0.1.4}/metaspn_entities/normalize.py +0 -0
- {metaspn_entities-0.1.0 → metaspn_entities-0.1.4}/metaspn_entities/sqlite_backend.py +0 -0
- {metaspn_entities-0.1.0 → metaspn_entities-0.1.4}/metaspn_entities.egg-info/dependency_links.txt +0 -0
- {metaspn_entities-0.1.0 → metaspn_entities-0.1.4}/metaspn_entities.egg-info/requires.txt +0 -0
- {metaspn_entities-0.1.0 → metaspn_entities-0.1.4}/metaspn_entities.egg-info/top_level.txt +0 -0
- {metaspn_entities-0.1.0 → metaspn_entities-0.1.4}/setup.cfg +0 -0
- {metaspn_entities-0.1.0 → metaspn_entities-0.1.4}/tests/test_resolver.py +0 -0
{metaspn_entities-0.1.0 → metaspn_entities-0.1.4}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: metaspn-entities
-Version: 0.1.0
+Version: 0.1.4
 Summary: Canonical entity resolution, aliasing, and merges for MetaSPN systems
 Author: MetaSPN Contributors
 License-Expression: MIT

@@ -55,3 +55,47 @@ print(resolution.entity_id, resolution.confidence)
 - `undo_merge(from_entity_id, to_entity_id, ...)` (implemented as reverse merge with redirect correction)
 - `drain_events() -> list[EmittedEvent]`
 - `export_snapshot(output_path)` to inspect SQLite state as JSON
+
+## Event Contract Guarantees
+
+`drain_events()` returns `EmittedEvent` objects whose `event_type` and `payload` are
+schema-compatible with `metaspn-schemas` entity events.
+
+- `EntityResolved` payload keys:
+  - `entity_id`, `resolver`, `resolved_at`, `confidence`, `schema_version`
+- `EntityMerged` payload keys:
+  - `entity_id`, `merged_from`, `merged_at`, `reason`, `schema_version`
+- `EntityAliasAdded` payload keys:
+  - `entity_id`, `alias`, `alias_type`, `added_at`, `schema_version`
+
+Datetime fields are emitted as UTC ISO-8601 strings for deterministic serialization.
+
+## M0 Ingestion Adapter
+
+For worker/runtime integration, use `resolve_normalized_social_signal(...)` with a
+normalized signal envelope.
+
+```python
+from metaspn_entities import EntityResolver, resolve_normalized_social_signal
+
+resolver = EntityResolver()
+signal = {
+    "source": "social.ingest",
+    "payload_type": "SocialPostSeen",
+    "payload": {
+        "platform": "twitter",
+        "author_handle": "@some_handle",
+        "profile_url": "https://example.com/profiles/some-handle",
+    },
+}
+
+result = resolve_normalized_social_signal(resolver, signal)
+print(result.entity_id, result.confidence)
+for event in result.emitted_events:
+    print(event.event_type, event.payload)
+```
+
+Adapter behavior:
+- Extracts deterministic identifier candidates from normalized payloads.
+- Resolves a primary identifier, then adds remaining identifiers as aliases.
+- Returns only events produced during the adapter call.
metaspn_entities-0.1.4/README.md

@@ -0,0 +1,76 @@
+# metaspn-entities
+
+Identity layer for MetaSPN systems.
+
+## Features
+
+- Canonical entity IDs
+- Deterministic identifier normalization + alias resolution
+- Merge history and reversible soft undo
+- SQLite backend using stdlib `sqlite3`
+- Event emission payloads for `EntityResolved`, `EntityMerged`, `EntityAliasAdded`
+- Optional filesystem snapshot export
+
+## Quick usage
+
+```python
+from metaspn_entities import EntityResolver
+
+resolver = EntityResolver()
+resolution = resolver.resolve("twitter_handle", "@some_handle")
+events = resolver.drain_events()
+print(resolution.entity_id, resolution.confidence)
+```
+
+## API notes
+
+- `resolve(identifier_type, value, context=None) -> EntityResolution`
+- `add_alias(entity_id, identifier_type, value, ...)`
+- `merge_entities(from_entity_id, to_entity_id, reason, ...)`
+- `undo_merge(from_entity_id, to_entity_id, ...)` (implemented as reverse merge with redirect correction)
+- `drain_events() -> list[EmittedEvent]`
+- `export_snapshot(output_path)` to inspect SQLite state as JSON
+
+## Event Contract Guarantees
+
+`drain_events()` returns `EmittedEvent` objects whose `event_type` and `payload` are
+schema-compatible with `metaspn-schemas` entity events.
+
+- `EntityResolved` payload keys:
+  - `entity_id`, `resolver`, `resolved_at`, `confidence`, `schema_version`
+- `EntityMerged` payload keys:
+  - `entity_id`, `merged_from`, `merged_at`, `reason`, `schema_version`
+- `EntityAliasAdded` payload keys:
+  - `entity_id`, `alias`, `alias_type`, `added_at`, `schema_version`
+
+Datetime fields are emitted as UTC ISO-8601 strings for deterministic serialization.
+
+## M0 Ingestion Adapter
+
+For worker/runtime integration, use `resolve_normalized_social_signal(...)` with a
+normalized signal envelope.
+
+```python
+from metaspn_entities import EntityResolver, resolve_normalized_social_signal
+
+resolver = EntityResolver()
+signal = {
+    "source": "social.ingest",
+    "payload_type": "SocialPostSeen",
+    "payload": {
+        "platform": "twitter",
+        "author_handle": "@some_handle",
+        "profile_url": "https://example.com/profiles/some-handle",
+    },
+}
+
+result = resolve_normalized_social_signal(resolver, signal)
+print(result.entity_id, result.confidence)
+for event in result.emitted_events:
+    print(event.event_type, event.payload)
+```
+
+Adapter behavior:
+- Extracts deterministic identifier candidates from normalized payloads.
+- Resolves a primary identifier, then adds remaining identifiers as aliases.
+- Returns only events produced during the adapter call.
{metaspn_entities-0.1.0 → metaspn_entities-0.1.4}/metaspn_entities/__init__.py

@@ -1,9 +1,12 @@
+from .adapter import SignalResolutionResult, resolve_normalized_social_signal
 from .events import EmittedEvent
 from .models import EntityResolution
 from .resolver import EntityResolver
 from .sqlite_backend import SQLiteEntityStore
 
 __all__ = [
+    "resolve_normalized_social_signal",
+    "SignalResolutionResult",
     "EntityResolver",
     "EntityResolution",
     "EmittedEvent",
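The re-exports above make the new adapter reachable from the package root. A minimal sketch of the resulting import surface, assuming metaspn-entities 0.1.4 is installed; the envelope values are illustrative:

```python
# Sketch only: exercises the names newly exported from metaspn_entities/__init__.py.
from metaspn_entities import (
    EntityResolver,
    SignalResolutionResult,
    resolve_normalized_social_signal,
)

resolver = EntityResolver()
result: SignalResolutionResult = resolve_normalized_social_signal(
    resolver,
    {
        "source": "social.ingest",
        "payload_type": "SocialPostSeen",
        "payload": {"platform": "twitter", "author_handle": "@demo_handle"},
    },
)
print(result.entity_id, result.confidence)
```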
metaspn_entities-0.1.4/metaspn_entities/adapter.py

@@ -0,0 +1,131 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple
+
+from .events import EmittedEvent
+from .models import EntityType
+from .resolver import EntityResolver
+
+
+@dataclass(frozen=True)
+class SignalResolutionResult:
+    entity_id: str
+    confidence: float
+    emitted_events: List[EmittedEvent]
+
+
+def resolve_normalized_social_signal(
+    resolver: EntityResolver,
+    signal_envelope: Mapping[str, Any] | Any,
+    *,
+    default_entity_type: str = EntityType.PERSON,
+    caused_by: str = "m0-ingestion",
+) -> SignalResolutionResult:
+    """Resolve a normalized social signal envelope into a canonical entity.
+
+    The adapter is intentionally deterministic:
+    - Identifier extraction order is fixed.
+    - Primary resolution always uses the highest-priority available identifier.
+    - Remaining identifiers are added as aliases in deterministic order.
+    """
+
+    # Keep adapter call output scoped to actions taken in this invocation only.
+    resolver.drain_events()
+
+    envelope = _coerce_envelope(signal_envelope)
+    payload = _coerce_payload(envelope.get("payload"))
+    source = str(envelope.get("source") or "unknown-source")
+
+    identifiers = _extract_identifiers(payload)
+    if not identifiers:
+        raise ValueError("No resolvable identifiers found in normalized social signal payload")
+
+    primary_type, primary_value, primary_confidence = identifiers[0]
+    resolution = resolver.resolve(
+        primary_type,
+        primary_value,
+        context={
+            "confidence": primary_confidence,
+            "entity_type": default_entity_type,
+            "caused_by": caused_by,
+            "provenance": source,
+        },
+    )
+
+    for alias_type, alias_value, alias_confidence in identifiers[1:]:
+        resolver.add_alias(
+            resolution.entity_id,
+            alias_type,
+            alias_value,
+            confidence=alias_confidence,
+            caused_by=caused_by,
+            provenance=source,
+        )
+
+    emitted = resolver.drain_events()
+    return SignalResolutionResult(
+        entity_id=resolution.entity_id,
+        confidence=resolution.confidence,
+        emitted_events=emitted,
+    )
+
+
+def _coerce_envelope(signal_envelope: Mapping[str, Any] | Any) -> Dict[str, Any]:
+    if isinstance(signal_envelope, Mapping):
+        return dict(signal_envelope)
+    if hasattr(signal_envelope, "to_dict") and callable(signal_envelope.to_dict):
+        return dict(signal_envelope.to_dict())
+    raise TypeError("signal_envelope must be a mapping or provide to_dict()")
+
+
+def _coerce_payload(payload: Any) -> Dict[str, Any]:
+    if payload is None:
+        return {}
+    if isinstance(payload, Mapping):
+        return dict(payload)
+    if hasattr(payload, "to_dict") and callable(payload.to_dict):
+        return dict(payload.to_dict())
+    raise TypeError("signal payload must be a mapping or provide to_dict()")
+
+
+def _extract_identifiers(payload: Dict[str, Any]) -> List[Tuple[str, str, float]]:
+    platform = str(payload.get("platform") or "").strip().lower()
+
+    candidates: List[Tuple[int, str, str, float]] = []
+
+    # Highest confidence identifiers first.
+    if isinstance(payload.get("email"), str) and payload["email"].strip():
+        candidates.append((0, "email", payload["email"].strip(), 0.98))
+
+    for key in ("profile_url", "author_url", "canonical_url"):
+        value = payload.get(key)
+        if isinstance(value, str) and value.strip():
+            candidates.append((1, "canonical_url", value.strip(), 0.96))
+            break
+
+    handle = payload.get("author_handle") or payload.get("handle")
+    if isinstance(handle, str) and handle.strip():
+        handle_type = f"{platform}_handle" if platform else "handle"
+        candidates.append((2, handle_type, handle.strip(), 0.93))
+
+    if isinstance(payload.get("domain"), str) and payload["domain"].strip():
+        candidates.append((3, "domain", payload["domain"].strip(), 0.9))
+
+    for key in ("display_name", "name"):
+        value = payload.get(key)
+        if isinstance(value, str) and value.strip():
+            candidates.append((4, "name", value.strip(), 0.7))
+            break
+
+    # Deduplicate by (identifier_type, raw value) while preserving deterministic order.
+    seen: set[Tuple[str, str]] = set()
+    ordered: List[Tuple[str, str, float]] = []
+    for _, id_type, id_value, confidence in sorted(candidates, key=lambda c: (c[0], c[1], c[2])):
+        key = (id_type, id_value)
+        if key in seen:
+            continue
+        seen.add(key)
+        ordered.append((id_type, id_value, confidence))
+
+    return ordered
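As a reading aid for `_extract_identifiers` above: candidates are ranked email (0.98), then profile/author/canonical URL as `canonical_url` (0.96), then handle (0.93), domain (0.9), and display name (0.7). A hedged sketch with invented field values showing which identifier becomes primary and which become aliases:

```python
# Sketch only; the payload values are made up for illustration.
from metaspn_entities import EntityResolver, resolve_normalized_social_signal

payload = {
    "platform": "twitter",
    "email": "alice@example.com",                     # rank 0 -> primary identifier
    "profile_url": "https://example.com/team/alice",  # rank 1 -> "canonical_url" alias
    "author_handle": "@alice",                        # rank 2 -> "twitter_handle" alias
    "display_name": "Alice",                          # rank 4 -> "name" alias
}

result = resolve_normalized_social_signal(
    EntityResolver(),
    {"source": "social.ingest", "payload_type": "SocialPostSeen", "payload": payload},
)
# Expected: the email drives resolution and the remaining identifiers are
# added as aliases in rank order, so emitted_events should contain one
# EntityResolved plus EntityAliasAdded events for the extra identifiers.
print([event.event_type for event in result.emitted_events])
```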
metaspn_entities-0.1.4/metaspn_entities/events.py

@@ -0,0 +1,65 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from typing import Any, Dict
+
+DEFAULT_SCHEMA_VERSION = "0.1"
+try:
+    from metaspn_schemas.core import DEFAULT_SCHEMA_VERSION as _SCHEMA_VERSION
+
+    DEFAULT_SCHEMA_VERSION = _SCHEMA_VERSION
+except Exception:
+    # Keep local behavior deterministic when dependency is not importable in dev sandboxes.
+    pass
+
+
+@dataclass(frozen=True)
+class EmittedEvent:
+    event_type: str
+    payload: Dict[str, Any]
+
+
+class EventFactory:
+    @staticmethod
+    def _now() -> datetime:
+        return datetime.now(timezone.utc).replace(microsecond=0)
+
+    @staticmethod
+    def entity_resolved(entity_id: str, resolver: str, confidence: float) -> EmittedEvent:
+        return EmittedEvent(
+            event_type="EntityResolved",
+            payload={
+                "entity_id": entity_id,
+                "resolver": resolver,
+                "resolved_at": EventFactory._now().isoformat(),
+                "confidence": confidence,
+                "schema_version": DEFAULT_SCHEMA_VERSION,
+            },
+        )
+
+    @staticmethod
+    def entity_merged(entity_id: str, merged_from: tuple[str, ...], reason: str | None = None) -> EmittedEvent:
+        return EmittedEvent(
+            event_type="EntityMerged",
+            payload={
+                "entity_id": entity_id,
+                "merged_from": list(merged_from),
+                "merged_at": EventFactory._now().isoformat(),
+                "reason": reason,
+                "schema_version": DEFAULT_SCHEMA_VERSION,
+            },
+        )
+
+    @staticmethod
+    def entity_alias_added(entity_id: str, alias: str, alias_type: str) -> EmittedEvent:
+        return EmittedEvent(
+            event_type="EntityAliasAdded",
+            payload={
+                "entity_id": entity_id,
+                "alias": alias,
+                "alias_type": alias_type,
+                "added_at": EventFactory._now().isoformat(),
+                "schema_version": DEFAULT_SCHEMA_VERSION,
+            },
+        )
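A quick sanity sketch of the payload contract `EventFactory` produces; the identifier values are placeholders, and `schema_version` comes from `metaspn_schemas.core` when importable, otherwise the local `"0.1"` fallback above:

```python
# Sketch only: checks the EntityResolved payload keys listed in the README diff above.
from metaspn_entities.events import EventFactory

event = EventFactory.entity_resolved("entity-123", "m0-ingestion", 0.93)
assert event.event_type == "EntityResolved"
assert sorted(event.payload) == [
    "confidence", "entity_id", "resolved_at", "resolver", "schema_version",
]
print(event.payload["resolved_at"])  # UTC ISO-8601 string, second precision
```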
{metaspn_entities-0.1.0 → metaspn_entities-0.1.4}/metaspn_entities/resolver.py

@@ -39,12 +39,11 @@ class EntityResolver:
                 created_new_entity=False,
                 matched_identifiers=matched_identifiers,
             )
-            self._event_buffer.append(
-                EventFactory.entity_resolved(entity_id, identifier_type, value, resolution.confidence, False)
-            )
+            self._event_buffer.append(EventFactory.entity_resolved(entity_id, caused_by, resolution.confidence))
             return resolution
 
         entity_id = self.store.create_entity(entity_type)
+        created_entity_id = entity_id
         added, conflicting_entity_id = self.store.add_alias(
             identifier_type=identifier_type,
             normalized_value=normalized,
@@ -58,9 +57,7 @@ class EntityResolver:
             merge_reason = f"auto-merge on {identifier_type}:{normalized}"
             self.store.merge_entities(entity_id, conflicting_entity_id, merge_reason, "auto-merge")
             entity_id = self.store.canonical_entity_id(conflicting_entity_id)
-            self._event_buffer.append(
-                EventFactory.entity_merged(entity_id, conflicting_entity_id, merge_reason, "auto-merge")
-            )
+            self._event_buffer.append(EventFactory.entity_merged(entity_id, (created_entity_id,), merge_reason))
 
         matched_identifiers = list(self.store.iter_identifiers_for_entity(entity_id))
         resolution = EntityResolution(
@@ -70,10 +67,8 @@ class EntityResolver:
             matched_identifiers=matched_identifiers,
         )
         if added:
-            self._event_buffer.append(EventFactory.entity_alias_added(entity_id,
-            self._event_buffer.append(
-                EventFactory.entity_resolved(entity_id, identifier_type, value, resolution.confidence, True)
-            )
+            self._event_buffer.append(EventFactory.entity_alias_added(entity_id, normalized, identifier_type))
+            self._event_buffer.append(EventFactory.entity_resolved(entity_id, caused_by, resolution.confidence))
         return resolution
 
     def add_alias(
@@ -102,7 +97,7 @@ class EntityResolver:
             if identifier_type in AUTO_MERGE_IDENTIFIER_TYPES:
                 reason = f"auto-merge on {identifier_type}:{normalized}"
                 self.store.merge_entities(canonical_entity_id, conflicting_entity_id, reason, "auto-merge")
-                event = EventFactory.entity_merged(
+                event = EventFactory.entity_merged(conflicting_entity_id, (canonical_entity_id,), reason)
                 self._event_buffer.append(event)
                 return [event]
             raise ValueError(
@@ -112,7 +107,7 @@ class EntityResolver:
         if not added:
             return []
 
-        event = EventFactory.entity_alias_added(canonical_entity_id,
+        event = EventFactory.entity_alias_added(canonical_entity_id, normalized, identifier_type)
         self._event_buffer.append(event)
         return [event]
 
@@ -120,7 +115,7 @@ class EntityResolver:
         self.store.ensure_entity(from_entity_id)
         self.store.ensure_entity(to_entity_id)
         self.store.merge_entities(from_entity_id, to_entity_id, reason, caused_by)
-        event = EventFactory.entity_merged(
+        event = EventFactory.entity_merged(self.store.canonical_entity_id(to_entity_id), (from_entity_id,), reason)
        self._event_buffer.append(event)
         return event
 
@@ -130,7 +125,7 @@ class EntityResolver:
         self.store.remove_redirect(from_entity_id)
         self.store.set_entity_status(from_entity_id, EntityStatus.ACTIVE)
         self.store.merge_entities(to_entity_id, from_entity_id, reason, caused_by)
-        event = EventFactory.entity_merged(
+        event = EventFactory.entity_merged(self.store.canonical_entity_id(from_entity_id), (to_entity_id,), reason)
         self._event_buffer.append(event)
         return event
 
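The resolver changes above re-point `EntityMerged` at the surviving entity and carry the absorbed id in `merged_from`. A sketch of the resulting behavior, mirroring the assertions in `tests/test_event_contract.py` further down; the handle values are illustrative:

```python
# Sketch only: demonstrates the new EntityMerged payload shape after merge_entities().
from metaspn_entities import EntityResolver

resolver = EntityResolver()
a = resolver.resolve("twitter_handle", "merge_a")
b = resolver.resolve("twitter_handle", "merge_b")
resolver.drain_events()  # discard the resolution events

resolver.merge_entities(a.entity_id, b.entity_id, reason="dedupe")
event = resolver.drain_events()[-1]
assert event.event_type == "EntityMerged"
assert event.payload["entity_id"] == b.entity_id      # surviving entity
assert event.payload["merged_from"] == [a.entity_id]  # absorbed entity
```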
{metaspn_entities-0.1.0 → metaspn_entities-0.1.4}/metaspn_entities.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: metaspn-entities
-Version: 0.1.0
+Version: 0.1.4
 Summary: Canonical entity resolution, aliasing, and merges for MetaSPN systems
 Author: MetaSPN Contributors
 License-Expression: MIT

@@ -55,3 +55,47 @@ print(resolution.entity_id, resolution.confidence)
 - `undo_merge(from_entity_id, to_entity_id, ...)` (implemented as reverse merge with redirect correction)
 - `drain_events() -> list[EmittedEvent]`
 - `export_snapshot(output_path)` to inspect SQLite state as JSON
+
+## Event Contract Guarantees
+
+`drain_events()` returns `EmittedEvent` objects whose `event_type` and `payload` are
+schema-compatible with `metaspn-schemas` entity events.
+
+- `EntityResolved` payload keys:
+  - `entity_id`, `resolver`, `resolved_at`, `confidence`, `schema_version`
+- `EntityMerged` payload keys:
+  - `entity_id`, `merged_from`, `merged_at`, `reason`, `schema_version`
+- `EntityAliasAdded` payload keys:
+  - `entity_id`, `alias`, `alias_type`, `added_at`, `schema_version`
+
+Datetime fields are emitted as UTC ISO-8601 strings for deterministic serialization.
+
+## M0 Ingestion Adapter
+
+For worker/runtime integration, use `resolve_normalized_social_signal(...)` with a
+normalized signal envelope.
+
+```python
+from metaspn_entities import EntityResolver, resolve_normalized_social_signal
+
+resolver = EntityResolver()
+signal = {
+    "source": "social.ingest",
+    "payload_type": "SocialPostSeen",
+    "payload": {
+        "platform": "twitter",
+        "author_handle": "@some_handle",
+        "profile_url": "https://example.com/profiles/some-handle",
+    },
+}
+
+result = resolve_normalized_social_signal(resolver, signal)
+print(result.entity_id, result.confidence)
+for event in result.emitted_events:
+    print(event.event_type, event.payload)
+```
+
+Adapter behavior:
+- Extracts deterministic identifier candidates from normalized payloads.
+- Resolves a primary identifier, then adds remaining identifiers as aliases.
+- Returns only events produced during the adapter call.
{metaspn_entities-0.1.0 → metaspn_entities-0.1.4}/metaspn_entities.egg-info/SOURCES.txt

@@ -2,6 +2,7 @@ LICENSE
 README.md
 pyproject.toml
 metaspn_entities/__init__.py
+metaspn_entities/adapter.py
 metaspn_entities/events.py
 metaspn_entities/models.py
 metaspn_entities/normalize.py
@@ -12,4 +13,6 @@ metaspn_entities.egg-info/SOURCES.txt
 metaspn_entities.egg-info/dependency_links.txt
 metaspn_entities.egg-info/requires.txt
 metaspn_entities.egg-info/top_level.txt
+tests/test_adapter.py
+tests/test_event_contract.py
 tests/test_resolver.py
metaspn_entities-0.1.4/tests/test_adapter.py

@@ -0,0 +1,136 @@
+import importlib
+import sys
+import tempfile
+import unittest
+from pathlib import Path
+
+from metaspn_entities import SQLiteEntityStore
+from metaspn_entities.adapter import resolve_normalized_social_signal
+from metaspn_entities.resolver import EntityResolver
+
+
+class AdapterTests(unittest.TestCase):
+    def setUp(self) -> None:
+        self.tempdir = tempfile.TemporaryDirectory()
+        self.db_path = str(Path(self.tempdir.name) / "entities.db")
+        self.store = SQLiteEntityStore(self.db_path)
+        self.resolver = EntityResolver(self.store)
+
+    def tearDown(self) -> None:
+        self.store.close()
+        self.tempdir.cleanup()
+
+    def test_same_author_over_multiple_posts(self) -> None:
+        first_signal = {
+            "source": "social.ingest",
+            "payload_type": "SocialPostSeen",
+            "payload": {
+                "platform": "twitter",
+                "post_id": "p1",
+                "author_handle": "@same_author",
+                "url": "https://x.com/same_author/status/1",
+            },
+        }
+        second_signal = {
+            "source": "social.ingest",
+            "payload_type": "SocialPostSeen",
+            "payload": {
+                "platform": "twitter",
+                "post_id": "p2",
+                "author_handle": "same_author",
+                "url": "https://x.com/same_author/status/2",
+            },
+        }
+
+        first = resolve_normalized_social_signal(self.resolver, first_signal)
+        second = resolve_normalized_social_signal(self.resolver, second_signal)
+
+        self.assertEqual(first.entity_id, second.entity_id)
+
+    def test_cross_platform_identifier_normalization(self) -> None:
+        twitter_signal = {
+            "source": "social.ingest",
+            "payload_type": "SocialPostSeen",
+            "payload": {
+                "platform": "twitter",
+                "post_id": "p100",
+                "author_handle": "alice",
+                "profile_url": "https://example.com/team/alice/",
+            },
+        }
+        linkedin_signal = {
+            "source": "social.ingest",
+            "payload_type": "ProfileSnapshotSeen",
+            "payload": {
+                "platform": "linkedin",
+                "profile_id": "li-77",
+                "handle": "alice-smith",
+                "profile_url": "http://www.example.com/team/alice",
+            },
+        }
+
+        first = resolve_normalized_social_signal(self.resolver, twitter_signal)
+        second = resolve_normalized_social_signal(self.resolver, linkedin_signal)
+
+        self.assertEqual(first.entity_id, second.entity_id)
+
+    def test_idempotent_rerun_behavior(self) -> None:
+        signal = {
+            "source": "social.ingest",
+            "payload_type": "SocialPostSeen",
+            "payload": {
+                "platform": "twitter",
+                "post_id": "idempotent-1",
+                "author_handle": "rerun_author",
+                "profile_url": "https://example.org/rerun_author",
+            },
+        }
+
+        first = resolve_normalized_social_signal(self.resolver, signal)
+        second = resolve_normalized_social_signal(self.resolver, signal)
+
+        self.assertEqual(first.entity_id, second.entity_id)
+        self.assertEqual(
+            [event.event_type for event in second.emitted_events],
+            ["EntityResolved"],
+        )
+
+    def test_emitted_events_parse_with_metaspn_schemas(self) -> None:
+        entities_module = None
+        try:
+            entities_module = importlib.import_module("metaspn_schemas.entities")
+        except Exception:
+            sibling_src = Path(__file__).resolve().parents[2] / "metaspn-schemas" / "src"
+            if sibling_src.exists():
+                sys.path.insert(0, str(sibling_src))
+                entities_module = importlib.import_module("metaspn_schemas.entities")
+
+        if entities_module is None:
+            self.skipTest("metaspn_schemas is unavailable")
+
+        signal = {
+            "source": "social.ingest",
+            "payload_type": "SocialPostSeen",
+            "payload": {
+                "platform": "twitter",
+                "post_id": "schema-1",
+                "author_handle": "schema_user",
+                "profile_url": "https://example.net/schema_user",
+            },
+        }
+
+        result = resolve_normalized_social_signal(self.resolver, signal)
+        for event in result.emitted_events:
+            if event.event_type == "EntityResolved":
+                parsed = entities_module.EntityResolved.from_dict(event.payload)
+                self.assertEqual(parsed.entity_id, result.entity_id)
+            elif event.event_type == "EntityAliasAdded":
+                parsed = entities_module.EntityAliasAdded.from_dict(event.payload)
+                self.assertTrue(parsed.alias)
+            elif event.event_type == "EntityMerged":
+                parsed = entities_module.EntityMerged.from_dict(event.payload)
+                self.assertTrue(parsed.merged_from)
+
+
+if __name__ == "__main__":
+    unittest.main()
metaspn_entities-0.1.4/tests/test_event_contract.py

@@ -0,0 +1,98 @@
+import importlib
+import sys
+import tempfile
+import unittest
+from datetime import datetime
+from pathlib import Path
+
+from metaspn_entities import EntityResolver, SQLiteEntityStore
+
+
+class EventContractTests(unittest.TestCase):
+    def setUp(self) -> None:
+        self.tempdir = tempfile.TemporaryDirectory()
+        self.db_path = str(Path(self.tempdir.name) / "entities.db")
+        self.store = SQLiteEntityStore(self.db_path)
+        self.resolver = EntityResolver(self.store)
+
+    def tearDown(self) -> None:
+        self.store.close()
+        self.tempdir.cleanup()
+
+    def test_entity_resolved_payload_shape(self) -> None:
+        self.resolver.resolve("twitter_handle", "contract_user")
+        event = self.resolver.drain_events()[-1]
+
+        self.assertEqual(event.event_type, "EntityResolved")
+        self.assertEqual(
+            sorted(event.payload.keys()),
+            ["confidence", "entity_id", "resolved_at", "resolver", "schema_version"],
+        )
+        self.assertIsInstance(event.payload["resolver"], str)
+        self.assertGreaterEqual(float(event.payload["confidence"]), 0.0)
+        datetime.fromisoformat(event.payload["resolved_at"])
+
+    def test_entity_merged_payload_shape(self) -> None:
+        a = self.resolver.resolve("twitter_handle", "merge_a")
+        b = self.resolver.resolve("twitter_handle", "merge_b")
+        self.resolver.drain_events()
+
+        self.resolver.merge_entities(a.entity_id, b.entity_id, reason="dedupe")
+        event = self.resolver.drain_events()[-1]
+
+        self.assertEqual(event.event_type, "EntityMerged")
+        self.assertEqual(
+            sorted(event.payload.keys()),
+            ["entity_id", "merged_at", "merged_from", "reason", "schema_version"],
+        )
+        self.assertEqual(event.payload["entity_id"], b.entity_id)
+        self.assertEqual(event.payload["merged_from"], [a.entity_id])
+        datetime.fromisoformat(event.payload["merged_at"])
+
+    def test_entity_alias_added_payload_shape(self) -> None:
+        created = self.resolver.resolve("twitter_handle", "alias_contract")
+        self.resolver.drain_events()
+
+        events = self.resolver.add_alias(created.entity_id, "email", "alias@example.com")
+        self.assertEqual(len(events), 1)
+        event = events[0]
+
+        self.assertEqual(event.event_type, "EntityAliasAdded")
+        self.assertEqual(
+            sorted(event.payload.keys()),
+            ["added_at", "alias", "alias_type", "entity_id", "schema_version"],
+        )
+        self.assertEqual(event.payload["entity_id"], created.entity_id)
+        self.assertEqual(event.payload["alias_type"], "email")
+        self.assertEqual(event.payload["alias"], "alias@example.com")
+        datetime.fromisoformat(event.payload["added_at"])
+
+    def test_schema_round_trip_when_metaspn_schemas_is_available(self) -> None:
+        # Try import from installed package first, then from sibling checkout if present.
+        entities_module = None
+        try:
+            entities_module = importlib.import_module("metaspn_schemas.entities")
+        except Exception:
+            sibling_src = Path(__file__).resolve().parents[2] / "metaspn-schemas" / "src"
+            if sibling_src.exists():
+                sys.path.insert(0, str(sibling_src))
+                entities_module = importlib.import_module("metaspn_schemas.entities")
+
+        if entities_module is None:
+            self.skipTest("metaspn_schemas is unavailable")
+
+        self.resolver.resolve("twitter_handle", "roundtrip_user")
+        resolved_event = self.resolver.drain_events()[-1]
+
+        entity_resolved = entities_module.EntityResolved.from_dict(resolved_event.payload)
+        self.assertEqual(entity_resolved.entity_id, resolved_event.payload["entity_id"])
+
+        created = self.resolver.resolve("twitter_handle", "roundtrip_alias")
+        self.resolver.drain_events()
+        alias_event = self.resolver.add_alias(created.entity_id, "email", "rt@example.com")[0]
+        entity_alias = entities_module.EntityAliasAdded.from_dict(alias_event.payload)
+        self.assertEqual(entity_alias.alias, "rt@example.com")
+
+
+if __name__ == "__main__":
+    unittest.main()
metaspn_entities-0.1.0/README.md DELETED

@@ -1,32 +0,0 @@
-# metaspn-entities
-
-Identity layer for MetaSPN systems.
-
-## Features
-
-- Canonical entity IDs
-- Deterministic identifier normalization + alias resolution
-- Merge history and reversible soft undo
-- SQLite backend using stdlib `sqlite3`
-- Event emission payloads for `EntityResolved`, `EntityMerged`, `EntityAliasAdded`
-- Optional filesystem snapshot export
-
-## Quick usage
-
-```python
-from metaspn_entities import EntityResolver
-
-resolver = EntityResolver()
-resolution = resolver.resolve("twitter_handle", "@some_handle")
-events = resolver.drain_events()
-print(resolution.entity_id, resolution.confidence)
-```
-
-## API notes
-
-- `resolve(identifier_type, value, context=None) -> EntityResolution`
-- `add_alias(entity_id, identifier_type, value, ...)`
-- `merge_entities(from_entity_id, to_entity_id, reason, ...)`
-- `undo_merge(from_entity_id, to_entity_id, ...)` (implemented as reverse merge with redirect correction)
-- `drain_events() -> list[EmittedEvent]`
-- `export_snapshot(output_path)` to inspect SQLite state as JSON
metaspn_entities-0.1.0/metaspn_entities/events.py DELETED

@@ -1,49 +0,0 @@
-from __future__ import annotations
-
-from dataclasses import dataclass
-from typing import Any, Dict
-
-
-@dataclass(frozen=True)
-class EmittedEvent:
-    event_type: str
-    payload: Dict[str, Any]
-
-
-class EventFactory:
-    @staticmethod
-    def entity_resolved(entity_id: str, identifier_type: str, value: str, confidence: float, created_new_entity: bool) -> EmittedEvent:
-        return EmittedEvent(
-            event_type="EntityResolved",
-            payload={
-                "entity_id": entity_id,
-                "identifier_type": identifier_type,
-                "value": value,
-                "confidence": confidence,
-                "created_new_entity": created_new_entity,
-            },
-        )
-
-    @staticmethod
-    def entity_merged(from_entity_id: str, to_entity_id: str, reason: str, caused_by: str) -> EmittedEvent:
-        return EmittedEvent(
-            event_type="EntityMerged",
-            payload={
-                "from_entity_id": from_entity_id,
-                "to_entity_id": to_entity_id,
-                "reason": reason,
-                "caused_by": caused_by,
-            },
-        )
-
-    @staticmethod
-    def entity_alias_added(entity_id: str, identifier_type: str, normalized_value: str, caused_by: str) -> EmittedEvent:
-        return EmittedEvent(
-            event_type="EntityAliasAdded",
-            payload={
-                "entity_id": entity_id,
-                "identifier_type": identifier_type,
-                "normalized_value": normalized_value,
-                "caused_by": caused_by,
-            },
-        )
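For downstream consumers still reading the 0.1.0 payloads deleted above, the key names map onto the 0.1.4 contract roughly as sketched below. This mapping is an inference from the two `events.py` versions in this diff, not something the package ships.

```python
# Hypothetical migration aid, not part of metaspn-entities itself.
OLD_TO_NEW_PAYLOAD_KEYS = {
    "EntityAliasAdded": {
        "entity_id": "entity_id",
        "identifier_type": "alias_type",
        "normalized_value": "alias",
        # "caused_by" is dropped; "added_at" and "schema_version" are new in 0.1.4.
    },
    "EntityMerged": {
        "to_entity_id": "entity_id",      # surviving entity
        "from_entity_id": "merged_from",  # now a list of absorbed entity ids
        "reason": "reason",
        # "caused_by" is dropped; "merged_at" and "schema_version" are new.
    },
    "EntityResolved": {
        "entity_id": "entity_id",
        "confidence": "confidence",
        # "identifier_type", "value", "created_new_entity" are replaced by
        # "resolver", "resolved_at", "schema_version".
    },
}
```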
The remaining files are renamed from metaspn_entities-0.1.0 to metaspn_entities-0.1.4 without content changes: LICENSE, metaspn_entities/models.py, metaspn_entities/normalize.py, metaspn_entities/sqlite_backend.py, metaspn_entities.egg-info/dependency_links.txt, metaspn_entities.egg-info/requires.txt, metaspn_entities.egg-info/top_level.txt, setup.cfg, and tests/test_resolver.py.