metaspn-entities 0.1.3__tar.gz → 0.1.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {metaspn_entities-0.1.3 → metaspn_entities-0.1.5}/PKG-INFO +40 -1
- {metaspn_entities-0.1.3 → metaspn_entities-0.1.5}/README.md +39 -0
- metaspn_entities-0.1.5/metaspn_entities/__init__.py +17 -0
- metaspn_entities-0.1.5/metaspn_entities/adapter.py +131 -0
- metaspn_entities-0.1.5/metaspn_entities/context.py +68 -0
- {metaspn_entities-0.1.3 → metaspn_entities-0.1.5}/metaspn_entities/resolver.py +25 -0
- {metaspn_entities-0.1.3 → metaspn_entities-0.1.5}/metaspn_entities/sqlite_backend.py +34 -0
- {metaspn_entities-0.1.3 → metaspn_entities-0.1.5}/metaspn_entities.egg-info/PKG-INFO +40 -1
- {metaspn_entities-0.1.3 → metaspn_entities-0.1.5}/metaspn_entities.egg-info/SOURCES.txt +4 -0
- {metaspn_entities-0.1.3 → metaspn_entities-0.1.5}/pyproject.toml +1 -1
- metaspn_entities-0.1.5/tests/test_adapter.py +136 -0
- metaspn_entities-0.1.5/tests/test_context.py +90 -0
- metaspn_entities-0.1.3/metaspn_entities/__init__.py +0 -11
- {metaspn_entities-0.1.3 → metaspn_entities-0.1.5}/LICENSE +0 -0
- {metaspn_entities-0.1.3 → metaspn_entities-0.1.5}/metaspn_entities/events.py +0 -0
- {metaspn_entities-0.1.3 → metaspn_entities-0.1.5}/metaspn_entities/models.py +0 -0
- {metaspn_entities-0.1.3 → metaspn_entities-0.1.5}/metaspn_entities/normalize.py +0 -0
- {metaspn_entities-0.1.3 → metaspn_entities-0.1.5}/metaspn_entities.egg-info/dependency_links.txt +0 -0
- {metaspn_entities-0.1.3 → metaspn_entities-0.1.5}/metaspn_entities.egg-info/requires.txt +0 -0
- {metaspn_entities-0.1.3 → metaspn_entities-0.1.5}/metaspn_entities.egg-info/top_level.txt +0 -0
- {metaspn_entities-0.1.3 → metaspn_entities-0.1.5}/setup.cfg +0 -0
- {metaspn_entities-0.1.3 → metaspn_entities-0.1.5}/tests/test_event_contract.py +0 -0
- {metaspn_entities-0.1.3 → metaspn_entities-0.1.5}/tests/test_resolver.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: metaspn-entities
|
|
3
|
-
Version: 0.1.3
|
|
3
|
+
Version: 0.1.5
|
|
4
4
|
Summary: Canonical entity resolution, aliasing, and merges for MetaSPN systems
|
|
5
5
|
Author: MetaSPN Contributors
|
|
6
6
|
License-Expression: MIT
|
|
@@ -69,3 +69,42 @@ schema-compatible with `metaspn-schemas` entity events.
|
|
|
69
69
|
- `entity_id`, `alias`, `alias_type`, `added_at`, `schema_version`
|
|
70
70
|
|
|
71
71
|
Datetime fields are emitted as UTC ISO-8601 strings for deterministic serialization.
|
|
72
|
+
|
|
73
|
+
## M0 Ingestion Adapter
|
|
74
|
+
|
|
75
|
+
For worker/runtime integration, use `resolve_normalized_social_signal(...)` with a
|
|
76
|
+
normalized signal envelope.
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
from metaspn_entities import EntityResolver, resolve_normalized_social_signal
|
|
80
|
+
|
|
81
|
+
resolver = EntityResolver()
|
|
82
|
+
signal = {
|
|
83
|
+
"source": "social.ingest",
|
|
84
|
+
"payload_type": "SocialPostSeen",
|
|
85
|
+
"payload": {
|
|
86
|
+
"platform": "twitter",
|
|
87
|
+
"author_handle": "@some_handle",
|
|
88
|
+
"profile_url": "https://example.com/profiles/some-handle",
|
|
89
|
+
},
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
result = resolve_normalized_social_signal(resolver, signal)
|
|
93
|
+
print(result.entity_id, result.confidence)
|
|
94
|
+
for event in result.emitted_events:
|
|
95
|
+
print(event.event_type, event.payload)
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
Adapter behavior:
|
|
99
|
+
- Extracts deterministic identifier candidates from normalized payloads.
|
|
100
|
+
- Resolves a primary identifier, then adds remaining identifiers as aliases.
|
|
101
|
+
- Returns only events produced during the adapter call.
|
|
102
|
+
|
|
103
|
+
## M1 Context API
|
|
104
|
+
|
|
105
|
+
Profiler/router workers can read consolidated context using:
|
|
106
|
+
|
|
107
|
+
- `resolver.entity_context(entity_id, recent_limit=10)`
|
|
108
|
+
- `resolver.confidence_summary(entity_id)`
|
|
109
|
+
|
|
110
|
+
Both APIs resolve canonical redirects first, so merged IDs return coherent context.
|
|
@@ -44,3 +44,42 @@ schema-compatible with `metaspn-schemas` entity events.
|
|
|
44
44
|
- `entity_id`, `alias`, `alias_type`, `added_at`, `schema_version`
|
|
45
45
|
|
|
46
46
|
Datetime fields are emitted as UTC ISO-8601 strings for deterministic serialization.
|
|
47
|
+
|
|
48
|
+
## M0 Ingestion Adapter
|
|
49
|
+
|
|
50
|
+
For worker/runtime integration, use `resolve_normalized_social_signal(...)` with a
|
|
51
|
+
normalized signal envelope.
|
|
52
|
+
|
|
53
|
+
```python
|
|
54
|
+
from metaspn_entities import EntityResolver, resolve_normalized_social_signal
|
|
55
|
+
|
|
56
|
+
resolver = EntityResolver()
|
|
57
|
+
signal = {
|
|
58
|
+
"source": "social.ingest",
|
|
59
|
+
"payload_type": "SocialPostSeen",
|
|
60
|
+
"payload": {
|
|
61
|
+
"platform": "twitter",
|
|
62
|
+
"author_handle": "@some_handle",
|
|
63
|
+
"profile_url": "https://example.com/profiles/some-handle",
|
|
64
|
+
},
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
result = resolve_normalized_social_signal(resolver, signal)
|
|
68
|
+
print(result.entity_id, result.confidence)
|
|
69
|
+
for event in result.emitted_events:
|
|
70
|
+
print(event.event_type, event.payload)
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
Adapter behavior:
|
|
74
|
+
- Extracts deterministic identifier candidates from normalized payloads.
|
|
75
|
+
- Resolves a primary identifier, then adds remaining identifiers as aliases.
|
|
76
|
+
- Returns only events produced during the adapter call.
|
|
77
|
+
|
|
78
|
+
## M1 Context API
|
|
79
|
+
|
|
80
|
+
Profiler/router workers can read consolidated context using:
|
|
81
|
+
|
|
82
|
+
- `resolver.entity_context(entity_id, recent_limit=10)`
|
|
83
|
+
- `resolver.confidence_summary(entity_id)`
|
|
84
|
+
|
|
85
|
+
Both APIs resolve canonical redirects first, so merged IDs return coherent context.
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from .adapter import SignalResolutionResult, resolve_normalized_social_signal
|
|
2
|
+
from .context import EntityContext, build_confidence_summary
|
|
3
|
+
from .events import EmittedEvent
|
|
4
|
+
from .models import EntityResolution
|
|
5
|
+
from .resolver import EntityResolver
|
|
6
|
+
from .sqlite_backend import SQLiteEntityStore
|
|
7
|
+
|
|
8
|
+
__all__ = [
|
|
9
|
+
"resolve_normalized_social_signal",
|
|
10
|
+
"SignalResolutionResult",
|
|
11
|
+
"EntityContext",
|
|
12
|
+
"build_confidence_summary",
|
|
13
|
+
"EntityResolver",
|
|
14
|
+
"EntityResolution",
|
|
15
|
+
"EmittedEvent",
|
|
16
|
+
"SQLiteEntityStore",
|
|
17
|
+
]
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple
|
|
5
|
+
|
|
6
|
+
from .events import EmittedEvent
|
|
7
|
+
from .models import EntityType
|
|
8
|
+
from .resolver import EntityResolver
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass(frozen=True)
class SignalResolutionResult:
    """Immutable outcome of one ``resolve_normalized_social_signal`` call."""

    # Entity id taken from the resolution of the primary identifier.
    entity_id: str
    # Confidence taken from that same primary resolution.
    confidence: float
    # Events drained from the resolver once the adapter call has finished.
    emitted_events: List[EmittedEvent]
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def resolve_normalized_social_signal(
    resolver: EntityResolver,
    signal_envelope: Mapping[str, Any] | Any,
    *,
    default_entity_type: str = EntityType.PERSON,
    caused_by: str = "m0-ingestion",
) -> SignalResolutionResult:
    """Resolve a normalized social signal envelope into a canonical entity.

    The adapter is intentionally deterministic:
    - Identifier extraction order is fixed.
    - Primary resolution always uses the highest-priority available identifier.
    - Remaining identifiers are added as aliases in deterministic order.

    Args:
        resolver: Resolver used for the primary resolution and alias additions.
        signal_envelope: Mapping (or object exposing ``to_dict()``) whose
            ``"payload"`` entry carries the identifiers and whose ``"source"``
            entry is recorded as provenance (``"unknown-source"`` if missing/empty).
        default_entity_type: Passed as ``entity_type`` in the resolve context.
        caused_by: Passed through to the resolver for the resolution and each alias.

    Returns:
        The entity id and confidence of the primary resolution together with
        the events drained from the resolver after this call's work.

    Raises:
        TypeError: If the envelope or its payload is neither a mapping nor
            provides ``to_dict()``.
        ValueError: If the payload yields no usable identifier.
    """
    # Validate the input before touching resolver state, so a rejected signal
    # leaves any events already buffered on the resolver untouched.
    envelope = _coerce_envelope(signal_envelope)
    payload = _coerce_payload(envelope.get("payload"))
    identifiers = _extract_identifiers(payload)
    if not identifiers:
        raise ValueError("No resolvable identifiers found in normalized social signal payload")
    source = str(envelope.get("source") or "unknown-source")

    # Keep adapter call output scoped to actions taken in this invocation only.
    # NOTE(review): the drained events are dropped here; callers that still need
    # previously buffered events presumably have to drain them before calling.
    resolver.drain_events()

    primary_type, primary_value, primary_confidence = identifiers[0]
    resolution = resolver.resolve(
        primary_type,
        primary_value,
        context={
            "confidence": primary_confidence,
            "entity_type": default_entity_type,
            "caused_by": caused_by,
            "provenance": source,
        },
    )

    # Every remaining identifier is attached to the resolved entity as an alias.
    for alias_type, alias_value, alias_confidence in identifiers[1:]:
        resolver.add_alias(
            resolution.entity_id,
            alias_type,
            alias_value,
            confidence=alias_confidence,
            caused_by=caused_by,
            provenance=source,
        )

    emitted = resolver.drain_events()
    return SignalResolutionResult(
        entity_id=resolution.entity_id,
        confidence=resolution.confidence,
        emitted_events=emitted,
    )
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _coerce_envelope(signal_envelope: Mapping[str, Any] | Any) -> Dict[str, Any]:
|
|
75
|
+
if isinstance(signal_envelope, Mapping):
|
|
76
|
+
return dict(signal_envelope)
|
|
77
|
+
if hasattr(signal_envelope, "to_dict") and callable(signal_envelope.to_dict):
|
|
78
|
+
return dict(signal_envelope.to_dict())
|
|
79
|
+
raise TypeError("signal_envelope must be a mapping or provide to_dict()")
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _coerce_payload(payload: Any) -> Dict[str, Any]:
|
|
83
|
+
if payload is None:
|
|
84
|
+
return {}
|
|
85
|
+
if isinstance(payload, Mapping):
|
|
86
|
+
return dict(payload)
|
|
87
|
+
if hasattr(payload, "to_dict") and callable(payload.to_dict):
|
|
88
|
+
return dict(payload.to_dict())
|
|
89
|
+
raise TypeError("signal payload must be a mapping or provide to_dict()")
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _extract_identifiers(payload: Dict[str, Any]) -> List[Tuple[str, str, float]]:
|
|
93
|
+
platform = str(payload.get("platform") or "").strip().lower()
|
|
94
|
+
|
|
95
|
+
candidates: List[Tuple[int, str, str, float]] = []
|
|
96
|
+
|
|
97
|
+
# Highest confidence identifiers first.
|
|
98
|
+
if isinstance(payload.get("email"), str) and payload["email"].strip():
|
|
99
|
+
candidates.append((0, "email", payload["email"].strip(), 0.98))
|
|
100
|
+
|
|
101
|
+
for key in ("profile_url", "author_url", "canonical_url"):
|
|
102
|
+
value = payload.get(key)
|
|
103
|
+
if isinstance(value, str) and value.strip():
|
|
104
|
+
candidates.append((1, "canonical_url", value.strip(), 0.96))
|
|
105
|
+
break
|
|
106
|
+
|
|
107
|
+
handle = payload.get("author_handle") or payload.get("handle")
|
|
108
|
+
if isinstance(handle, str) and handle.strip():
|
|
109
|
+
handle_type = f"{platform}_handle" if platform else "handle"
|
|
110
|
+
candidates.append((2, handle_type, handle.strip(), 0.93))
|
|
111
|
+
|
|
112
|
+
if isinstance(payload.get("domain"), str) and payload["domain"].strip():
|
|
113
|
+
candidates.append((3, "domain", payload["domain"].strip(), 0.9))
|
|
114
|
+
|
|
115
|
+
for key in ("display_name", "name"):
|
|
116
|
+
value = payload.get(key)
|
|
117
|
+
if isinstance(value, str) and value.strip():
|
|
118
|
+
candidates.append((4, "name", value.strip(), 0.7))
|
|
119
|
+
break
|
|
120
|
+
|
|
121
|
+
# Deduplicate by (identifier_type, raw value) while preserving deterministic order.
|
|
122
|
+
seen: set[Tuple[str, str]] = set()
|
|
123
|
+
ordered: List[Tuple[str, str, float]] = []
|
|
124
|
+
for _, id_type, id_value, confidence in sorted(candidates, key=lambda c: (c[0], c[1], c[2])):
|
|
125
|
+
key = (id_type, id_value)
|
|
126
|
+
if key in seen:
|
|
127
|
+
continue
|
|
128
|
+
seen.add(key)
|
|
129
|
+
ordered.append((id_type, id_value, confidence))
|
|
130
|
+
|
|
131
|
+
return ordered
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import Any, Dict, List
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass(frozen=True)
class EntityContext:
    """Immutable, consolidated read view of a single entity."""

    # Id of the entity this context describes.
    entity_id: str
    # Alias records attached to the entity.
    aliases: List[Dict[str, Any]]
    # Identifier records attached to the entity.
    identifiers: List[Dict[str, Any]]
    # Subset of records treated as the most recent evidence.
    recent_evidence: List[Dict[str, Any]]
    # Aggregate confidence figures for the entity.
    confidence_summary: Dict[str, Any]
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def build_confidence_summary(
    aliases: List[Dict[str, Any]],
    identifiers: List[Dict[str, Any]],
    evidence: List[Dict[str, Any]],
) -> Dict[str, Any]:
    """Aggregate alias, identifier and evidence records into confidence figures.

    The overall score is a weighted sum, capped at 1.0, of the mean identifier
    confidence (0.65), the mean alias confidence (0.25) and a source-diversity
    term (0.10) that saturates once three distinct provenance values are seen.

    Args:
        aliases: Records that each carry a ``"confidence"`` entry.
        identifiers: Records that each carry ``"confidence"`` and
            ``"identifier_type"`` entries.
        evidence: Records whose optional ``"provenance"`` entry is counted;
            ``None`` and empty-string values are ignored.

    Returns:
        A dict with the rounded overall/identifier/alias scores, the number of
        distinct sources, the number of evidence records and a per-type rollup.
    """
    # Confidences are sorted before averaging so the float sum does not depend
    # on the order in which records were supplied.
    identifier_avg = _avg(sorted(float(record["confidence"]) for record in identifiers))
    alias_avg = _avg(sorted(float(record["confidence"]) for record in aliases))

    provenances = set()
    for record in evidence:
        origin = record.get("provenance")
        if origin in (None, ""):
            continue
        provenances.add(str(origin))
    unique_sources = len(provenances)

    diversity = min(1.0, unique_sources / 3.0)
    weighted = (0.65 * identifier_avg) + (0.25 * alias_avg) + (0.10 * diversity)

    return {
        "overall_confidence": round(min(1.0, weighted), 6),
        "identifier_confidence_avg": round(identifier_avg, 6),
        "alias_confidence_avg": round(alias_avg, 6),
        "unique_source_count": unique_sources,
        "evidence_count": len(evidence),
        "by_identifier_type": _rollup_by_identifier_type(identifiers),
    }
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _avg(values: List[float]) -> float:
|
|
49
|
+
if not values:
|
|
50
|
+
return 0.0
|
|
51
|
+
return sum(values) / len(values)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _rollup_by_identifier_type(identifiers: List[Dict[str, Any]]) -> Dict[str, Dict[str, float]]:
    """Summarise identifier confidences per identifier type.

    Args:
        identifiers: Records that each carry ``"identifier_type"`` and
            ``"confidence"`` entries.

    Returns:
        A dict keyed by identifier type (inserted in sorted order) whose values
        hold the record ``count`` (as a float) plus the ``avg_confidence`` and
        ``max_confidence`` rounded to six decimals.
    """
    buckets: Dict[str, List[float]] = {}
    for record in identifiers:
        type_name = str(record["identifier_type"])
        if type_name not in buckets:
            buckets[type_name] = []
        buckets[type_name].append(float(record["confidence"]))

    summary: Dict[str, Dict[str, float]] = {}
    for type_name in sorted(buckets):
        # Sorted so the average is summed in a stable order.
        scores = sorted(buckets[type_name])
        stats = {"count": float(len(scores))}
        stats["avg_confidence"] = round(_avg(scores), 6)
        stats["max_confidence"] = round(max(scores), 6)
        summary[type_name] = stats
    return summary
|
|
@@ -2,6 +2,7 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
from typing import Any, Dict, List, Optional
|
|
4
4
|
|
|
5
|
+
from .context import EntityContext, build_confidence_summary
|
|
5
6
|
from .events import EmittedEvent, EventFactory
|
|
6
7
|
from .models import (
|
|
7
8
|
DEFAULT_MATCH_CONFIDENCE,
|
|
@@ -135,6 +136,30 @@ class EntityResolver:
|
|
|
135
136
|
def aliases_for_entity(self, entity_id: str) -> List[Dict[str, Any]]:
    """Return the alias records the backing store lists for ``entity_id``."""
    alias_rows = self.store.list_aliases_for_entity(entity_id)
    return alias_rows
|
|
137
138
|
|
|
139
|
+
def confidence_summary(self, entity_id: str) -> Dict[str, Any]:
    """Build the confidence summary for the canonical form of ``entity_id``.

    The id is first mapped through the store's ``canonical_entity_id``; the
    entity's full identifier list is used both as the identifier set and as
    the evidence set of the summary.
    """
    store = self.store
    resolved_id = store.canonical_entity_id(entity_id)
    alias_rows = store.list_aliases_for_entity(resolved_id)
    identifier_rows = store.list_identifier_records_for_entity(resolved_id)
    return build_confidence_summary(
        aliases=alias_rows,
        identifiers=identifier_rows,
        evidence=identifier_rows,
    )
|
|
144
|
+
|
|
145
|
+
def entity_context(self, entity_id: str, recent_limit: int = 10) -> EntityContext:
    """Assemble aliases, identifiers, recent evidence and confidence for an entity.

    Args:
        entity_id: Any id for the entity; it is mapped through the store's
            ``canonical_entity_id`` before records are read.
        recent_limit: Maximum number of identifier records kept as recent
            evidence; negative values are treated as zero.

    Returns:
        An ``EntityContext`` for the canonical id. Its confidence summary uses
        only the recent-evidence window as the evidence set.
    """
    store = self.store
    resolved_id = store.canonical_entity_id(entity_id)
    alias_rows = store.list_aliases_for_entity(resolved_id)
    identifier_rows = store.list_identifier_records_for_entity(resolved_id)

    def recency_key(row):
        # Compared as strings; type and normalized value break ties.
        return (
            str(row["last_seen_at"]),
            str(row["identifier_type"]),
            str(row["normalized_value"]),
        )

    newest_first = sorted(identifier_rows, key=recency_key, reverse=True)
    window = max(recent_limit, 0)
    recent_rows = newest_first[:window]

    return EntityContext(
        entity_id=resolved_id,
        aliases=alias_rows,
        identifiers=identifier_rows,
        recent_evidence=recent_rows,
        confidence_summary=build_confidence_summary(alias_rows, identifier_rows, recent_rows),
    )
|
|
162
|
+
|
|
138
163
|
def export_snapshot(self, output_path: str) -> None:
    """Delegate snapshot export to the backing store, passing ``output_path`` through."""
    backend = self.store
    backend.export_snapshot(output_path)
|
|
140
165
|
|
|
@@ -276,3 +276,37 @@ class SQLiteEntityStore:
|
|
|
276
276
|
"normalized_value": row["normalized_value"],
|
|
277
277
|
"confidence": row["confidence"],
|
|
278
278
|
}
|
|
279
|
+
|
|
280
|
+
def list_identifier_records_for_entity(self, entity_id: str) -> List[Dict[str, Any]]:
    """Return the identifier records whose alias belongs to the given entity.

    Both ``entity_id`` and each alias row's entity id are mapped through
    ``canonical_entity_id`` before comparison, so rows attached to an id that
    canonicalizes to the same entity are included.

    Returns:
        One dict per matching row with the identifier's type, raw and
        normalized value, confidence, first/last seen timestamps and
        provenance, ordered by identifier type then normalized value.
    """
    wanted_id = self.canonical_entity_id(entity_id)
    # NOTE(review): the query has no WHERE clause, so every alias/identifier
    # pair is fetched and the entity filter is applied row by row below.
    cursor = self.conn.execute(
        """
        SELECT
            a.entity_id,
            i.identifier_type,
            i.value,
            i.normalized_value,
            i.confidence,
            i.first_seen_at,
            i.last_seen_at,
            i.provenance
        FROM aliases a
        JOIN identifiers i
            ON a.identifier_type = i.identifier_type
            AND a.normalized_value = i.normalized_value
        ORDER BY i.identifier_type, i.normalized_value
        """
    )
    exported_columns = (
        "identifier_type",
        "value",
        "normalized_value",
        "confidence",
        "first_seen_at",
        "last_seen_at",
        "provenance",
    )
    records: List[Dict[str, Any]] = []
    for row in cursor.fetchall():
        if self.canonical_entity_id(row["entity_id"]) != wanted_id:
            continue
        records.append({column: row[column] for column in exported_columns})
    return records
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: metaspn-entities
|
|
3
|
-
Version: 0.1.3
|
|
3
|
+
Version: 0.1.5
|
|
4
4
|
Summary: Canonical entity resolution, aliasing, and merges for MetaSPN systems
|
|
5
5
|
Author: MetaSPN Contributors
|
|
6
6
|
License-Expression: MIT
|
|
@@ -69,3 +69,42 @@ schema-compatible with `metaspn-schemas` entity events.
|
|
|
69
69
|
- `entity_id`, `alias`, `alias_type`, `added_at`, `schema_version`
|
|
70
70
|
|
|
71
71
|
Datetime fields are emitted as UTC ISO-8601 strings for deterministic serialization.
|
|
72
|
+
|
|
73
|
+
## M0 Ingestion Adapter
|
|
74
|
+
|
|
75
|
+
For worker/runtime integration, use `resolve_normalized_social_signal(...)` with a
|
|
76
|
+
normalized signal envelope.
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
from metaspn_entities import EntityResolver, resolve_normalized_social_signal
|
|
80
|
+
|
|
81
|
+
resolver = EntityResolver()
|
|
82
|
+
signal = {
|
|
83
|
+
"source": "social.ingest",
|
|
84
|
+
"payload_type": "SocialPostSeen",
|
|
85
|
+
"payload": {
|
|
86
|
+
"platform": "twitter",
|
|
87
|
+
"author_handle": "@some_handle",
|
|
88
|
+
"profile_url": "https://example.com/profiles/some-handle",
|
|
89
|
+
},
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
result = resolve_normalized_social_signal(resolver, signal)
|
|
93
|
+
print(result.entity_id, result.confidence)
|
|
94
|
+
for event in result.emitted_events:
|
|
95
|
+
print(event.event_type, event.payload)
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
Adapter behavior:
|
|
99
|
+
- Extracts deterministic identifier candidates from normalized payloads.
|
|
100
|
+
- Resolves a primary identifier, then adds remaining identifiers as aliases.
|
|
101
|
+
- Returns only events produced during the adapter call.
|
|
102
|
+
|
|
103
|
+
## M1 Context API
|
|
104
|
+
|
|
105
|
+
Profiler/router workers can read consolidated context using:
|
|
106
|
+
|
|
107
|
+
- `resolver.entity_context(entity_id, recent_limit=10)`
|
|
108
|
+
- `resolver.confidence_summary(entity_id)`
|
|
109
|
+
|
|
110
|
+
Both APIs resolve canonical redirects first, so merged IDs return coherent context.
|
|
@@ -2,6 +2,8 @@ LICENSE
|
|
|
2
2
|
README.md
|
|
3
3
|
pyproject.toml
|
|
4
4
|
metaspn_entities/__init__.py
|
|
5
|
+
metaspn_entities/adapter.py
|
|
6
|
+
metaspn_entities/context.py
|
|
5
7
|
metaspn_entities/events.py
|
|
6
8
|
metaspn_entities/models.py
|
|
7
9
|
metaspn_entities/normalize.py
|
|
@@ -12,5 +14,7 @@ metaspn_entities.egg-info/SOURCES.txt
|
|
|
12
14
|
metaspn_entities.egg-info/dependency_links.txt
|
|
13
15
|
metaspn_entities.egg-info/requires.txt
|
|
14
16
|
metaspn_entities.egg-info/top_level.txt
|
|
17
|
+
tests/test_adapter.py
|
|
18
|
+
tests/test_context.py
|
|
15
19
|
tests/test_event_contract.py
|
|
16
20
|
tests/test_resolver.py
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
import importlib
|
|
2
|
+
import sys
|
|
3
|
+
import tempfile
|
|
4
|
+
import unittest
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from metaspn_entities import SQLiteEntityStore
|
|
8
|
+
from metaspn_entities.adapter import resolve_normalized_social_signal
|
|
9
|
+
from metaspn_entities.resolver import EntityResolver
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class AdapterTests(unittest.TestCase):
    """Exercise ``resolve_normalized_social_signal`` against a SQLite-backed resolver."""

    def setUp(self) -> None:
        # Every test works on its own database file inside a throwaway directory.
        self.tempdir = tempfile.TemporaryDirectory()
        self.db_path = str(Path(self.tempdir.name) / "entities.db")
        self.store = SQLiteEntityStore(self.db_path)
        self.resolver = EntityResolver(self.store)

    def tearDown(self) -> None:
        self.store.close()
        self.tempdir.cleanup()

    def _resolve(self, envelope):
        """Run the adapter on ``envelope`` with this test's resolver."""
        return resolve_normalized_social_signal(self.resolver, envelope)

    def _load_schema_entities(self):
        """Import ``metaspn_schemas.entities``, falling back to a sibling checkout.

        Returns ``None`` when the first import fails and no sibling source
        directory exists.
        """
        try:
            return importlib.import_module("metaspn_schemas.entities")
        except Exception:
            sibling_src = Path(__file__).resolve().parents[2] / "metaspn-schemas" / "src"
            if not sibling_src.exists():
                return None
            sys.path.insert(0, str(sibling_src))
            return importlib.import_module("metaspn_schemas.entities")

    def test_same_author_over_multiple_posts(self) -> None:
        """The same handle, with and without a leading ``@``, maps to one entity."""
        post_one = {
            "source": "social.ingest",
            "payload_type": "SocialPostSeen",
            "payload": {
                "platform": "twitter",
                "post_id": "p1",
                "author_handle": "@same_author",
                "url": "https://x.com/same_author/status/1",
            },
        }
        post_two = {
            "source": "social.ingest",
            "payload_type": "SocialPostSeen",
            "payload": {
                "platform": "twitter",
                "post_id": "p2",
                "author_handle": "same_author",
                "url": "https://x.com/same_author/status/2",
            },
        }

        result_one = self._resolve(post_one)
        result_two = self._resolve(post_two)

        self.assertEqual(result_one.entity_id, result_two.entity_id)

    def test_cross_platform_identifier_normalization(self) -> None:
        """Profile URLs differing in scheme, ``www.`` and trailing slash link two platforms."""
        from_twitter = {
            "source": "social.ingest",
            "payload_type": "SocialPostSeen",
            "payload": {
                "platform": "twitter",
                "post_id": "p100",
                "author_handle": "alice",
                "profile_url": "https://example.com/team/alice/",
            },
        }
        from_linkedin = {
            "source": "social.ingest",
            "payload_type": "ProfileSnapshotSeen",
            "payload": {
                "platform": "linkedin",
                "profile_id": "li-77",
                "handle": "alice-smith",
                "profile_url": "http://www.example.com/team/alice",
            },
        }

        twitter_result = self._resolve(from_twitter)
        linkedin_result = self._resolve(from_linkedin)

        self.assertEqual(twitter_result.entity_id, linkedin_result.entity_id)

    def test_idempotent_rerun_behavior(self) -> None:
        """Replaying an identical signal keeps the entity and emits only ``EntityResolved``."""
        envelope = {
            "source": "social.ingest",
            "payload_type": "SocialPostSeen",
            "payload": {
                "platform": "twitter",
                "post_id": "idempotent-1",
                "author_handle": "rerun_author",
                "profile_url": "https://example.org/rerun_author",
            },
        }

        initial = self._resolve(envelope)
        replay = self._resolve(envelope)

        self.assertEqual(initial.entity_id, replay.entity_id)
        replay_event_types = [event.event_type for event in replay.emitted_events]
        self.assertEqual(replay_event_types, ["EntityResolved"])

    def test_emitted_events_parse_with_metaspn_schemas(self) -> None:
        """Emitted payloads load through the matching ``metaspn_schemas`` classes."""
        entities_module = self._load_schema_entities()
        if entities_module is None:
            self.skipTest("metaspn_schemas is unavailable")

        envelope = {
            "source": "social.ingest",
            "payload_type": "SocialPostSeen",
            "payload": {
                "platform": "twitter",
                "post_id": "schema-1",
                "author_handle": "schema_user",
                "profile_url": "https://example.net/schema_user",
            },
        }

        result = self._resolve(envelope)

        # Event type -> assertion to run on the parsed schema object. The
        # schema class is looked up under the same name as the event type.
        checks = {
            "EntityResolved": lambda parsed: self.assertEqual(parsed.entity_id, result.entity_id),
            "EntityAliasAdded": lambda parsed: self.assertTrue(parsed.alias),
            "EntityMerged": lambda parsed: self.assertTrue(parsed.merged_from),
        }
        for event in result.emitted_events:
            check = checks.get(event.event_type)
            if check is None:
                continue
            schema_cls = getattr(entities_module, event.event_type)
            check(schema_cls.from_dict(event.payload))
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
if __name__ == "__main__":
|
|
136
|
+
unittest.main()
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
import tempfile
|
|
2
|
+
import unittest
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from metaspn_entities import SQLiteEntityStore
|
|
6
|
+
from metaspn_entities.adapter import resolve_normalized_social_signal
|
|
7
|
+
from metaspn_entities.resolver import EntityResolver
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class ContextTests(unittest.TestCase):
    """Exercise ``entity_context`` and ``confidence_summary`` on a SQLite-backed resolver."""

    def setUp(self) -> None:
        # Every test works on its own database file inside a throwaway directory.
        self.tempdir = tempfile.TemporaryDirectory()
        self.db_path = str(Path(self.tempdir.name) / "entities.db")
        self.store = SQLiteEntityStore(self.db_path)
        self.resolver = EntityResolver(self.store)

    def tearDown(self) -> None:
        self.store.close()
        self.tempdir.cleanup()

    def test_cross_platform_handle_normalization_in_context(self) -> None:
        """Signals from two platforms end up as one entity exposing all identifier types."""
        twitter_envelope = {
            "source": "social.ingest",
            "payload": {
                "platform": "twitter",
                "author_handle": "Alice_One",
                "profile_url": "https://example.com/u/alice",
            },
        }
        linkedin_envelope = {
            "source": "social.ingest",
            "payload": {
                "platform": "linkedin",
                "handle": "alice-one",
                "profile_url": "http://www.example.com/u/alice/",
            },
        }

        twitter_result = resolve_normalized_social_signal(self.resolver, twitter_envelope)
        linkedin_result = resolve_normalized_social_signal(self.resolver, linkedin_envelope)
        self.assertEqual(twitter_result.entity_id, linkedin_result.entity_id)

        context = self.resolver.entity_context(twitter_result.entity_id)
        seen_types = set()
        for record in context.identifiers:
            seen_types.add(record["identifier_type"])
        for expected_type in ("twitter_handle", "linkedin_handle", "canonical_url"):
            self.assertIn(expected_type, seen_types)

    def test_merged_entity_context_continuity(self) -> None:
        """After a merge, both original ids yield the same context with both handles."""
        resolver = self.resolver
        entity_a = resolver.resolve("twitter_handle", "merge_ctx_a")
        entity_b = resolver.resolve("twitter_handle", "merge_ctx_b")
        resolver.add_alias(entity_a.entity_id, "email", "a@example.com", caused_by="manual")
        resolver.add_alias(entity_b.entity_id, "domain", "example.com", caused_by="manual")
        resolver.merge_entities(
            entity_a.entity_id,
            entity_b.entity_id,
            reason="dedupe",
            caused_by="manual",
        )

        context_via_a = resolver.entity_context(entity_a.entity_id)
        context_via_b = resolver.entity_context(entity_b.entity_id)

        self.assertEqual(context_via_a.entity_id, context_via_b.entity_id)
        alias_values = {record["normalized_value"] for record in context_via_a.aliases}
        for expected_value in ("merge_ctx_a", "merge_ctx_b"):
            self.assertIn(expected_value, alias_values)

    def test_rerun_stability_for_context_and_confidence(self) -> None:
        """Replaying a signal leaves the context and the confidence summary unchanged."""
        envelope = {
            "source": "social.ingest",
            "payload": {
                "platform": "twitter",
                "author_handle": "stable_ctx",
                "profile_url": "https://example.org/stable_ctx",
            },
        }

        def run_once():
            # Resolve, then read context and summary, in that order.
            result = resolve_normalized_social_signal(self.resolver, envelope)
            context = self.resolver.entity_context(result.entity_id)
            summary = self.resolver.confidence_summary(result.entity_id)
            return result, context, summary

        first, context_1, summary_1 = run_once()
        second, context_2, summary_2 = run_once()

        self.assertEqual(first.entity_id, second.entity_id)
        self.assertEqual(context_1.entity_id, context_2.entity_id)
        self.assertEqual(context_1.aliases, context_2.aliases)
        self.assertEqual(context_1.identifiers, context_2.identifiers)
        self.assertEqual(summary_1, summary_2)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
if __name__ == "__main__":
|
|
90
|
+
unittest.main()
|
|
@@ -1,11 +0,0 @@
|
|
|
1
|
-
from .events import EmittedEvent
|
|
2
|
-
from .models import EntityResolution
|
|
3
|
-
from .resolver import EntityResolver
|
|
4
|
-
from .sqlite_backend import SQLiteEntityStore
|
|
5
|
-
|
|
6
|
-
__all__ = [
|
|
7
|
-
"EntityResolver",
|
|
8
|
-
"EntityResolution",
|
|
9
|
-
"EmittedEvent",
|
|
10
|
-
"SQLiteEntityStore",
|
|
11
|
-
]
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{metaspn_entities-0.1.3 → metaspn_entities-0.1.5}/metaspn_entities.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|