memorytrace 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- engram/__init__.py +8 -0
- engram/__main__.py +6 -0
- engram/cli/__init__.py +1 -0
- engram/cli/app.py +291 -0
- engram/cli/formatters.py +90 -0
- engram/cli/simple.py +267 -0
- engram/config.py +72 -0
- engram/engine.py +612 -0
- engram/exceptions.py +41 -0
- engram/extraction/__init__.py +6 -0
- engram/extraction/base.py +20 -0
- engram/extraction/llm_extractor.py +197 -0
- engram/extraction/ner/__init__.py +7 -0
- engram/extraction/ner/cjk.py +63 -0
- engram/extraction/ner/english.py +109 -0
- engram/extraction/ner/korean.py +106 -0
- engram/extraction/regex_extractor.py +188 -0
- engram/integrations/__init__.py +1 -0
- engram/integrations/mcp_server.py +213 -0
- engram/integrations/sdk.py +194 -0
- engram/models/__init__.py +19 -0
- engram/models/entity.py +72 -0
- engram/models/fact.py +58 -0
- engram/models/quality.py +61 -0
- engram/models/relation.py +26 -0
- engram/models/search.py +96 -0
- engram/models/session.py +53 -0
- engram/models/source.py +73 -0
- engram/quality/__init__.py +8 -0
- engram/quality/confidence.py +38 -0
- engram/quality/conflict.py +79 -0
- engram/quality/decay.py +28 -0
- engram/quality/gate.py +120 -0
- engram/quality/pii.py +80 -0
- engram/search/__init__.py +13 -0
- engram/search/base.py +20 -0
- engram/search/fts5_search.py +210 -0
- engram/search/hybrid.py +99 -0
- engram/search/semantic.py +186 -0
- engram/search/tokenizer.py +85 -0
- engram/session/__init__.py +6 -0
- engram/session/context.py +87 -0
- engram/session/manager.py +152 -0
- engram/session/working_memory.py +57 -0
- engram/storage/__init__.py +6 -0
- engram/storage/base.py +63 -0
- engram/storage/markdown_export.py +144 -0
- engram/storage/migrations.py +30 -0
- engram/storage/sqlite_store.py +615 -0
- memorytrace-0.1.0.dist-info/METADATA +138 -0
- memorytrace-0.1.0.dist-info/RECORD +54 -0
- memorytrace-0.1.0.dist-info/WHEEL +4 -0
- memorytrace-0.1.0.dist-info/entry_points.txt +3 -0
- memorytrace-0.1.0.dist-info/licenses/LICENSE +21 -0
engram/models/fact.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""Fact data models with temporal dimension."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import uuid
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
from datetime import datetime
|
|
8
|
+
from enum import Enum
|
|
9
|
+
from typing import Optional
|
|
10
|
+
|
|
11
|
+
from engram.models.source import Source
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class FactStatus(str, Enum):
    """Lifecycle state of a fact.

    Members subclass ``str``, so each one compares equal to its raw value
    (for example ``FactStatus.VERIFIED == "verified"``).
    """

    UNVERIFIED = "unverified"
    VERIFIED = "verified"
    CONFLICTED = "conflicted"
    EXPIRED = "expired"
    RETRACTED = "retracted"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
class Fact:
    """A single piece of knowledge about an entity, with provenance and temporal bounds.

    ``confidence`` is normalised into ``[0.0, 1.0]`` on construction. A NaN
    confidence is treated as ``0.0`` rather than being clamped to ``1.0``.
    """

    # Random UUID4 string generated per instance.
    id: str = field(default_factory=lambda: str(uuid.uuid4()))
    entity_id: str = ""
    # Subject / predicate / object triple plus the text it was taken from.
    subject: str = ""
    predicate: str = ""
    object: str = ""
    raw_text: str = ""
    source: Source = field(default_factory=Source)
    confidence: float = 0.5
    status: FactStatus = FactStatus.UNVERIFIED
    valid_from: Optional[datetime] = None
    valid_to: Optional[datetime] = None  # None = currently valid
    # Id of the fact that replaced this one, if any.
    superseded_by: Optional[str] = None
    created_at: datetime = field(default_factory=datetime.now)

    def __post_init__(self) -> None:
        """Clamp ``confidence`` into ``[0.0, 1.0]``, mapping NaN to ``0.0``."""
        import math

        # min()/max() compare with "<", which is always False for NaN, so the
        # plain clamp would silently turn NaN into 1.0 (maximum confidence).
        if math.isnan(self.confidence):
            self.confidence = 0.0
            return
        self.confidence = max(0.0, min(1.0, self.confidence))

    @property
    def is_current(self) -> bool:
        """Whether this fact is currently valid (not expired or superseded).

        A fact is current only if it has no ``valid_to``, has not been
        superseded, and its status is neither EXPIRED nor RETRACTED.
        """
        if self.valid_to is not None or self.superseded_by is not None:
            return False
        return self.status not in (FactStatus.EXPIRED, FactStatus.RETRACTED)

    def supersede(self, new_fact_id: str) -> None:
        """Mark this fact as superseded by another.

        Records the replacing fact's id, closes the validity window at the
        current local time, and sets the status to EXPIRED.
        """
        self.superseded_by = new_fact_id
        self.valid_to = datetime.now()
        self.status = FactStatus.EXPIRED
|
engram/models/quality.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""Quality gate data models."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from enum import Enum
|
|
7
|
+
from typing import Optional
|
|
8
|
+
|
|
9
|
+
from engram.models.fact import Fact
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class Action(str, Enum):
    """Decision a validation pass can attach to a fact.

    Members subclass ``str`` and compare equal to their raw values.
    """

    ACCEPT = "accept"
    QUARANTINE = "quarantine"
    FLAG_CONFLICT = "flag_conflict"
    REJECT = "reject"
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass
class ConflictInfo:
    """Describes one conflict between an existing fact and a new fact."""

    conflict_id: str = ""
    # The two facts involved; both default to None.
    existing_fact: Optional[Fact] = None
    new_fact: Optional[Fact] = None
    # Free-form labels; the values listed are the ones named in this file.
    conflict_type: str = ""  # value_change, contradiction
    suggested_resolution: str = ""  # supersede, manual_review, keep_old
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass
class PIIMatch:
    """One occurrence of personally identifiable information found in text."""

    pii_type: str = ""  # credit_card, ssn, api_key, email, phone, rrn
    # Position of the match and the matched text itself.
    start: int = 0
    end: int = 0
    original: str = ""
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@dataclass
class ValidationResult:
    """Outcome of running a fact through the quality gate."""

    action: Action = Action.ACCEPT
    reason: str = ""
    confidence: float = 0.0
    # Each instance gets its own fresh list.
    conflicts: list[ConflictInfo] = field(default_factory=list)
    pii_matches: list[PIIMatch] = field(default_factory=list)
    masked_text: Optional[str] = None  # text after PII masking

    @property
    def is_accepted(self) -> bool:
        """True when the action is ACCEPT."""
        accepted = self.action == Action.ACCEPT
        return accepted

    @property
    def is_quarantined(self) -> bool:
        """True when the action is QUARANTINE."""
        quarantined = self.action == Action.QUARANTINE
        return quarantined

    @property
    def has_conflicts(self) -> bool:
        """True when the action is FLAG_CONFLICT.

        This reflects the action only; the ``conflicts`` list is not inspected.
        """
        flagged = self.action == Action.FLAG_CONFLICT
        return flagged
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""Relation model for directed entity graph edges."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import uuid
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
from datetime import datetime
|
|
8
|
+
from typing import Optional
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class Relation:
    """A directed edge from one entity to another."""

    # Random UUID4 string generated per instance.
    id: str = field(default_factory=lambda: str(uuid.uuid4()))
    # Edge endpoints: the relation points from ``from_entity_id`` to ``to_entity_id``.
    from_entity_id: str = ""
    to_entity_id: str = ""
    relation_type: str = ""  # CEO_OF, INVESTED_IN, WORKS_WITH, MEMBER_OF, etc.
    metadata: dict = field(default_factory=dict)
    valid_from: Optional[datetime] = None
    valid_to: Optional[datetime] = None  # None = currently valid
    created_at: datetime = field(default_factory=datetime.now)

    @property
    def is_current(self) -> bool:
        """True while no ``valid_to`` end time has been set."""
        end = self.valid_to
        return end is None
|
engram/models/search.py
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
"""Search-related data models."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
from datetime import datetime
|
|
8
|
+
from typing import Optional
|
|
9
|
+
|
|
10
|
+
from engram.models.entity import Entity
|
|
11
|
+
from engram.models.fact import Fact
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class SearchOptions:
    """Parameters controlling a memory search."""

    query: str = ""
    max_results: int = 10
    max_tokens: int = 500  # token budget for agent context
    min_confidence: float = 0.0
    # Filter lists default to empty; each instance gets its own list.
    entity_types: list[str] = field(default_factory=list)
    tiers: list[str] = field(default_factory=list)
    # Optional date bounds.
    date_from: Optional[datetime] = None
    date_to: Optional[datetime] = None
    include_archived: bool = False
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass
class SearchHit:
    """One entry in a search response: an entity plus its matching details."""

    entity: Entity  # required; the only field without a default
    facts: list[Fact] = field(default_factory=list)
    relevance_score: float = 0.0
    snippet: str = ""
    token_count: int = 0
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@dataclass
class SearchResult:
    """Full response to a search query, with serialisation helpers."""

    query: str = ""
    hits: list[SearchHit] = field(default_factory=list)
    total_count: int = 0
    search_time_ms: float = 0.0
    total_tokens: int = 0

    def to_agent_context(self, max_tokens: int = 500) -> str:
        """Render the hits as compact text, one line per hit, for LLM context.

        Hits are emitted in order until the next one would exceed
        ``max_tokens``; that hit and all later ones are dropped. Token cost is
        estimated as ``len(line) // 4``.

        Args:
            max_tokens: Estimated token budget for the whole output.

        Returns:
            The newline-joined lines, or a "No results found" message when
            there are no hits at all.
        """
        if not self.hits:
            return f"No results found for '{self.query}'."

        lines: list[str] = []
        budget_spent = 0
        for hit in self.hits:
            entity = hit.entity
            state = entity.state
            fragments = [f"[{entity.name}] ({entity.entity_type})"]
            if state.role:
                fragments.append(f" Role: {state.role}")
            if state.affiliation:
                fragments.append(f" @ {state.affiliation}")
            if hit.snippet:
                fragments.append(f" | {hit.snippet}")
            line = "".join(fragments)

            # Rough token estimate: 1 token ≈ 4 chars
            cost = len(line) // 4
            if budget_spent + cost > max_tokens:
                break
            lines.append(line)
            budget_spent += cost

        return "\n".join(lines)

    def to_dict(self) -> dict:
        """Structured dict for MCP/SDK responses.

        Contains the query, total count, search time and, per hit, the entity
        name and type, relevance score, snippet and a reduced view of each fact.
        """
        serialised_hits = []
        for hit in self.hits:
            fact_rows = []
            for fact in hit.facts:
                fact_rows.append(
                    {
                        "text": fact.raw_text,
                        "confidence": fact.confidence,
                        "status": fact.status.value,
                    }
                )
            serialised_hits.append(
                {
                    "entity_name": hit.entity.name,
                    "entity_type": hit.entity.entity_type,
                    "relevance_score": hit.relevance_score,
                    "snippet": hit.snippet,
                    "facts": fact_rows,
                }
            )

        return {
            "query": self.query,
            "total_count": self.total_count,
            "search_time_ms": self.search_time_ms,
            "hits": serialised_hits,
        }

    def to_json(self, indent: int = 2) -> str:
        """Return ``to_dict()`` as a JSON string, keeping non-ASCII characters as-is."""
        payload = self.to_dict()
        return json.dumps(payload, ensure_ascii=False, indent=indent)
|
engram/models/session.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
"""Session and session event models."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import uuid
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
from datetime import datetime
|
|
8
|
+
from typing import Optional
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class Session:
    """One conversation session with an AI agent."""

    # Random UUID4 string generated per instance.
    session_id: str = field(default_factory=lambda: str(uuid.uuid4()))
    agent_id: str = "default"
    started_at: datetime = field(default_factory=datetime.now)
    ended_at: Optional[datetime] = None
    parent_session_id: Optional[str] = None
    # Per-session id lists; each instance gets its own lists.
    entities_accessed: list[str] = field(default_factory=list)
    entities_modified: list[str] = field(default_factory=list)
    facts_added: list[str] = field(default_factory=list)
    summary: Optional[str] = None
    metadata: dict = field(default_factory=dict)

    @property
    def is_active(self) -> bool:
        """True while the session has no end time."""
        end = self.ended_at
        return end is None

    @property
    def duration_seconds(self) -> Optional[float]:
        """Elapsed seconds between start and end, or None if not yet ended."""
        if self.ended_at is not None:
            elapsed = self.ended_at - self.started_at
            return elapsed.total_seconds()
        return None

    @property
    def duration_minutes(self) -> Optional[float]:
        """Duration in minutes rounded to one decimal, or None if not yet ended."""
        seconds = self.duration_seconds
        return None if seconds is None else round(seconds / 60, 1)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@dataclass
class SessionEvent:
    """A single auditable event recorded within a session."""

    id: Optional[int] = None  # auto-increment in DB
    session_id: str = ""
    event_type: str = ""  # search, read, write, update, delete
    target: str = ""  # entity_id or query string
    detail: dict = field(default_factory=dict)
    # Defaults to the local time at construction.
    timestamp: datetime = field(default_factory=datetime.now)
|
engram/models/source.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
"""Source and provenance models."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from datetime import datetime
|
|
7
|
+
from enum import Enum
|
|
8
|
+
from typing import Optional
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class SourceType(str, Enum):
    """Kind of origin a piece of information came from.

    Members subclass ``str`` and compare equal to their raw values.
    """

    DIRECT_SPEECH = "direct_speech"
    DOCUMENT = "document"
    API = "api"
    WEB = "web"
    AGENT_INFERENCE = "agent_inference"
    USER_INPUT = "user_input"
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# Base trust score for each source type, on a 0.0–1.0 scale.
# Source.trust_score multiplies this by the submitter's own confidence and
# falls back to 0.5 for a type that is missing from the table.
_SOURCE_BASE_TRUST: dict[SourceType, float] = {
    SourceType.DIRECT_SPEECH: 0.95,
    SourceType.DOCUMENT: 0.85,
    SourceType.USER_INPUT: 0.90,
    SourceType.API: 0.80,
    SourceType.WEB: 0.60,
    SourceType.AGENT_INFERENCE: 0.40,
}
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@dataclass
class Source:
    """Provenance metadata for a fact.

    ``confidence`` is normalised into ``[0.0, 1.0]`` on construction. A NaN
    confidence is treated as ``0.0`` rather than being clamped to ``1.0``.
    """

    type: SourceType = SourceType.USER_INPUT
    author: str = ""
    channel: str = ""  # meeting, email, slack, cli, mcp
    timestamp: datetime = field(default_factory=datetime.now)
    confidence: float = 1.0  # submitter's own confidence (0.0–1.0)
    url: Optional[str] = None
    session_id: Optional[str] = None

    def __post_init__(self) -> None:
        """Clamp ``confidence`` into ``[0.0, 1.0]``, mapping NaN to ``0.0``."""
        import math

        # min()/max() compare with "<", which is always False for NaN, so the
        # plain clamp would turn NaN into 1.0 and trust_score would then equal
        # the full base trust of the source type.
        if math.isnan(self.confidence):
            self.confidence = 0.0
            return
        self.confidence = max(0.0, min(1.0, self.confidence))

    @property
    def trust_score(self) -> float:
        """Combined trust: base score for source type × submitter confidence.

        A source type missing from the base-trust table uses 0.5.
        """
        base = _SOURCE_BASE_TRUST.get(self.type, 0.5)
        return base * self.confidence

    def to_dict(self) -> dict:
        """Return a plain dict with the enum as its value and the timestamp as ISO 8601."""
        return {
            "type": self.type.value,
            "author": self.author,
            "channel": self.channel,
            "timestamp": self.timestamp.isoformat(),
            "confidence": self.confidence,
            "url": self.url,
            "session_id": self.session_id,
        }

    @classmethod
    def from_dict(cls, data: dict) -> Source:
        """Build a Source from a dict shaped like the output of ``to_dict``.

        Missing keys fall back to the field defaults; a missing or empty
        ``timestamp`` becomes the current local time.

        Raises:
            ValueError: if ``type`` is not a valid SourceType value or
                ``timestamp`` is not a valid ISO 8601 string.
        """
        source_type = SourceType(data.get("type", "user_input"))
        raw_timestamp = data.get("timestamp")
        if raw_timestamp:
            timestamp = datetime.fromisoformat(raw_timestamp)
        else:
            timestamp = datetime.now()
        return cls(
            type=source_type,
            author=data.get("author", ""),
            channel=data.get("channel", ""),
            timestamp=timestamp,
            confidence=data.get("confidence", 1.0),
            url=data.get("url"),
            session_id=data.get("session_id"),
        )
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
"""Quality gate system for Engram."""
|
|
2
|
+
|
|
3
|
+
from engram.quality.gate import QualityGate
|
|
4
|
+
from engram.quality.confidence import compute_confidence
|
|
5
|
+
from engram.quality.conflict import ConflictDetector
|
|
6
|
+
from engram.quality.pii import PIIDetector
|
|
7
|
+
|
|
8
|
+
__all__ = ["QualityGate", "compute_confidence", "ConflictDetector", "PIIDetector"]
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""Confidence scoring for facts based on source type and extraction method."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from engram.models.source import SourceType
|
|
6
|
+
|
|
7
|
+
# Base trust for each source type, on a 0.0–1.0 scale.
# compute_confidence() falls back to 0.5 for a type missing from this table.
_SOURCE_BASE: dict[SourceType, float] = {
    SourceType.DIRECT_SPEECH: 0.95,
    SourceType.USER_INPUT: 0.90,
    SourceType.DOCUMENT: 0.85,
    SourceType.API: 0.80,
    SourceType.WEB: 0.60,
    SourceType.AGENT_INFERENCE: 0.40,
}

# Multiplier applied per extraction method.
# compute_confidence() uses 0.7 for a method name missing from this table.
_EXTRACTION_MULT: dict[str, float] = {
    "regex": 0.7,
    "llm": 1.0,
    "manual": 1.0,
}
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def compute_confidence(
    source_type: SourceType,
    source_confidence: float,
    extraction_method: str = "manual",
) -> float:
    """Compute final confidence score for a fact.

    confidence = base_trust(source_type) × source_confidence × extraction_multiplier

    Args:
        source_type: Origin of the fact; a type missing from the base table
            uses a base trust of 0.5.
        source_confidence: Submitter's own confidence. Clamped to [0.0, 1.0];
            NaN is treated as 0.0.
        extraction_method: Name of the extraction method; an unknown name
            uses a multiplier of 0.7.

    Returns:
        The product rounded to 4 decimals and clamped to [0.0, 1.0].
    """
    import math

    base = _SOURCE_BASE.get(source_type, 0.5)
    method_mult = _EXTRACTION_MULT.get(extraction_method, 0.7)

    # min()/max() compare with "<", which is always False for NaN, so the
    # plain clamp would silently treat NaN as full confidence (1.0).
    if math.isnan(source_confidence):
        submitted = 0.0
    else:
        submitted = max(0.0, min(1.0, source_confidence))

    raw = base * submitted * method_mult
    return max(0.0, min(1.0, round(raw, 4)))
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"""Conflict detection — finds when new facts contradict existing ones."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import uuid
|
|
6
|
+
|
|
7
|
+
from engram.models.fact import Fact
|
|
8
|
+
from engram.models.quality import ConflictInfo
|
|
9
|
+
from engram.storage.base import StorageBackend
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class ConflictDetector:
    """Detects when a new fact contradicts existing facts for the same entity+predicate."""

    # Minimum trust-score gap before one side of a conflict is preferred
    # without manual review.
    _TRUST_GAP = 0.3

    def __init__(self, storage: StorageBackend):
        """Keep a reference to the storage backend used to look up current facts."""
        self.storage = storage

    def find_conflicts(self, new_fact: Fact) -> list[ConflictInfo]:
        """Check if new_fact conflicts with any current facts.

        A conflict occurs when:
        - Same entity_id AND same predicate
        - Different object value
        - Both are currently valid (valid_to is None)

        The stored copy of ``new_fact`` itself (same id) is never reported.

        Returns:
            One ConflictInfo per conflicting stored fact, each with a fresh
            UUID and a suggested resolution; empty if there are none.
        """
        conflicts: list[ConflictInfo] = []
        for old in self.storage.get_current_facts(new_fact.entity_id):
            if old.id == new_fact.id:
                continue
            if not self._same_predicate(new_fact, old):
                continue
            if self._same_object(new_fact, old):
                continue

            # Same predicate, different value: record the conflict.
            conflicts.append(ConflictInfo(
                conflict_id=str(uuid.uuid4()),
                existing_fact=old,
                new_fact=new_fact,
                conflict_type="value_change",
                suggested_resolution=self._suggest_resolution(old, new_fact),
            ))

        return conflicts

    def is_duplicate(self, new_fact: Fact) -> bool:
        """Check if an identical fact already exists (same entity+predicate+object).

        The stored copy of ``new_fact`` itself (same id) is skipped, matching
        find_conflicts, so a fact is never reported as a duplicate of itself.
        """
        return any(
            old.id != new_fact.id
            and self._same_predicate(new_fact, old)
            and self._same_object(new_fact, old)
            for old in self.storage.get_current_facts(new_fact.entity_id)
        )

    def _same_predicate(self, a: Fact, b: Fact) -> bool:
        """Check if two facts refer to the same predicate (case-insensitive)."""
        return a.predicate.strip().lower() == b.predicate.strip().lower()

    def _same_object(self, a: Fact, b: Fact) -> bool:
        """Check if two facts have the same value (case-insensitive, normalized)."""
        return self._normalize_object(a.object) == self._normalize_object(b.object)

    @staticmethod
    def _normalize_object(value: str) -> str:
        """Lower-case a value, drop English articles and collapse whitespace.

        If removing articles would leave nothing (the value is just "A",
        "An" or "The"), the lower-cased value is kept so that distinct
        article-only values do not all compare equal.
        """
        import re

        lowered = " ".join(value.lower().split())
        without_articles = re.sub(r"\b(the|a|an)\b", "", lowered)
        # Removing a mid-string article leaves a doubled space; collapse it so
        # "ceo of the company" matches "ceo of company".
        collapsed = " ".join(without_articles.split())
        return collapsed or lowered

    def _suggest_resolution(self, old: Fact, new: Fact) -> str:
        """Suggest how to resolve a conflict based on trust scores.

        Returns "supersede" when the new fact's source is more trusted by
        more than the gap, "keep_old" when it is less trusted by more than
        the gap, and "manual_review" otherwise.
        """
        trust_diff = new.source.trust_score - old.source.trust_score
        if trust_diff > self._TRUST_GAP:
            return "supersede"
        if trust_diff < -self._TRUST_GAP:
            return "keep_old"
        return "manual_review"
|
engram/quality/decay.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""Time-based decay — identifies stale facts that need review."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from datetime import datetime, timedelta
|
|
6
|
+
from typing import Optional
|
|
7
|
+
|
|
8
|
+
from engram.models.fact import Fact
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def find_stale_facts(
    facts: list[Fact],
    days: int = 90,
    reference_time: Optional[datetime] = None,
) -> list[Fact]:
    """Return the current facts that were created more than ``days`` days ago.

    Args:
        facts: Facts to check.
        days: Age in days beyond which a fact counts as stale.
        reference_time: Time to measure age from (default: now).

    Returns:
        Facts, in input order, that are current and whose ``created_at`` is
        strictly earlier than ``reference_time - days``.
    """
    anchor = datetime.now() if reference_time is None else reference_time
    threshold = anchor - timedelta(days=days)

    stale: list[Fact] = []
    for fact in facts:
        if fact.is_current and fact.created_at < threshold:
            stale.append(fact)
    return stale
|
engram/quality/gate.py
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
"""Quality gate — pipeline orchestrator for write validation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from engram.config import EngramConfig
|
|
6
|
+
from engram.models.fact import Fact, FactStatus
|
|
7
|
+
from engram.models.quality import Action, ConflictInfo, ValidationResult
|
|
8
|
+
from engram.quality.confidence import compute_confidence
|
|
9
|
+
from engram.quality.conflict import ConflictDetector
|
|
10
|
+
from engram.quality.pii import PIIDetector
|
|
11
|
+
from engram.storage.base import StorageBackend
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class QualityGate:
    """Pipeline: confidence → PII masking → conflict detection → accept/reject/quarantine.

    Every write operation passes through this gate before reaching storage.
    """

    def __init__(self, config: EngramConfig, storage: StorageBackend):
        """Build the detectors and copy the thresholds/flags from ``config``."""
        self.config = config
        self.conflict_detector = ConflictDetector(storage)
        self.pii_detector = PIIDetector()
        self.min_confidence = config.min_confidence
        # Stored for callers; not read anywhere else in this class.
        self.auto_resolve_threshold = config.auto_resolve_threshold
        self.enable_pii = config.enable_pii_detection

    def validate(
        self,
        fact: Fact,
        extraction_method: str = "manual",
    ) -> ValidationResult:
        """Run the full quality gate pipeline on a fact.

        The fact is mutated in place: ``confidence`` is overwritten with the
        computed score, PII in ``raw_text``/``object``/``subject`` is masked
        when PII detection is enabled, and ``status`` is set on the
        QUARANTINE and FLAG_CONFLICT paths.

        Args:
            fact: The fact to validate.
            extraction_method: Passed through to the confidence computation.

        Returns ValidationResult with action:
        - ACCEPT: fact is good, store it
        - QUARANTINE: low confidence, store as unverified
        - FLAG_CONFLICT: conflicts with existing data
        - REJECT: should not be stored (here: an identical fact already exists)

        NOTE(review): when every conflict is auto-resolvable the result is
        ACCEPT and the conflict list is not included in it — confirm the
        caller has another way to supersede the old facts.
        """
        # Step 1: Compute confidence
        computed_conf = compute_confidence(
            source_type=fact.source.type,
            source_confidence=fact.source.confidence,
            extraction_method=extraction_method,
        )
        fact.confidence = computed_conf

        # Step 2: PII detection and masking
        pii_matches = []
        masked_text = None
        if self.enable_pii:
            if fact.raw_text:
                pii_matches = self.pii_detector.scan(fact.raw_text)
                if pii_matches:
                    masked_text = self.pii_detector.mask(fact.raw_text, pii_matches)
                    fact.raw_text = masked_text
            # Subject/object are checked on their own so PII in them is
            # masked even when raw_text is empty or contains no PII.
            if fact.object and self.pii_detector.has_pii(fact.object):
                fact.object = self.pii_detector.mask(fact.object)
            if fact.subject and self.pii_detector.has_pii(fact.subject):
                fact.subject = self.pii_detector.mask(fact.subject)

        # Step 3: Confidence threshold check
        if computed_conf < self.min_confidence:
            fact.status = FactStatus.UNVERIFIED
            return ValidationResult(
                action=Action.QUARANTINE,
                reason=f"Confidence {computed_conf:.2f} below threshold {self.min_confidence:.2f}",
                confidence=computed_conf,
                pii_matches=pii_matches,
                masked_text=masked_text,
            )

        # Step 3.5: Duplicate detection
        if fact.entity_id and self.conflict_detector.is_duplicate(fact):
            return ValidationResult(
                action=Action.REJECT,
                reason="Duplicate fact already exists",
                confidence=computed_conf,
                pii_matches=pii_matches,
                masked_text=masked_text,
            )

        # Step 4: Conflict detection
        if fact.entity_id:
            conflicts = self.conflict_detector.find_conflicts(fact)
            # Only flag when at least one conflict cannot be auto-resolved.
            if conflicts and not self._try_auto_resolve(conflicts):
                fact.status = FactStatus.CONFLICTED
                return ValidationResult(
                    action=Action.FLAG_CONFLICT,
                    reason=f"Conflicts with {len(conflicts)} existing fact(s)",
                    confidence=computed_conf,
                    conflicts=conflicts,
                    pii_matches=pii_matches,
                    masked_text=masked_text,
                )

        # Step 5: Accept
        return ValidationResult(
            action=Action.ACCEPT,
            reason="Passed all quality checks",
            confidence=computed_conf,
            pii_matches=pii_matches,
            masked_text=masked_text,
        )

    def _try_auto_resolve(self, conflicts: list[ConflictInfo]) -> bool:
        """Report whether every conflict can be auto-resolved.

        A conflict counts as auto-resolvable only when its suggested
        resolution is "supersede". Nothing is modified here; an empty list
        yields True.
        """
        return all(
            conflict.suggested_resolution == "supersede" for conflict in conflicts
        )
|