agmem-0.1.2-py3-none-any.whl → agmem-0.1.4-py3-none-any.whl
- {agmem-0.1.2.dist-info → agmem-0.1.4.dist-info}/METADATA +144 -14
- {agmem-0.1.2.dist-info → agmem-0.1.4.dist-info}/RECORD +48 -28
- memvcs/cli.py +10 -0
- memvcs/commands/add.py +6 -0
- memvcs/commands/audit.py +59 -0
- memvcs/commands/clone.py +7 -0
- memvcs/commands/daemon.py +45 -0
- memvcs/commands/distill.py +24 -0
- memvcs/commands/federated.py +59 -0
- memvcs/commands/fsck.py +31 -0
- memvcs/commands/garden.py +22 -0
- memvcs/commands/gc.py +66 -0
- memvcs/commands/merge.py +55 -1
- memvcs/commands/prove.py +66 -0
- memvcs/commands/pull.py +27 -0
- memvcs/commands/resolve.py +130 -0
- memvcs/commands/timeline.py +27 -0
- memvcs/commands/verify.py +74 -23
- memvcs/commands/when.py +27 -0
- memvcs/core/audit.py +124 -0
- memvcs/core/compression_pipeline.py +157 -0
- memvcs/core/consistency.py +9 -9
- memvcs/core/crypto_verify.py +291 -0
- memvcs/core/distiller.py +47 -29
- memvcs/core/encryption.py +169 -0
- memvcs/core/federated.py +147 -0
- memvcs/core/gardener.py +47 -29
- memvcs/core/ipfs_remote.py +200 -0
- memvcs/core/knowledge_graph.py +77 -5
- memvcs/core/llm/__init__.py +10 -0
- memvcs/core/llm/anthropic_provider.py +50 -0
- memvcs/core/llm/base.py +27 -0
- memvcs/core/llm/factory.py +30 -0
- memvcs/core/llm/openai_provider.py +36 -0
- memvcs/core/merge.py +36 -23
- memvcs/core/objects.py +39 -19
- memvcs/core/pack.py +278 -0
- memvcs/core/privacy_budget.py +63 -0
- memvcs/core/remote.py +229 -3
- memvcs/core/repository.py +82 -2
- memvcs/core/temporal_index.py +9 -0
- memvcs/core/trust.py +103 -0
- memvcs/core/vector_store.py +15 -1
- memvcs/core/zk_proofs.py +158 -0
- {agmem-0.1.2.dist-info → agmem-0.1.4.dist-info}/WHEEL +0 -0
- {agmem-0.1.2.dist-info → agmem-0.1.4.dist-info}/entry_points.txt +0 -0
- {agmem-0.1.2.dist-info → agmem-0.1.4.dist-info}/licenses/LICENSE +0 -0
- {agmem-0.1.2.dist-info → agmem-0.1.4.dist-info}/top_level.txt +0 -0
memvcs/commands/verify.py
CHANGED

```diff
@@ -1,7 +1,7 @@
 """
-agmem verify - Belief consistency
+agmem verify - Belief consistency and cryptographic commit verification.
 
-Scans semantic memories for logical contradictions.
+Scans semantic memories for logical contradictions; optionally verifies commit Merkle/signatures.
 """
 
 import argparse
@@ -12,10 +12,10 @@ from ..core.consistency import ConsistencyChecker, ConsistencyResult
 
 
 class VerifyCommand:
-    """Verify belief consistency of
+    """Verify belief consistency and/or cryptographic integrity of commits."""
 
     name = "verify"
-    help = "Scan semantic memories for
+    help = "Scan semantic memories for contradictions; optionally verify commit signatures"
 
     @staticmethod
     def add_arguments(parser: argparse.ArgumentParser):
@@ -23,8 +23,17 @@ class VerifyCommand:
             "--consistency",
             "-c",
             action="store_true",
-
-
+            help="Check semantic memories for contradictions",
+        )
+        parser.add_argument(
+            "--crypto",
+            action="store_true",
+            help="Verify Merkle tree and signatures for commits",
+        )
+        parser.add_argument(
+            "--ref",
+            metavar="REF",
+            help="Commit or ref to verify (with --crypto); default HEAD",
         )
         parser.add_argument(
             "--llm",
@@ -32,28 +41,70 @@ class VerifyCommand:
             help="Use LLM for triple extraction (requires OpenAI)",
         )
 
+    @staticmethod
+    def _run_crypto_verify(repo, ref: str = None) -> int:
+        """Run cryptographic verification. Returns 0 if all OK, 1 on failure."""
+        from ..core.crypto_verify import verify_commit, load_public_key
+
+        if ref:
+            commit_hash = repo.resolve_ref(ref)
+            if not commit_hash:
+                print(f"Ref not found: {ref}")
+                return 1
+        else:
+            head = repo.refs.get_head()
+            if head["type"] == "branch":
+                commit_hash = repo.refs.get_branch_commit(head["value"])
+            else:
+                commit_hash = head.get("value")
+        if not commit_hash:
+            print("No commit to verify (empty repo).")
+            return 0
+        pub = load_public_key(repo.mem_dir)
+        ok, err = verify_commit(
+            repo.object_store, commit_hash, public_key_pem=pub, mem_dir=repo.mem_dir
+        )
+        if ok:
+            print(f"Commit {commit_hash[:8]} verified (Merkle + signature OK).")
+            return 0
+        print(f"Commit {commit_hash[:8]} verification failed: {err}")
+        return 1
+
     @staticmethod
     def execute(args) -> int:
         repo, code = require_repo()
         if code != 0:
             return code
 
-
-
+        run_consistency = args.consistency
+        run_crypto = args.crypto
+        if not run_consistency and not run_crypto:
+            run_consistency = True
 
-
-        if result.valid:
-            print("No contradictions found.")
-            return 0
+        exit_code = 0
 
-
-
-
-
-
-        )
-
-
-        )
-
-
+        if run_crypto:
+            if VerifyCommand._run_crypto_verify(repo, args.ref) != 0:
+                exit_code = 1
+
+        if run_consistency:
+            checker = ConsistencyChecker(repo, llm_provider="openai" if args.llm else None)
+            result = checker.check(use_llm=args.llm)
+
+            print(f"Checked {result.files_checked} semantic file(s)")
+            if result.valid:
+                print("No contradictions found.")
+            else:
+                exit_code = 1
+                print(f"\nFound {len(result.contradictions)} contradiction(s):")
+                for i, c in enumerate(result.contradictions, 1):
+                    print(f"\n[{i}] {c.reason}")
+                    print(
+                        f"  {c.triple1.source}:{c.triple1.line}: {c.triple1.subject} {c.triple1.predicate} {c.triple1.obj}"
+                    )
+                    print(
+                        f"  {c.triple2.source}:{c.triple2.line}: {c.triple2.subject} {c.triple2.predicate} {c.triple2.obj}"
+                    )
+                print("\nUse 'agmem repair --strategy confidence' to attempt auto-fix.")
+
+        return exit_code
```
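To see the reworked dual-mode behavior end to end, here is a minimal driver sketch (not part of the package): it builds the same `argparse.Namespace` that `add_arguments` would produce and calls `VerifyCommand.execute`, which now returns 0 only when every requested check passes. On the CLI this corresponds to `agmem verify --consistency --crypto`.

```python
# Minimal sketch, assuming a repository initialized by agmem in the cwd.
# The Namespace fields mirror add_arguments() above.
import argparse

from memvcs.commands.verify import VerifyCommand

args = argparse.Namespace(consistency=True, crypto=True, ref=None, llm=False)
exit_code = VerifyCommand.execute(args)  # 0 = all checks passed, 1 = any failure
```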
memvcs/commands/when.py
CHANGED

```diff
@@ -34,6 +34,18 @@ class WhenCommand:
             default=10,
             help="Max commits to report (default: 10)",
         )
+        parser.add_argument(
+            "--from",
+            dest="from_ts",
+            metavar="ISO",
+            help="Start of time range (ISO 8601)",
+        )
+        parser.add_argument(
+            "--to",
+            dest="to_ts",
+            metavar="ISO",
+            help="End of time range (ISO 8601)",
+        )
 
     @staticmethod
     def execute(args) -> int:
@@ -48,6 +60,17 @@
 
         fact_lower = args.fact.lower()
         file_filter = args.file.replace("current/", "").lstrip("/") if args.file else None
+        from_ts = getattr(args, "from_ts", None)
+        to_ts = getattr(args, "to_ts", None)
+        commits_in_range = None
+        if from_ts and to_ts:
+            try:
+                from ..core.temporal_index import TemporalIndex
+                ti = TemporalIndex(repo.mem_dir, repo.object_store)
+                range_entries = ti.range_query(from_ts, to_ts)
+                commits_in_range = {ch for _, ch in range_entries}
+            except Exception:
+                pass
 
         # Walk commit history from HEAD
         head = repo.refs.get_head()
@@ -63,6 +86,10 @@
             if commit_hash in seen:
                 break
             seen.add(commit_hash)
+            if commits_in_range is not None and commit_hash not in commits_in_range:
+                commit = Commit.load(repo.object_store, commit_hash)
+                commit_hash = commit.parents[0] if commit and commit.parents else None
+                continue
 
             commit = Commit.load(repo.object_store, commit_hash)
             if not commit:
```
memvcs/core/audit.py
ADDED

```diff
@@ -0,0 +1,124 @@
+"""
+Tamper-evident audit trail for agmem.
+
+Append-only, hash-chained log of significant operations.
+"""
+
+import datetime
+import hashlib
+import hmac
+import json
+import os
+from pathlib import Path
+from typing import Optional, List, Dict, Any, Tuple
+
+
+def _audit_dir(mem_dir: Path) -> Path:
+    return mem_dir / "audit"
+
+
+def _log_path(mem_dir: Path) -> Path:
+    return _audit_dir(mem_dir) / "log"
+
+
+def _get_previous_hash(mem_dir: Path) -> str:
+    """Read last line of audit log and return its entry hash, or empty for first entry."""
+    path = _log_path(mem_dir)
+    if not path.exists():
+        return ""
+    lines = path.read_text().strip().split("\n")
+    if not lines:
+        return ""
+    # Format per line: entry_hash\tpayload_json
+    for line in reversed(lines):
+        line = line.strip()
+        if not line:
+            continue
+        if "\t" in line:
+            return line.split("\t", 1)[0]
+        return ""
+    return ""
+
+
+def _hash_entry(prev_hash: str, payload: str) -> str:
+    """Compute this entry's hash: SHA-256(prev_hash + payload)."""
+    return hashlib.sha256((prev_hash + payload).encode()).hexdigest()
+
+
+def append_audit(
+    mem_dir: Path,
+    operation: str,
+    details: Optional[Dict[str, Any]] = None,
+) -> None:
+    """
+    Append a tamper-evident audit entry. Write synchronously.
+    Each entry: entry_hash TAB payload_json (payload has timestamp, operation, details, prev_hash).
+    """
+    mem_dir = Path(mem_dir)
+    _audit_dir(mem_dir).mkdir(parents=True, exist_ok=True)
+    path = _log_path(mem_dir)
+    prev_hash = _get_previous_hash(mem_dir)
+    payload = {
+        "timestamp": datetime.datetime.utcnow().isoformat() + "Z",
+        "operation": operation,
+        "details": details or {},
+        "prev_hash": prev_hash,
+    }
+    payload_str = json.dumps(payload, sort_keys=True)
+    entry_hash = _hash_entry(prev_hash, payload_str)
+    line = f"{entry_hash}\t{payload_str}\n"
+    with open(path, "a", encoding="utf-8") as f:
+        f.write(line)
+        f.flush()
+        try:
+            os.fsync(f.fileno())
+        except (AttributeError, OSError):
+            pass
+
+
+def read_audit(mem_dir: Path, max_entries: int = 1000) -> List[Dict[str, Any]]:
+    """Read audit log entries (newest first). Each entry has entry_hash, prev_hash, timestamp, operation, details."""
+    path = _log_path(mem_dir)
+    if not path.exists():
+        return []
+    entries = []
+    for line in reversed(path.read_text().strip().split("\n")):
+        line = line.strip()
+        if not line:
+            continue
+        if "\t" not in line:
+            continue
+        entry_hash, payload_str = line.split("\t", 1)
+        try:
+            payload = json.loads(payload_str)
+        except json.JSONDecodeError:
+            continue
+        payload["entry_hash"] = entry_hash
+        entries.append(payload)
+        if len(entries) >= max_entries:
+            break
+    return entries
+
+
+def verify_audit(mem_dir: Path) -> Tuple[bool, Optional[int]]:
+    """
+    Verify the audit log chain. Returns (valid, first_bad_index).
+    first_bad_index is 0-based index of first entry that fails chain verification.
+    """
+    path = _log_path(mem_dir)
+    if not path.exists():
+        return (True, None)
+    lines = path.read_text().strip().split("\n")
+    prev_hash = ""
+    for i, line in enumerate(lines):
+        line = line.strip()
+        if not line:
+            continue
+        if "\t" not in line:
+            return (False, i)
+        entry_hash, payload_str = line.split("\t", 1)
+        expected_hash = _hash_entry(prev_hash, payload_str)
+        if not hmac.compare_digest(entry_hash, expected_hash):
+            return (False, i)
+        prev_hash = entry_hash
+    return (True, None)
```
memvcs/core/compression_pipeline.py
ADDED

````diff
@@ -0,0 +1,157 @@
+"""
+Enhanced semantic compression pipeline for agmem (#11).
+
+Multi-stage: chunk -> fact extraction -> dedup -> embed -> tiered storage.
+Hybrid retrieval (keyword + vector) is in memvcs.retrieval.strategies.HybridStrategy.
+"""
+
+import hashlib
+import re
+from pathlib import Path
+from typing import List, Optional, Tuple, Any
+
+from .constants import MEMORY_TYPES
+
+CHUNK_SIZE_DEFAULT = 512
+CHUNK_OVERLAP = 64
+DEDUP_HASH_ALGO = "sha256"
+TIER_HOT_DAYS = 7
+
+
+def chunk_by_size(text: str, size: int = CHUNK_SIZE_DEFAULT, overlap: int = CHUNK_OVERLAP) -> List[str]:
+    """Split text into chunks by character size with optional overlap."""
+    if not text or size <= 0:
+        return []
+    chunks = []
+    start = 0
+    while start < len(text):
+        end = min(start + size, len(text))
+        chunk = text[start:end].strip()
+        if chunk:
+            chunks.append(chunk)
+        start = end - overlap if end < len(text) else len(text)
+    return chunks
+
+
+def chunk_by_sentences(text: str, max_chunk_chars: int = 512) -> List[str]:
+    """Split text into chunks by sentence boundaries, up to max_chunk_chars per chunk."""
+    if not text:
+        return []
+    sentences = re.split(r'(?<=[.!?])\s+', text)
+    chunks = []
+    current = []
+    current_len = 0
+    for s in sentences:
+        s = s.strip()
+        if not s:
+            continue
+        if current_len + len(s) + 1 <= max_chunk_chars:
+            current.append(s)
+            current_len += len(s) + 1
+        else:
+            if current:
+                chunks.append(" ".join(current))
+            current = [s]
+            current_len = len(s) + 1
+    if current:
+        chunks.append(" ".join(current))
+    return chunks
+
+
+def extract_facts_from_chunk(chunk: str) -> List[str]:
+    """Extract fact-like lines (bullets or short statements). Reuse distiller logic in callers if needed."""
+    facts = []
+    for line in chunk.splitlines():
+        line = line.strip()
+        if not line or line.startswith("#"):
+            continue
+        if line.startswith("- ") and len(line) > 10:
+            facts.append(line)
+        elif len(line) > 20 and len(line) < 300 and not line.startswith("```"):
+            facts.append(line)
+    return facts[:15]
+
+
+def dedup_by_hash(items: List[str]) -> List[Tuple[str, str]]:
+    """Return (item, hash_hex) for unique items by content hash. Order preserved, first occurrence kept."""
+    seen_hashes = set()
+    result = []
+    for item in items:
+        h = hashlib.new(DEDUP_HASH_ALGO, item.encode()).hexdigest()
+        if h not in seen_hashes:
+            seen_hashes.add(h)
+            result.append((item, h))
+    return result
+
+
+def dedup_by_similarity_threshold(
+    items: List[str], vector_store: Any, threshold: float = 0.95
+) -> List[str]:
+    """Filter items by embedding similarity; keep first of clusters above threshold. Requires vector_store."""
+    if not items or vector_store is None:
+        return items
+    try:
+        embeddings = vector_store.embed(items)
+        kept = [items[0]]
+        for i in range(1, len(items)):
+            sims = [vector_store.similarity(embeddings[i], vector_store.embed([kept[j]])[0]) for j in range(len(kept))]
+            if not any(s >= threshold for s in sims):
+                kept.append(items[i])
+        return kept
+    except Exception:
+        return items
+
+
+class CompressionPipeline:
+    """
+    Multi-stage compression: chunk -> optional fact extraction -> dedup -> optional embed -> tiered storage.
+    Wire to vector_store and retrieval for hybrid recall.
+    """
+
+    def __init__(
+        self,
+        chunk_size: int = CHUNK_SIZE_DEFAULT,
+        use_sentences: bool = True,
+        extract_facts: bool = False,
+        dedup_hash: bool = True,
+        vector_store: Optional[Any] = None,
+        tier_by_recency: bool = True,
+    ):
+        self.chunk_size = chunk_size
+        self.use_sentences = use_sentences
+        self.extract_facts = extract_facts
+        self.dedup_hash = dedup_hash
+        self.vector_store = vector_store
+        self.tier_by_recency = tier_by_recency
+
+    def chunk(self, text: str) -> List[str]:
+        """Chunk text by size or sentences."""
+        if self.use_sentences:
+            return chunk_by_sentences(text, max_chunk_chars=self.chunk_size)
+        return chunk_by_size(text, size=self.chunk_size)
+
+    def run(self, text: str, path: Optional[Path] = None) -> List[Tuple[str, str, Optional[str]]]:
+        """
+        Run pipeline: chunk -> optional fact extraction -> dedup.
+        Returns list of (content, content_hash, tier) where tier is "hot" or "cold" or None.
+        """
+        chunks = self.chunk(text)
+        if self.extract_facts:
+            facts = []
+            for c in chunks:
+                facts.extend(extract_facts_from_chunk(c))
+            chunks = facts if facts else chunks
+        if self.dedup_hash:
+            chunk_tuples = dedup_by_hash(chunks)
+        else:
+            chunk_tuples = [(c, hashlib.new(DEDUP_HASH_ALGO, c.encode()).hexdigest()) for c in chunks]
+        tier = None
+        if self.tier_by_recency and path and path.exists():
+            try:
+                mtime = path.stat().st_mtime
+                from datetime import datetime, timezone
+                age_days = (datetime.now(timezone.utc).timestamp() - mtime) / 86400
+                tier = "hot" if age_days <= TIER_HOT_DAYS else "cold"
+            except Exception:
+                pass
+        return [(c, h, tier) for c, h in chunk_tuples]
````
memvcs/core/consistency.py
CHANGED

```diff
@@ -100,23 +100,23 @@ class ConsistencyChecker:
         return triples
 
     def _extract_triples_llm(self, content: str, source: str) -> List[Triple]:
-        """Extract triples using LLM."""
+        """Extract triples using LLM (multi-provider)."""
         try:
-            import
+            from .llm import get_provider
 
-
-
-
+            provider = get_provider(provider_name=self.llm_provider)
+            if not provider:
+                return []
+            text = provider.complete(
+                [
                     {
                         "role": "system",
-                        "content": "Extract factual statements as (subject, predicate, object) triples. "
-                        "One per line, format: SUBJECT | PREDICATE | OBJECT",
+                        "content": "Extract factual statements as (subject, predicate, object) triples. One per line, format: SUBJECT | PREDICATE | OBJECT",
                     },
                     {"role": "user", "content": content[:3000]},
                 ],
                 max_tokens=500,
             )
-            text = response.choices[0].message.content
             triples = []
             for i, line in enumerate(text.splitlines(), 1):
                 if "|" in line:
@@ -138,7 +138,7 @@ class ConsistencyChecker:
 
     def extract_triples(self, content: str, source: str, use_llm: bool = False) -> List[Triple]:
         """Extract triples from content."""
-        if use_llm and self.llm_provider
+        if use_llm and self.llm_provider:
             t = self._extract_triples_llm(content, source)
             if t:
                 return t
```