agmem 0.1.5__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {agmem-0.1.5.dist-info → agmem-0.2.0.dist-info}/METADATA +231 -54
- {agmem-0.1.5.dist-info → agmem-0.2.0.dist-info}/RECORD +18 -13
- memvcs/__init__.py +1 -1
- memvcs/commands/daemon.py +37 -1
- memvcs/commands/distill.py +6 -0
- memvcs/coordinator/__init__.py +5 -0
- memvcs/coordinator/server.py +223 -0
- memvcs/core/delta.py +258 -0
- memvcs/core/distiller.py +74 -50
- memvcs/core/pack.py +191 -33
- memvcs/core/remote.py +82 -2
- memvcs/core/zk_proofs.py +62 -5
- memvcs/health/__init__.py +25 -0
- memvcs/health/monitor.py +452 -0
- {agmem-0.1.5.dist-info → agmem-0.2.0.dist-info}/WHEEL +0 -0
- {agmem-0.1.5.dist-info → agmem-0.2.0.dist-info}/entry_points.txt +0 -0
- {agmem-0.1.5.dist-info → agmem-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {agmem-0.1.5.dist-info → agmem-0.2.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Minimal Federated Coordinator Server for agmem.
|
|
3
|
+
|
|
4
|
+
Implements the coordinator API from docs/FEDERATED.md:
|
|
5
|
+
- POST /push: Accept agent summaries
|
|
6
|
+
- GET /pull: Return merged summaries
|
|
7
|
+
|
|
8
|
+
This is a reference implementation. For production:
|
|
9
|
+
- Add authentication (API keys, OAuth)
|
|
10
|
+
- Use persistent storage (PostgreSQL, Redis)
|
|
11
|
+
- Add rate limiting
|
|
12
|
+
- Enable HTTPS
|
|
13
|
+
- Scale horizontally
|
|
14
|
+
|
|
15
|
+
Install: pip install "agmem[coordinator]"
|
|
16
|
+
Run: uvicorn memvcs.coordinator.server:app --host 0.0.0.0 --port 8000
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from datetime import datetime, timezone
|
|
20
|
+
from typing import Dict, List, Optional, Any
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
import json
|
|
23
|
+
import hashlib
|
|
24
|
+
|
|
25
|
+
try:
|
|
26
|
+
from fastapi import FastAPI, HTTPException, Request
|
|
27
|
+
from fastapi.responses import JSONResponse
|
|
28
|
+
from pydantic import BaseModel, Field
|
|
29
|
+
|
|
30
|
+
FASTAPI_AVAILABLE = True
|
|
31
|
+
except ImportError:
|
|
32
|
+
FASTAPI_AVAILABLE = False
|
|
33
|
+
|
|
34
|
+
# Stub for when FastAPI not installed
|
|
35
|
+
class BaseModel:
|
|
36
|
+
pass
|
|
37
|
+
|
|
38
|
+
def Field(*args, **kwargs):
|
|
39
|
+
return None
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
# Storage: In-memory for simplicity (use Redis/PostgreSQL for production)
|
|
43
|
+
summaries_store: Dict[str, List[Dict[str, Any]]] = {}
|
|
44
|
+
metadata_store: Dict[str, Any] = {
|
|
45
|
+
"coordinator_version": "0.1.6",
|
|
46
|
+
"started_at": datetime.now(timezone.utc).isoformat(),
|
|
47
|
+
"total_pushes": 0,
|
|
48
|
+
"total_agents": 0,
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class AgentSummary(BaseModel):
|
|
53
|
+
"""Agent summary for federated push."""
|
|
54
|
+
|
|
55
|
+
agent_id: str = Field(..., description="Unique agent identifier")
|
|
56
|
+
timestamp: str = Field(..., description="ISO 8601 timestamp")
|
|
57
|
+
topic_counts: Dict[str, int] = Field(default_factory=dict, description="Topic -> count")
|
|
58
|
+
fact_hashes: List[str] = Field(default_factory=list, description="SHA-256 hashes of facts")
|
|
59
|
+
metadata: Optional[Dict[str, Any]] = Field(default=None, description="Optional metadata")
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class PushRequest(BaseModel):
|
|
63
|
+
"""Request body for /push endpoint."""
|
|
64
|
+
|
|
65
|
+
summary: AgentSummary
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class PullResponse(BaseModel):
|
|
69
|
+
"""Response body for /pull endpoint."""
|
|
70
|
+
|
|
71
|
+
merged_topic_counts: Dict[str, int]
|
|
72
|
+
unique_fact_hashes: List[str]
|
|
73
|
+
contributing_agents: int
|
|
74
|
+
last_updated: str
|
|
75
|
+
metadata: Optional[Dict[str, Any]] = None
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
if FASTAPI_AVAILABLE:
|
|
79
|
+
app = FastAPI(
|
|
80
|
+
title="agmem Federated Coordinator",
|
|
81
|
+
description="Minimal coordinator for federated agent memory collaboration",
|
|
82
|
+
version="0.1.6",
|
|
83
|
+
)
|
|
84
|
+
|
|
85
|
+
@app.get("/")
|
|
86
|
+
async def root():
|
|
87
|
+
"""Health check and API info."""
|
|
88
|
+
return {
|
|
89
|
+
"service": "agmem-coordinator",
|
|
90
|
+
"version": metadata_store["coordinator_version"],
|
|
91
|
+
"status": "running",
|
|
92
|
+
"endpoints": {
|
|
93
|
+
"push": "POST /push",
|
|
94
|
+
"pull": "GET /pull",
|
|
95
|
+
"health": "GET /health",
|
|
96
|
+
},
|
|
97
|
+
"started_at": metadata_store["started_at"],
|
|
98
|
+
"total_pushes": metadata_store["total_pushes"],
|
|
99
|
+
"total_agents": metadata_store["total_agents"],
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
@app.get("/health")
|
|
103
|
+
async def health():
|
|
104
|
+
"""Health check endpoint."""
|
|
105
|
+
return {
|
|
106
|
+
"status": "healthy",
|
|
107
|
+
"timestamp": datetime.now(timezone.utc).isoformat(),
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
@app.post("/push", response_model=Dict[str, Any])
|
|
111
|
+
async def push(request: PushRequest):
|
|
112
|
+
"""
|
|
113
|
+
Accept agent summary and store it.
|
|
114
|
+
|
|
115
|
+
Returns:
|
|
116
|
+
Confirmation with push timestamp
|
|
117
|
+
"""
|
|
118
|
+
summary = request.summary
|
|
119
|
+
|
|
120
|
+
# Validate timestamp
|
|
121
|
+
try:
|
|
122
|
+
datetime.fromisoformat(summary.timestamp.replace("Z", "+00:00"))
|
|
123
|
+
except ValueError:
|
|
124
|
+
raise HTTPException(
|
|
125
|
+
status_code=400, detail="Invalid timestamp format (expected ISO 8601)"
|
|
126
|
+
)
|
|
127
|
+
|
|
128
|
+
# Store summary by agent_id
|
|
129
|
+
if summary.agent_id not in summaries_store:
|
|
130
|
+
summaries_store[summary.agent_id] = []
|
|
131
|
+
metadata_store["total_agents"] += 1
|
|
132
|
+
|
|
133
|
+
summaries_store[summary.agent_id].append(summary.dict())
|
|
134
|
+
metadata_store["total_pushes"] += 1
|
|
135
|
+
|
|
136
|
+
return {
|
|
137
|
+
"status": "accepted",
|
|
138
|
+
"agent_id": summary.agent_id,
|
|
139
|
+
"timestamp": datetime.now(timezone.utc).isoformat(),
|
|
140
|
+
"message": f"Summary from {summary.agent_id} stored successfully",
|
|
141
|
+
}
|
|
142
|
+
|
|
143
|
+
@app.get("/pull", response_model=PullResponse)
|
|
144
|
+
async def pull():
|
|
145
|
+
"""
|
|
146
|
+
Return merged summaries from all agents.
|
|
147
|
+
|
|
148
|
+
Returns:
|
|
149
|
+
Aggregated topic counts, unique fact hashes, contributing agent count
|
|
150
|
+
"""
|
|
151
|
+
if not summaries_store:
|
|
152
|
+
return PullResponse(
|
|
153
|
+
merged_topic_counts={},
|
|
154
|
+
unique_fact_hashes=[],
|
|
155
|
+
contributing_agents=0,
|
|
156
|
+
last_updated=datetime.now(timezone.utc).isoformat(),
|
|
157
|
+
)
|
|
158
|
+
|
|
159
|
+
# Merge topic counts across all agents
|
|
160
|
+
merged_topics: Dict[str, int] = {}
|
|
161
|
+
all_fact_hashes = set()
|
|
162
|
+
latest_timestamp = None
|
|
163
|
+
|
|
164
|
+
for agent_id, summaries in summaries_store.items():
|
|
165
|
+
for summary in summaries:
|
|
166
|
+
# Aggregate topic counts
|
|
167
|
+
for topic, count in summary.get("topic_counts", {}).items():
|
|
168
|
+
merged_topics[topic] = merged_topics.get(topic, 0) + count
|
|
169
|
+
|
|
170
|
+
# Collect unique fact hashes
|
|
171
|
+
for fact_hash in summary.get("fact_hashes", []):
|
|
172
|
+
all_fact_hashes.add(fact_hash)
|
|
173
|
+
|
|
174
|
+
# Track latest update
|
|
175
|
+
ts = summary.get("timestamp")
|
|
176
|
+
if ts:
|
|
177
|
+
if latest_timestamp is None or ts > latest_timestamp:
|
|
178
|
+
latest_timestamp = ts
|
|
179
|
+
|
|
180
|
+
return PullResponse(
|
|
181
|
+
merged_topic_counts=merged_topics,
|
|
182
|
+
unique_fact_hashes=sorted(list(all_fact_hashes)),
|
|
183
|
+
contributing_agents=len(summaries_store),
|
|
184
|
+
last_updated=latest_timestamp or datetime.now(timezone.utc).isoformat(),
|
|
185
|
+
metadata={
|
|
186
|
+
"total_facts": len(all_fact_hashes),
|
|
187
|
+
"total_topics": len(merged_topics),
|
|
188
|
+
},
|
|
189
|
+
)
|
|
190
|
+
|
|
191
|
+
@app.delete("/admin/reset")
|
|
192
|
+
async def admin_reset(request: Request):
|
|
193
|
+
"""
|
|
194
|
+
Admin endpoint to reset all stored data.
|
|
195
|
+
In production, protect this with authentication!
|
|
196
|
+
"""
|
|
197
|
+
summaries_store.clear()
|
|
198
|
+
metadata_store["total_pushes"] = 0
|
|
199
|
+
metadata_store["total_agents"] = 0
|
|
200
|
+
metadata_store["started_at"] = datetime.now(timezone.utc).isoformat()
|
|
201
|
+
|
|
202
|
+
return {
|
|
203
|
+
"status": "reset",
|
|
204
|
+
"timestamp": datetime.now(timezone.utc).isoformat(),
|
|
205
|
+
}
|
|
206
|
+
|
|
207
|
+
else:
|
|
208
|
+
# Stub when FastAPI not available
|
|
209
|
+
app = None
|
|
210
|
+
print("FastAPI not available. Install with: pip install 'agmem[coordinator]'")
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
if __name__ == "__main__":
|
|
214
|
+
if not FASTAPI_AVAILABLE:
|
|
215
|
+
print("Error: FastAPI not installed")
|
|
216
|
+
print("Install with: pip install 'fastapi[all]' uvicorn")
|
|
217
|
+
exit(1)
|
|
218
|
+
|
|
219
|
+
import uvicorn
|
|
220
|
+
|
|
221
|
+
print("Starting agmem Federated Coordinator...")
|
|
222
|
+
print("API docs: http://localhost:8000/docs")
|
|
223
|
+
uvicorn.run(app, host="0.0.0.0", port=8000)
|
memvcs/core/delta.py
ADDED
|
@@ -0,0 +1,258 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Delta encoding for pack files.
|
|
3
|
+
|
|
4
|
+
Compress similar objects using delta encoding. For objects with similar content,
|
|
5
|
+
store the first in full and subsequent ones as deltas (differences).
|
|
6
|
+
|
|
7
|
+
This can achieve 5-10x compression improvement for highly similar content
|
|
8
|
+
(common in agent episodic logs, semantic consolidations, etc).
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import hashlib
|
|
12
|
+
from typing import List, Tuple, Dict, Optional
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def levenshtein_distance(s1: bytes, s2: bytes) -> int:
|
|
16
|
+
"""
|
|
17
|
+
Compute Levenshtein distance between two byte sequences.
|
|
18
|
+
Returns edit distance (insertions, deletions, substitutions).
|
|
19
|
+
"""
|
|
20
|
+
if len(s1) < len(s2):
|
|
21
|
+
s1, s2 = s2, s1
|
|
22
|
+
|
|
23
|
+
if len(s2) == 0:
|
|
24
|
+
return len(s1)
|
|
25
|
+
|
|
26
|
+
prev = list(range(len(s2) + 1))
|
|
27
|
+
for i, c1 in enumerate(s1):
|
|
28
|
+
curr = [i + 1]
|
|
29
|
+
for j, c2 in enumerate(s2):
|
|
30
|
+
insertions = prev[j + 1] + 1
|
|
31
|
+
deletions = curr[j] + 1
|
|
32
|
+
substitutions = prev[j] + (c1 != c2)
|
|
33
|
+
curr.append(min(insertions, deletions, substitutions))
|
|
34
|
+
prev = curr
|
|
35
|
+
|
|
36
|
+
return prev[-1]
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def content_similarity(data1: bytes, data2: bytes) -> float:
|
|
40
|
+
"""
|
|
41
|
+
Calculate similarity between two byte sequences (0.0 to 1.0).
|
|
42
|
+
Based on Levenshtein distance normalized by max length.
|
|
43
|
+
"""
|
|
44
|
+
if not data1 or not data2:
|
|
45
|
+
return 0.0
|
|
46
|
+
|
|
47
|
+
distance = levenshtein_distance(data1, data2)
|
|
48
|
+
max_len = max(len(data1), len(data2))
|
|
49
|
+
|
|
50
|
+
if max_len == 0:
|
|
51
|
+
return 1.0
|
|
52
|
+
|
|
53
|
+
return 1.0 - (distance / max_len)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def find_similar_objects(
|
|
57
|
+
objects: Dict[str, bytes],
|
|
58
|
+
similarity_threshold: float = 0.7,
|
|
59
|
+
min_size: int = 100,
|
|
60
|
+
) -> List[List[str]]:
|
|
61
|
+
"""
|
|
62
|
+
Group objects by similarity.
|
|
63
|
+
|
|
64
|
+
Returns list of groups, where each group is a list of object hashes
|
|
65
|
+
sorted by size (smallest first - best compression base).
|
|
66
|
+
Only includes objects >= min_size.
|
|
67
|
+
|
|
68
|
+
Args:
|
|
69
|
+
objects: dict of hash_id -> content
|
|
70
|
+
similarity_threshold: minimum similarity (0.0-1.0) to group
|
|
71
|
+
min_size: minimum object size to consider for delta
|
|
72
|
+
|
|
73
|
+
Returns:
|
|
74
|
+
List of similarity groups, each sorted by size ascending
|
|
75
|
+
"""
|
|
76
|
+
candidates = {h: content for h, content in objects.items() if len(content) >= min_size}
|
|
77
|
+
|
|
78
|
+
if not candidates:
|
|
79
|
+
return []
|
|
80
|
+
|
|
81
|
+
grouped = {}
|
|
82
|
+
used = set()
|
|
83
|
+
|
|
84
|
+
for hash_id, content in candidates.items():
|
|
85
|
+
if hash_id in used:
|
|
86
|
+
continue
|
|
87
|
+
|
|
88
|
+
group = [hash_id]
|
|
89
|
+
used.add(hash_id)
|
|
90
|
+
|
|
91
|
+
for other_id, other_content in candidates.items():
|
|
92
|
+
if other_id in used:
|
|
93
|
+
continue
|
|
94
|
+
|
|
95
|
+
similarity = content_similarity(content, other_content)
|
|
96
|
+
if similarity >= similarity_threshold:
|
|
97
|
+
group.append(other_id)
|
|
98
|
+
used.add(other_id)
|
|
99
|
+
|
|
100
|
+
if len(group) > 1:
|
|
101
|
+
# Sort by size ascending (smallest first = best base)
|
|
102
|
+
group.sort(key=lambda h: len(candidates[h]))
|
|
103
|
+
grouped[group[0]] = group
|
|
104
|
+
|
|
105
|
+
return list(grouped.values())
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def compute_delta(base: bytes, target: bytes) -> bytes:
|
|
109
|
+
"""
|
|
110
|
+
Compute delta from base to target using simple run-length + offset encoding.
|
|
111
|
+
|
|
112
|
+
Format:
|
|
113
|
+
- 0x00: Copy op - next 4 bytes = offset in base, next 4 bytes = length
|
|
114
|
+
- 0x01: Insert op - next 4 bytes = length, then <length> bytes of data
|
|
115
|
+
- 0x02: End marker
|
|
116
|
+
|
|
117
|
+
This is NOT the most efficient delta algorithm but simple and effective
|
|
118
|
+
for similar objects. Production code could use bsdiff, xdelta3, etc.
|
|
119
|
+
"""
|
|
120
|
+
from difflib import SequenceMatcher
|
|
121
|
+
|
|
122
|
+
matcher = SequenceMatcher(None, base, target)
|
|
123
|
+
matching_blocks = matcher.get_matching_blocks()
|
|
124
|
+
|
|
125
|
+
delta = bytearray()
|
|
126
|
+
target_pos = 0
|
|
127
|
+
|
|
128
|
+
for block in matching_blocks:
|
|
129
|
+
base_start, target_start, size = block.a, block.b, block.size
|
|
130
|
+
|
|
131
|
+
# Insert any unmapped target bytes before this block
|
|
132
|
+
if target_start > target_pos:
|
|
133
|
+
insert_len = target_start - target_pos
|
|
134
|
+
insert_data = target[target_pos:target_start]
|
|
135
|
+
delta.append(0x01) # Insert op
|
|
136
|
+
delta.extend(insert_len.to_bytes(4, "big"))
|
|
137
|
+
delta.extend(insert_data)
|
|
138
|
+
|
|
139
|
+
# Copy block from base
|
|
140
|
+
if size > 0:
|
|
141
|
+
delta.append(0x00) # Copy op
|
|
142
|
+
delta.extend(base_start.to_bytes(4, "big"))
|
|
143
|
+
delta.extend(size.to_bytes(4, "big"))
|
|
144
|
+
|
|
145
|
+
target_pos = target_start + size
|
|
146
|
+
|
|
147
|
+
# Insert any remaining target bytes
|
|
148
|
+
if target_pos < len(target):
|
|
149
|
+
insert_len = len(target) - target_pos
|
|
150
|
+
insert_data = target[target_pos:]
|
|
151
|
+
delta.append(0x01) # Insert op
|
|
152
|
+
delta.extend(insert_len.to_bytes(4, "big"))
|
|
153
|
+
delta.extend(insert_data)
|
|
154
|
+
|
|
155
|
+
delta.append(0x02) # End marker
|
|
156
|
+
|
|
157
|
+
return bytes(delta)
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def apply_delta(base: bytes, delta: bytes) -> bytes:
|
|
161
|
+
"""Apply delta to base to reconstruct target."""
|
|
162
|
+
result = bytearray()
|
|
163
|
+
pos = 0
|
|
164
|
+
|
|
165
|
+
while pos < len(delta):
|
|
166
|
+
op = delta[pos]
|
|
167
|
+
pos += 1
|
|
168
|
+
|
|
169
|
+
if op == 0x00: # Copy op
|
|
170
|
+
if pos + 8 > len(delta):
|
|
171
|
+
break
|
|
172
|
+
offset = int.from_bytes(delta[pos : pos + 4], "big")
|
|
173
|
+
length = int.from_bytes(delta[pos + 4 : pos + 8], "big")
|
|
174
|
+
pos += 8
|
|
175
|
+
result.extend(base[offset : offset + length])
|
|
176
|
+
|
|
177
|
+
elif op == 0x01: # Insert op
|
|
178
|
+
if pos + 4 > len(delta):
|
|
179
|
+
break
|
|
180
|
+
length = int.from_bytes(delta[pos : pos + 4], "big")
|
|
181
|
+
pos += 4
|
|
182
|
+
if pos + length > len(delta):
|
|
183
|
+
break
|
|
184
|
+
result.extend(delta[pos : pos + length])
|
|
185
|
+
pos += length
|
|
186
|
+
|
|
187
|
+
elif op == 0x02: # End marker
|
|
188
|
+
break
|
|
189
|
+
|
|
190
|
+
return bytes(result)
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def estimate_delta_compression(base: bytes, target: bytes, delta: bytes) -> Tuple[int, float]:
|
|
194
|
+
"""
|
|
195
|
+
Estimate compression achieved by delta.
|
|
196
|
+
|
|
197
|
+
Returns (original_size, ratio) where ratio = 1.0 is no compression,
|
|
198
|
+
ratio = 0.5 means delta is 50% of original target size.
|
|
199
|
+
"""
|
|
200
|
+
original_size = len(target)
|
|
201
|
+
delta_size = len(delta)
|
|
202
|
+
|
|
203
|
+
if original_size == 0:
|
|
204
|
+
return (0, 0.0)
|
|
205
|
+
|
|
206
|
+
ratio = delta_size / original_size
|
|
207
|
+
return (original_size, ratio)
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
class DeltaCache:
|
|
211
|
+
"""
|
|
212
|
+
Cache deltas between similar objects.
|
|
213
|
+
|
|
214
|
+
Tracks base->target relationships and stores pre-computed deltas
|
|
215
|
+
to avoid recomputation.
|
|
216
|
+
"""
|
|
217
|
+
|
|
218
|
+
def __init__(self):
|
|
219
|
+
self.deltas: Dict[Tuple[str, str], bytes] = {} # (base_hash, target_hash) -> delta
|
|
220
|
+
self.bases: Dict[str, bytes] = {} # target_hash -> base_hash (reconstruction path)
|
|
221
|
+
|
|
222
|
+
def add_delta(self, base_hash: str, target_hash: str, delta: bytes):
|
|
223
|
+
"""Register a delta relationship."""
|
|
224
|
+
self.deltas[(base_hash, target_hash)] = delta
|
|
225
|
+
self.bases[target_hash] = base_hash
|
|
226
|
+
|
|
227
|
+
def get_delta(self, base_hash: str, target_hash: str) -> Optional[bytes]:
|
|
228
|
+
"""Retrieve cached delta."""
|
|
229
|
+
return self.deltas.get((base_hash, target_hash))
|
|
230
|
+
|
|
231
|
+
def get_base(self, target_hash: str) -> Optional[str]:
|
|
232
|
+
"""Get the base hash for a target."""
|
|
233
|
+
return self.bases.get(target_hash)
|
|
234
|
+
|
|
235
|
+
def estimate_total_savings(self, objects: Dict[str, int]) -> Tuple[int, int]:
|
|
236
|
+
"""
|
|
237
|
+
Estimate total size savings from all deltas.
|
|
238
|
+
|
|
239
|
+
Returns (original_total, compressed_total).
|
|
240
|
+
|
|
241
|
+
Args:
|
|
242
|
+
objects: dict of hash_id -> original_size
|
|
243
|
+
"""
|
|
244
|
+
original_total = sum(objects.values())
|
|
245
|
+
compressed_total = 0
|
|
246
|
+
|
|
247
|
+
for (base_hash, target_hash), delta in self.deltas.items():
|
|
248
|
+
# Target stored as delta instead of full copy
|
|
249
|
+
compressed_total += len(delta)
|
|
250
|
+
|
|
251
|
+
# Add all non-delta objects
|
|
252
|
+
all_objects = set(objects.keys())
|
|
253
|
+
delta_targets = set(self.bases.keys())
|
|
254
|
+
non_delta = all_objects - delta_targets
|
|
255
|
+
for obj_hash in non_delta:
|
|
256
|
+
compressed_total += objects.get(obj_hash, 0)
|
|
257
|
+
|
|
258
|
+
return (original_total, compressed_total)
|
memvcs/core/distiller.py
CHANGED
|
@@ -20,6 +20,7 @@ except ImportError:
|
|
|
20
20
|
YAML_AVAILABLE = False
|
|
21
21
|
|
|
22
22
|
from .gardener import Gardener, GardenerConfig, EpisodeCluster
|
|
23
|
+
from .compression_pipeline import CompressionPipeline
|
|
23
24
|
|
|
24
25
|
|
|
25
26
|
@dataclass
|
|
@@ -35,6 +36,7 @@ class DistillerConfig:
|
|
|
35
36
|
llm_provider: Optional[str] = None
|
|
36
37
|
llm_model: Optional[str] = None
|
|
37
38
|
create_safety_branch: bool = True
|
|
39
|
+
use_compression_pipeline: bool = True # Enable compression preprocessing
|
|
38
40
|
use_dp: bool = False
|
|
39
41
|
dp_epsilon: Optional[float] = None
|
|
40
42
|
dp_delta: Optional[float] = None
|
|
@@ -82,6 +84,19 @@ class Distiller:
|
|
|
82
84
|
llm_model=self.config.llm_model,
|
|
83
85
|
),
|
|
84
86
|
)
|
|
87
|
+
# Initialize compression pipeline for pre-processing
|
|
88
|
+
self.compression_pipeline = (
|
|
89
|
+
CompressionPipeline(
|
|
90
|
+
chunk_size=512,
|
|
91
|
+
use_sentences=True,
|
|
92
|
+
extract_facts=True,
|
|
93
|
+
dedup_hash=True,
|
|
94
|
+
vector_store=None, # Can be wired to repo's vector store if available
|
|
95
|
+
tier_by_recency=True,
|
|
96
|
+
)
|
|
97
|
+
if self.config.use_compression_pipeline
|
|
98
|
+
else None
|
|
99
|
+
)
|
|
85
100
|
|
|
86
101
|
def load_episodes_from(self, source_path: Path) -> List[Tuple[Path, str]]:
|
|
87
102
|
"""Load episodes from source directory."""
|
|
@@ -104,7 +119,7 @@ class Distiller:
|
|
|
104
119
|
return self.gardener.cluster_episodes(episodes)
|
|
105
120
|
|
|
106
121
|
def extract_facts(self, cluster: EpisodeCluster) -> List[str]:
|
|
107
|
-
"""Extract factual statements from cluster via LLM or heuristics."""
|
|
122
|
+
"""Extract factual statements from cluster via LLM or heuristics with optional compression."""
|
|
108
123
|
contents = []
|
|
109
124
|
for ep_path in cluster.episodes[:10]:
|
|
110
125
|
try:
|
|
@@ -113,6 +128,15 @@ class Distiller:
|
|
|
113
128
|
continue
|
|
114
129
|
combined = "\n---\n".join(contents)
|
|
115
130
|
|
|
131
|
+
# Apply compression pipeline if enabled (pre-processing before LLM)
|
|
132
|
+
if self.compression_pipeline:
|
|
133
|
+
try:
|
|
134
|
+
compressed_chunks = self.compression_pipeline.run(combined)
|
|
135
|
+
# Extract content from (content, hash, tier) tuples
|
|
136
|
+
combined = "\n".join([chunk[0] for chunk in compressed_chunks[:20]])
|
|
137
|
+
except Exception:
|
|
138
|
+
pass # Fall back to uncompressed content
|
|
139
|
+
|
|
116
140
|
if self.config.llm_provider and self.config.llm_model:
|
|
117
141
|
try:
|
|
118
142
|
from .llm import get_provider
|
|
@@ -136,9 +160,15 @@ class Distiller:
|
|
|
136
160
|
],
|
|
137
161
|
max_tokens=500,
|
|
138
162
|
)
|
|
139
|
-
|
|
163
|
+
facts = [
|
|
140
164
|
line.strip() for line in text.splitlines() if line.strip().startswith("-")
|
|
141
165
|
][:15]
|
|
166
|
+
|
|
167
|
+
# Apply DP to actual facts (not metadata) if enabled
|
|
168
|
+
if self.config.use_dp and self.config.dp_epsilon and self.config.dp_delta:
|
|
169
|
+
facts = self._apply_dp_to_facts(facts)
|
|
170
|
+
|
|
171
|
+
return facts
|
|
142
172
|
except Exception:
|
|
143
173
|
pass
|
|
144
174
|
|
|
@@ -149,7 +179,46 @@ class Distiller:
|
|
|
149
179
|
if len(line) > 20 and not line.startswith("#") and not line.startswith("-"):
|
|
150
180
|
if any(w in line.lower() for w in ["prefers", "likes", "uses", "learned", "user"]):
|
|
151
181
|
facts.append(f"- {line[:200]}")
|
|
152
|
-
|
|
182
|
+
|
|
183
|
+
result = facts[:10] if facts else [f"- Learned about {cluster.topic}"]
|
|
184
|
+
|
|
185
|
+
# Apply DP to fallback facts as well
|
|
186
|
+
if self.config.use_dp and self.config.dp_epsilon and self.config.dp_delta:
|
|
187
|
+
result = self._apply_dp_to_facts(result)
|
|
188
|
+
|
|
189
|
+
return result
|
|
190
|
+
|
|
191
|
+
def _apply_dp_to_facts(self, facts: List[str]) -> List[str]:
|
|
192
|
+
"""
|
|
193
|
+
Apply differential privacy to actual facts (not metadata).
|
|
194
|
+
This ensures removing one episode produces statistically similar output.
|
|
195
|
+
Uses fact sampling with noise to limit individual episode influence.
|
|
196
|
+
"""
|
|
197
|
+
if not facts:
|
|
198
|
+
return facts
|
|
199
|
+
|
|
200
|
+
from .privacy_budget import add_noise
|
|
201
|
+
|
|
202
|
+
# Add noise to fact count (sample with DP)
|
|
203
|
+
noisy_count = add_noise(
|
|
204
|
+
float(len(facts)),
|
|
205
|
+
sensitivity=1.0,
|
|
206
|
+
epsilon=self.config.dp_epsilon,
|
|
207
|
+
delta=self.config.dp_delta,
|
|
208
|
+
)
|
|
209
|
+
noisy_count = max(1, min(len(facts), int(round(noisy_count))))
|
|
210
|
+
|
|
211
|
+
# Sample facts with noise - prevents any single episode from dominating
|
|
212
|
+
import random
|
|
213
|
+
|
|
214
|
+
random.seed(42) # Deterministic but different per cluster due to content
|
|
215
|
+
sampled = random.sample(facts, min(noisy_count, len(facts)))
|
|
216
|
+
|
|
217
|
+
# Optional: Add slight noise to fact embeddings if vector store available
|
|
218
|
+
# This would further obscure individual episode contributions
|
|
219
|
+
# For now, sampling provides basic DP guarantee
|
|
220
|
+
|
|
221
|
+
return sampled
|
|
153
222
|
|
|
154
223
|
def write_consolidated(self, cluster: EpisodeCluster, facts: List[str]) -> Path:
|
|
155
224
|
"""Write consolidated semantic file."""
|
|
@@ -284,53 +353,8 @@ class Distiller:
|
|
|
284
353
|
clusters_processed = len(clusters)
|
|
285
354
|
facts_extracted = facts_count
|
|
286
355
|
episodes_archived = archived
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
and self.config.dp_epsilon is not None
|
|
290
|
-
and self.config.dp_delta is not None
|
|
291
|
-
):
|
|
292
|
-
from .privacy_budget import add_noise
|
|
293
|
-
|
|
294
|
-
sensitivity = 1.0
|
|
295
|
-
clusters_processed = max(
|
|
296
|
-
0,
|
|
297
|
-
int(
|
|
298
|
-
round(
|
|
299
|
-
add_noise(
|
|
300
|
-
float(clusters_processed),
|
|
301
|
-
sensitivity,
|
|
302
|
-
self.config.dp_epsilon,
|
|
303
|
-
self.config.dp_delta,
|
|
304
|
-
)
|
|
305
|
-
)
|
|
306
|
-
),
|
|
307
|
-
)
|
|
308
|
-
facts_extracted = max(
|
|
309
|
-
0,
|
|
310
|
-
int(
|
|
311
|
-
round(
|
|
312
|
-
add_noise(
|
|
313
|
-
float(facts_extracted),
|
|
314
|
-
sensitivity,
|
|
315
|
-
self.config.dp_epsilon,
|
|
316
|
-
self.config.dp_delta,
|
|
317
|
-
)
|
|
318
|
-
)
|
|
319
|
-
),
|
|
320
|
-
)
|
|
321
|
-
episodes_archived = max(
|
|
322
|
-
0,
|
|
323
|
-
int(
|
|
324
|
-
round(
|
|
325
|
-
add_noise(
|
|
326
|
-
float(episodes_archived),
|
|
327
|
-
sensitivity,
|
|
328
|
-
self.config.dp_epsilon,
|
|
329
|
-
self.config.dp_delta,
|
|
330
|
-
)
|
|
331
|
-
)
|
|
332
|
-
),
|
|
333
|
-
)
|
|
356
|
+
# Note: DP is now applied to actual facts during extraction, not metadata.
|
|
357
|
+
# Metadata noise removed as it doesn't provide meaningful privacy guarantees.
|
|
334
358
|
|
|
335
359
|
return DistillerResult(
|
|
336
360
|
success=True,
|