agmem 0.1.6__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,248 @@
+ """
+ Delta compression metrics and observability.
+
+ Tracks compression effectiveness across object types to enable future
+ optimization and auto-tuning of delta encoding parameters.
+
+ Provides:
+ - DeltaCompressionMetrics: tracks compression ratios, per-type stats, and bytes saved
+ - DeltaCompressionMetrics.get_heatmap(): text heatmap of which types compress best
+ - Statistics reporting for gc --repack operations
+ """
+
+ from dataclasses import dataclass, field
+ from typing import Dict, List, Any, Optional, Tuple
+ from collections import defaultdict
+
+
+ @dataclass
+ class ObjectCompressionStats:
+     """Statistics for a single object's compression."""
+
+     object_id: str
+     object_type: str  # "semantic", "episodic", "procedural"
+     original_size: int  # bytes
+     compressed_size: int  # bytes after delta encoding
+     compression_ratio: float  # compressed_size / original_size (0.0 = 100% compression)
+     delta_used: bool  # Whether delta encoding was applied
+     compression_benefit: float  # original_size - compressed_size
+
+
+ @dataclass
+ class TypeCompressionStats:
+     """Aggregated statistics for an object type."""
+
+     object_type: str
+     count: int = 0
+     total_original_size: int = 0
+     total_compressed_size: int = 0
+     avg_compression_ratio: float = 0.0
+     total_benefit: int = 0  # Total bytes saved
+     objects_with_delta: int = 0  # How many used delta encoding
+     min_ratio: float = 1.0
+     max_ratio: float = 0.0
+
+     def update_from_object(self, obj_stats: ObjectCompressionStats) -> None:
+         """Update type stats with a single object's stats."""
+         self.count += 1
+         self.total_original_size += obj_stats.original_size
+         self.total_compressed_size += obj_stats.compressed_size
+         self.total_benefit += int(obj_stats.compression_benefit)
+         if obj_stats.delta_used:
+             self.objects_with_delta += 1
+         self.min_ratio = min(self.min_ratio, obj_stats.compression_ratio)
+         self.max_ratio = max(self.max_ratio, obj_stats.compression_ratio)
+
+         # Recalculate average
+         if self.total_original_size > 0:
+             self.avg_compression_ratio = self.total_compressed_size / self.total_original_size
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dict for reporting."""
+         savings_pct = 0.0
+         if self.total_original_size > 0:
+             savings_pct = (self.total_benefit / self.total_original_size) * 100
+
+         return {
+             "object_type": self.object_type,
+             "count": self.count,
+             "total_original_bytes": self.total_original_size,
+             "total_compressed_bytes": self.total_compressed_size,
+             "avg_compression_ratio": round(self.avg_compression_ratio, 3),
+             "compression_range": f"{self.min_ratio:.1%} - {self.max_ratio:.1%}",
+             "total_bytes_saved": self.total_benefit,
+             "savings_percentage": round(savings_pct, 1),
+             "objects_using_delta": self.objects_with_delta,
+             "delta_adoption_rate": (
+                 round((self.objects_with_delta / self.count * 100), 1) if self.count > 0 else 0
+             ),
+         }
+
+
+ class DeltaCompressionMetrics:
+     """Tracks delta compression statistics across all objects.
+
+     Usage:
+         metrics = DeltaCompressionMetrics()
+         # ... during packing ...
+         metrics.record_object(ObjectCompressionStats(...))
+         # ... after packing ...
+         report = metrics.get_report()
+     """
+
+     def __init__(self):
+         self.objects: List[ObjectCompressionStats] = []
+         self.type_stats: Dict[str, TypeCompressionStats] = {}
+         self.total_original_size: int = 0
+         self.total_compressed_size: int = 0
+
+     def record_object(self, obj_stats: ObjectCompressionStats) -> None:
+         """Record compression stats for a single object."""
+         self.objects.append(obj_stats)
+         self.total_original_size += obj_stats.original_size
+         self.total_compressed_size += obj_stats.compressed_size
+
+         # Update type-specific stats
+         if obj_stats.object_type not in self.type_stats:
+             self.type_stats[obj_stats.object_type] = TypeCompressionStats(
+                 object_type=obj_stats.object_type
+             )
+         self.type_stats[obj_stats.object_type].update_from_object(obj_stats)
+
+     def get_type_stats(self, object_type: str) -> Optional[TypeCompressionStats]:
+         """Get stats for a specific object type."""
+         return self.type_stats.get(object_type)
+
+     def get_overall_ratio(self) -> float:
+         """Get overall compression ratio across all objects."""
+         if self.total_original_size == 0:
+             return 0.0
+         return self.total_compressed_size / self.total_original_size
+
+     def get_overall_savings(self) -> int:
+         """Get total bytes saved across all objects."""
+         return self.total_original_size - self.total_compressed_size
+
+     def get_report(self) -> Dict[str, Any]:
+         """Generate a comprehensive compression report."""
+         overall_ratio = self.get_overall_ratio()
+         overall_savings = self.get_overall_savings()
+         savings_pct = (
+             (overall_savings / self.total_original_size * 100)
+             if self.total_original_size > 0
+             else 0
+         )
+
+         return {
+             "timestamp": None,  # Set by caller if needed
+             "total_objects": len(self.objects),
+             "total_original_bytes": self.total_original_size,
+             "total_compressed_bytes": self.total_compressed_size,
+             "overall_compression_ratio": round(overall_ratio, 3),
+             "total_bytes_saved": overall_savings,
+             "compression_percentage": round(savings_pct, 1),
+             "type_statistics": {otype: stats.to_dict() for otype, stats in self.type_stats.items()},
+             "recommendations": self._generate_recommendations(),
+         }
+
+     def _generate_recommendations(self) -> List[str]:
+         """Generate optimization recommendations based on compression stats."""
+         recommendations = []
+
+         # Check if delta encoding is worth it
+         objects_with_delta = sum(s.objects_with_delta for s in self.type_stats.values())
+         if objects_with_delta == 0:
+             recommendations.append("No objects used delta encoding. Check similarity thresholds.")
+
+         # Check for types with poor compression
+         for otype, stats in self.type_stats.items():
+             if stats.count > 0 and stats.avg_compression_ratio > 0.9:
+                 recommendations.append(
+                     f"Type '{otype}' compresses poorly (ratio: {stats.avg_compression_ratio:.1%}). "
+                     f"Consider increasing similarity threshold or reducing delta cost."
+                 )
+
+         # Check for types with excellent compression
+         for otype, stats in self.type_stats.items():
+             if stats.count > 0 and stats.avg_compression_ratio < 0.5:
+                 recommendations.append(
+                     f"Type '{otype}' compresses very well (ratio: {stats.avg_compression_ratio:.1%}). "
+                     f"Consider aggressive delta encoding or reduced threshold."
+                 )
+
+         if not recommendations:
+             recommendations.append("Compression is operating normally.")
+
+         return recommendations
+
+     def get_heatmap(self) -> str:
+         """Generate a text-based compression heatmap."""
+         lines = ["Delta Compression Heatmap", "=" * 50]
+
+         if not self.type_stats:
+             lines.append("No compression data available")
+             return "\n".join(lines)
+
+         # Sort by compression ratio
+         sorted_types = sorted(
+             self.type_stats.values(),
+             key=lambda s: s.avg_compression_ratio,
+         )
+
+         for stats in sorted_types:
+             if stats.count == 0:
+                 continue
+             ratio = stats.avg_compression_ratio
+             # Create a simple bar chart
+             bar_width = 30
+             filled = int(bar_width * ratio)
+             bar = "█" * filled + "░" * (bar_width - filled)
+             saved_pct = (
+                 (stats.total_benefit / stats.total_original_size * 100)
+                 if stats.total_original_size > 0
+                 else 0
+             )
+             lines.append(
+                 f"{stats.object_type:12} {bar} {saved_pct:5.1f}% saved ({stats.objects_with_delta}/{stats.count} using delta)"
+             )
+
+         return "\n".join(lines)
+
+     def log_report(self, logger: Any = None) -> None:
+         """Log the compression report."""
+         report = self.get_report()
+         heatmap = self.get_heatmap()
+
+         output = [
+             "=" * 70,
+             "Delta Compression Report",
+             "=" * 70,
+             f"Total Objects: {report['total_objects']}",
+             f"Total Original: {report['total_original_bytes']:,} bytes",
+             f"Total Compressed: {report['total_compressed_bytes']:,} bytes",
+             f"Overall Ratio: {report['overall_compression_ratio']:.1%}",
+             f"Bytes Saved: {report['total_bytes_saved']:,} ({report['compression_percentage']:.1f}%)",
+             "",
+             heatmap,
+             "",
+             "Type Breakdown:",
+         ]
+
+         for otype, stats in sorted(report["type_statistics"].items()):
+             output.append(f"  {otype}:")
+             output.append(f"    Count: {stats['count']}")
+             output.append(f"    Compression: {stats['avg_compression_ratio']:.1%}")
+             output.append(f"    Saved: {stats['total_bytes_saved']:,} bytes")
+             output.append(f"    Delta adoption: {stats['delta_adoption_rate']:.0f}%")
+
+         output.extend(["", "Recommendations:"])
+         for rec in report["recommendations"]:
+             output.append(f"  - {rec}")
+
+         output.append("=" * 70)
+
+         full_output = "\n".join(output)
+         if logger:
+             logger.info(full_output)
+         else:
+             print(full_output)
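The sketch below shows how the metrics module above is meant to be driven during a repack. The diff does not show this file's path inside the package, so rather than guessing an import, the sketch assumes the definitions above are already in scope; the object IDs and byte sizes are invented for illustration.

# Usage sketch, assuming DeltaCompressionMetrics and ObjectCompressionStats
# (defined above) are in scope. IDs and sizes below are made up.
metrics = DeltaCompressionMetrics()

# One object stored as a delta at 1/8 of its original size...
metrics.record_object(ObjectCompressionStats(
    object_id="obj-aaa",
    object_type="episodic",
    original_size=4096,
    compressed_size=512,
    compression_ratio=512 / 4096,     # 0.125: delta is 12.5% of the original
    delta_used=True,
    compression_benefit=4096 - 512,   # 3584 bytes saved
))
# ...and one stored in full (no similar base found).
metrics.record_object(ObjectCompressionStats(
    object_id="obj-bbb",
    object_type="semantic",
    original_size=2048,
    compressed_size=2048,
    compression_ratio=1.0,
    delta_used=False,
    compression_benefit=0,
))

report = metrics.get_report()   # totals, per-type stats, recommendations
print(metrics.get_heatmap())    # one bar per type; shorter filled bar = better compression
metrics.log_report()            # falls back to print() when no logger is passed

With these two records, get_report() would show 6,144 original bytes against 2,560 stored bytes, an overall ratio of about 0.417 with 3,584 bytes saved.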
memvcs/core/delta.py ADDED
@@ -0,0 +1,258 @@
+ """
+ Delta encoding for pack files.
+
+ Compress similar objects using delta encoding. For objects with similar content,
+ store the first in full and subsequent ones as deltas (differences).
+
+ This can achieve a 5-10x compression improvement for highly similar content
+ (common in agent episodic logs, semantic consolidations, etc.).
+ """
+
+ import hashlib
+ from typing import List, Tuple, Dict, Optional
+
+
+ def levenshtein_distance(s1: bytes, s2: bytes) -> int:
+     """
+     Compute Levenshtein distance between two byte sequences.
+     Returns edit distance (insertions, deletions, substitutions); O(len(s1) * len(s2)) time.
+     """
+     if len(s1) < len(s2):
+         s1, s2 = s2, s1
+
+     if len(s2) == 0:
+         return len(s1)
+
+     prev = list(range(len(s2) + 1))
+     for i, c1 in enumerate(s1):
+         curr = [i + 1]
+         for j, c2 in enumerate(s2):
+             insertions = prev[j + 1] + 1
+             deletions = curr[j] + 1
+             substitutions = prev[j] + (c1 != c2)
+             curr.append(min(insertions, deletions, substitutions))
+         prev = curr
+
+     return prev[-1]
+
+
+ def content_similarity(data1: bytes, data2: bytes) -> float:
+     """
+     Calculate similarity between two byte sequences (0.0 to 1.0).
+     Based on Levenshtein distance normalized by max length.
+     """
+     if not data1 or not data2:
+         return 0.0
+
+     distance = levenshtein_distance(data1, data2)
+     max_len = max(len(data1), len(data2))
+
+     if max_len == 0:
+         return 1.0
+
+     return 1.0 - (distance / max_len)
+
+
+ def find_similar_objects(
+     objects: Dict[str, bytes],
+     similarity_threshold: float = 0.7,
+     min_size: int = 100,
+ ) -> List[List[str]]:
+     """
+     Group objects by similarity (greedy O(n^2) pairwise comparison).
+
+     Returns list of groups, where each group is a list of object hashes
+     sorted by size (smallest first - best compression base).
+     Only includes objects >= min_size.
+
+     Args:
+         objects: dict of hash_id -> content
+         similarity_threshold: minimum similarity (0.0-1.0) to group
+         min_size: minimum object size to consider for delta
+
+     Returns:
+         List of similarity groups, each sorted by size ascending
+     """
+     candidates = {h: content for h, content in objects.items() if len(content) >= min_size}
+
+     if not candidates:
+         return []
+
+     grouped = {}
+     used = set()
+
+     for hash_id, content in candidates.items():
+         if hash_id in used:
+             continue
+
+         group = [hash_id]
+         used.add(hash_id)
+
+         for other_id, other_content in candidates.items():
+             if other_id in used:
+                 continue
+
+             similarity = content_similarity(content, other_content)
+             if similarity >= similarity_threshold:
+                 group.append(other_id)
+                 used.add(other_id)
+
+         if len(group) > 1:
+             # Sort by size ascending (smallest first = best base)
+             group.sort(key=lambda h: len(candidates[h]))
+             grouped[group[0]] = group
+
+     return list(grouped.values())
+
+
+ def compute_delta(base: bytes, target: bytes) -> bytes:
+     """
+     Compute delta from base to target using a simple copy/insert opcode encoding.
+
+     Format:
+     - 0x00: Copy op - next 4 bytes = offset in base, next 4 bytes = length
+     - 0x01: Insert op - next 4 bytes = length, then <length> bytes of data
+     - 0x02: End marker
+
+     This is NOT the most efficient delta algorithm but simple and effective
+     for similar objects. Production code could use bsdiff, xdelta3, etc.
+     """
+     from difflib import SequenceMatcher
+
+     matcher = SequenceMatcher(None, base, target)
+     matching_blocks = matcher.get_matching_blocks()
+
+     delta = bytearray()
+     target_pos = 0
+
+     for block in matching_blocks:
+         base_start, target_start, size = block.a, block.b, block.size
+
+         # Insert any unmapped target bytes before this block
+         if target_start > target_pos:
+             insert_len = target_start - target_pos
+             insert_data = target[target_pos:target_start]
+             delta.append(0x01)  # Insert op
+             delta.extend(insert_len.to_bytes(4, "big"))
+             delta.extend(insert_data)
+
+         # Copy block from base
+         if size > 0:
+             delta.append(0x00)  # Copy op
+             delta.extend(base_start.to_bytes(4, "big"))
+             delta.extend(size.to_bytes(4, "big"))
+
+         target_pos = target_start + size
+
+     # Insert any remaining target bytes
+     if target_pos < len(target):
+         insert_len = len(target) - target_pos
+         insert_data = target[target_pos:]
+         delta.append(0x01)  # Insert op
+         delta.extend(insert_len.to_bytes(4, "big"))
+         delta.extend(insert_data)
+
+     delta.append(0x02)  # End marker
+
+     return bytes(delta)
+
+
+ def apply_delta(base: bytes, delta: bytes) -> bytes:
+     """Apply delta to base to reconstruct target (a truncated delta ends reconstruction early)."""
+     result = bytearray()
+     pos = 0
+
+     while pos < len(delta):
+         op = delta[pos]
+         pos += 1
+
+         if op == 0x00:  # Copy op
+             if pos + 8 > len(delta):
+                 break
+             offset = int.from_bytes(delta[pos : pos + 4], "big")
+             length = int.from_bytes(delta[pos + 4 : pos + 8], "big")
+             pos += 8
+             result.extend(base[offset : offset + length])
+
+         elif op == 0x01:  # Insert op
+             if pos + 4 > len(delta):
+                 break
+             length = int.from_bytes(delta[pos : pos + 4], "big")
+             pos += 4
+             if pos + length > len(delta):
+                 break
+             result.extend(delta[pos : pos + length])
+             pos += length
+
+         elif op == 0x02:  # End marker
+             break
+
+     return bytes(result)
+
+
+ def estimate_delta_compression(base: bytes, target: bytes, delta: bytes) -> Tuple[int, float]:
+     """
+     Estimate compression achieved by delta.
+
+     Returns (original_size, ratio) where ratio = 1.0 is no compression,
+     ratio = 0.5 means delta is 50% of original target size.
+     """
+     original_size = len(target)
+     delta_size = len(delta)
+
+     if original_size == 0:
+         return (0, 0.0)
+
+     ratio = delta_size / original_size
+     return (original_size, ratio)
+
+
+ class DeltaCache:
+     """
+     Cache deltas between similar objects.
+
+     Tracks base->target relationships and stores pre-computed deltas
+     to avoid recomputation.
+     """
+
+     def __init__(self):
+         self.deltas: Dict[Tuple[str, str], bytes] = {}  # (base_hash, target_hash) -> delta
+         self.bases: Dict[str, str] = {}  # target_hash -> base_hash (reconstruction path)
+
+     def add_delta(self, base_hash: str, target_hash: str, delta: bytes):
+         """Register a delta relationship."""
+         self.deltas[(base_hash, target_hash)] = delta
+         self.bases[target_hash] = base_hash
+
+     def get_delta(self, base_hash: str, target_hash: str) -> Optional[bytes]:
+         """Retrieve cached delta."""
+         return self.deltas.get((base_hash, target_hash))
+
+     def get_base(self, target_hash: str) -> Optional[str]:
+         """Get the base hash for a target."""
+         return self.bases.get(target_hash)
+
+     def estimate_total_savings(self, objects: Dict[str, int]) -> Tuple[int, int]:
+         """
+         Estimate total size savings from all deltas.
+
+         Returns (original_total, compressed_total).
+
+         Args:
+             objects: dict of hash_id -> original_size
+         """
+         original_total = sum(objects.values())
+         compressed_total = 0
+
+         for delta in self.deltas.values():
+             # Target stored as delta instead of full copy
+             compressed_total += len(delta)
+
+         # Add all non-delta objects
+         all_objects = set(objects.keys())
+         delta_targets = set(self.bases.keys())
+         non_delta = all_objects - delta_targets
+         for obj_hash in non_delta:
+             compressed_total += objects.get(obj_hash, 0)
+
+         return (original_total, compressed_total)
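To close, a round-trip sketch against memvcs/core/delta.py as added above. The import path follows the file's location in this diff; the sample byte strings and hash IDs are invented for illustration.

from memvcs.core.delta import (
    DeltaCache,
    apply_delta,
    compute_delta,
    content_similarity,
    find_similar_objects,
)

base = b'{"event": "tool_call", "tool": "search", "status": "ok"}'
target = b'{"event": "tool_call", "tool": "fetch", "status": "ok"}'

# Gate on similarity the same way a packer would before delta-encoding.
if content_similarity(base, target) >= 0.7:
    delta = compute_delta(base, target)
    assert apply_delta(base, delta) == target  # reconstruction is lossless

    cache = DeltaCache()
    cache.add_delta("hash-base", "hash-target", delta)
    sizes = {"hash-base": len(base), "hash-target": len(target)}
    original, compressed = cache.estimate_total_savings(sizes)
    print(f"pack estimate: {original} -> {compressed} bytes")

# find_similar_objects() skips objects under min_size (default 100 bytes),
# so these short samples only group once min_size is lowered:
groups = find_similar_objects({"hash-base": base, "hash-target": target}, min_size=10)

The round trip holds because compute_delta emits an insert op for every target span that SequenceMatcher leaves unmatched and a copy op for every matched block, so apply_delta replays the target byte for byte regardless of how good the match quality is.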