agmem 0.1.6__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {agmem-0.1.6.dist-info → agmem-0.2.1.dist-info}/METADATA +15 -8
- {agmem-0.1.6.dist-info → agmem-0.2.1.dist-info}/RECORD +25 -16
- memvcs/__init__.py +1 -1
- memvcs/cli.py +1 -1
- memvcs/commands/daemon.py +37 -1
- memvcs/commands/distill.py +6 -0
- memvcs/coordinator/__init__.py +5 -0
- memvcs/coordinator/server.py +239 -0
- memvcs/core/compression_metrics.py +248 -0
- memvcs/core/delta.py +258 -0
- memvcs/core/distiller.py +76 -61
- memvcs/core/fast_similarity.py +404 -0
- memvcs/core/federated.py +13 -2
- memvcs/core/gardener.py +8 -68
- memvcs/core/pack.py +192 -34
- memvcs/core/privacy_validator.py +187 -0
- memvcs/core/protocol_builder.py +198 -0
- memvcs/core/remote.py +82 -2
- memvcs/core/zk_proofs.py +62 -5
- memvcs/health/__init__.py +25 -0
- memvcs/health/monitor.py +452 -0
- {agmem-0.1.6.dist-info → agmem-0.2.1.dist-info}/WHEEL +0 -0
- {agmem-0.1.6.dist-info → agmem-0.2.1.dist-info}/entry_points.txt +0 -0
- {agmem-0.1.6.dist-info → agmem-0.2.1.dist-info}/licenses/LICENSE +0 -0
- {agmem-0.1.6.dist-info → agmem-0.2.1.dist-info}/top_level.txt +0 -0
memvcs/core/compression_metrics.py
ADDED

@@ -0,0 +1,248 @@
```python
"""
Delta compression metrics and observability.

Tracks compression effectiveness across object types to enable future
optimization and auto-tuning of delta encoding parameters.

Provides:
- DeltaCompressionMetrics: Tracks compression ratio, object types, benefits
- get_heatmap: Renders a text heatmap of which types compress best
- Statistics reporting for gc --repack operations
"""

from dataclasses import dataclass, field
from typing import Dict, List, Any, Optional, Tuple
from collections import defaultdict


@dataclass
class ObjectCompressionStats:
    """Statistics for a single object's compression."""

    object_id: str
    object_type: str  # "semantic", "episodic", "procedural"
    original_size: int  # bytes
    compressed_size: int  # bytes after delta encoding
    compression_ratio: float  # compressed_size / original_size (0.0 = 100% compression)
    delta_used: bool  # Whether delta encoding was applied
    compression_benefit: float  # original_size - compressed_size


@dataclass
class TypeCompressionStats:
    """Aggregated statistics for an object type."""

    object_type: str
    count: int = 0
    total_original_size: int = 0
    total_compressed_size: int = 0
    avg_compression_ratio: float = 0.0
    total_benefit: int = 0  # Total bytes saved
    objects_with_delta: int = 0  # How many used delta encoding
    min_ratio: float = 1.0
    max_ratio: float = 0.0

    def update_from_object(self, obj_stats: ObjectCompressionStats) -> None:
        """Update type stats with a single object's stats."""
        self.count += 1
        self.total_original_size += obj_stats.original_size
        self.total_compressed_size += obj_stats.compressed_size
        self.total_benefit += int(obj_stats.compression_benefit)
        if obj_stats.delta_used:
            self.objects_with_delta += 1
        self.min_ratio = min(self.min_ratio, obj_stats.compression_ratio)
        self.max_ratio = max(self.max_ratio, obj_stats.compression_ratio)

        # Recalculate average
        if self.total_original_size > 0:
            self.avg_compression_ratio = self.total_compressed_size / self.total_original_size

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dict for reporting."""
        savings_pct = 0.0
        if self.total_original_size > 0:
            savings_pct = (self.total_benefit / self.total_original_size) * 100

        return {
            "object_type": self.object_type,
            "count": self.count,
            "total_original_bytes": self.total_original_size,
            "total_compressed_bytes": self.total_compressed_size,
            "avg_compression_ratio": round(self.avg_compression_ratio, 3),
            "compression_range": f"{self.min_ratio:.1%} - {self.max_ratio:.1%}",
            "total_bytes_saved": self.total_benefit,
            "savings_percentage": round(savings_pct, 1),
            "objects_using_delta": self.objects_with_delta,
            "delta_adoption_rate": (
                round((self.objects_with_delta / self.count * 100), 1) if self.count > 0 else 0
            ),
        }


class DeltaCompressionMetrics:
    """Tracks delta compression statistics across all objects.

    Usage:
        metrics = DeltaCompressionMetrics()
        # ... during packing ...
        metrics.record_object(ObjectCompressionStats(...))
        # ... after packing ...
        report = metrics.get_report()
    """

    def __init__(self):
        self.objects: List[ObjectCompressionStats] = []
        self.type_stats: Dict[str, TypeCompressionStats] = {}
        self.total_original_size: int = 0
        self.total_compressed_size: int = 0

    def record_object(self, obj_stats: ObjectCompressionStats) -> None:
        """Record compression stats for a single object."""
        self.objects.append(obj_stats)
        self.total_original_size += obj_stats.original_size
        self.total_compressed_size += obj_stats.compressed_size

        # Update type-specific stats
        if obj_stats.object_type not in self.type_stats:
            self.type_stats[obj_stats.object_type] = TypeCompressionStats(
                object_type=obj_stats.object_type
            )
        self.type_stats[obj_stats.object_type].update_from_object(obj_stats)

    def get_type_stats(self, object_type: str) -> Optional[TypeCompressionStats]:
        """Get stats for a specific object type."""
        return self.type_stats.get(object_type)

    def get_overall_ratio(self) -> float:
        """Get overall compression ratio across all objects."""
        if self.total_original_size == 0:
            return 0.0
        return self.total_compressed_size / self.total_original_size

    def get_overall_savings(self) -> int:
        """Get total bytes saved across all objects."""
        return self.total_original_size - self.total_compressed_size

    def get_report(self) -> Dict[str, Any]:
        """Generate a comprehensive compression report."""
        overall_ratio = self.get_overall_ratio()
        overall_savings = self.get_overall_savings()
        savings_pct = (
            (overall_savings / self.total_original_size * 100)
            if self.total_original_size > 0
            else 0
        )

        return {
            "timestamp": None,  # Set by caller if needed
            "total_objects": len(self.objects),
            "total_original_bytes": self.total_original_size,
            "total_compressed_bytes": self.total_compressed_size,
            "overall_compression_ratio": round(overall_ratio, 3),
            "total_bytes_saved": overall_savings,
            "compression_percentage": round(savings_pct, 1),
            "type_statistics": {otype: stats.to_dict() for otype, stats in self.type_stats.items()},
            "recommendations": self._generate_recommendations(),
        }

    def _generate_recommendations(self) -> List[str]:
        """Generate optimization recommendations based on compression stats."""
        recommendations = []

        # Check if delta encoding is worth it
        objects_with_delta = sum(s.objects_with_delta for s in self.type_stats.values())
        if objects_with_delta == 0:
            recommendations.append("No objects used delta encoding. Check similarity thresholds.")

        # Check for types with poor compression
        for otype, stats in self.type_stats.items():
            if stats.count > 0 and stats.avg_compression_ratio > 0.9:
                recommendations.append(
                    f"Type '{otype}' compresses poorly (ratio: {stats.avg_compression_ratio:.1%}). "
                    f"Consider increasing similarity threshold or reducing delta cost."
                )

        # Check for types with excellent compression
        for otype, stats in self.type_stats.items():
            if stats.count > 0 and stats.avg_compression_ratio < 0.5:
                recommendations.append(
                    f"Type '{otype}' compresses very well (ratio: {stats.avg_compression_ratio:.1%}). "
                    f"Consider aggressive delta encoding or reduced threshold."
                )

        if not recommendations:
            recommendations.append("Compression is operating normally.")

        return recommendations

    def get_heatmap(self) -> str:
        """Generate a text-based compression heatmap."""
        lines = ["Delta Compression Heatmap", "=" * 50]

        if not self.type_stats:
            lines.append("No compression data available")
            return "\n".join(lines)

        # Sort by compression ratio
        sorted_types = sorted(
            self.type_stats.values(),
            key=lambda s: s.avg_compression_ratio,
        )

        for stats in sorted_types:
            if stats.count == 0:
                continue
            ratio = stats.avg_compression_ratio
            # Create a simple bar chart
            bar_width = 30
            filled = int(bar_width * ratio)
            bar = "█" * filled + "░" * (bar_width - filled)
            saved_pct = (
                (stats.total_benefit / stats.total_original_size * 100)
                if stats.total_original_size > 0
                else 0
            )
            lines.append(
                f"{stats.object_type:12} {bar} {saved_pct:5.1f}% saved ({stats.objects_with_delta}/{stats.count} using delta)"
            )

        return "\n".join(lines)

    def log_report(self, logger: Any = None) -> None:
        """Log the compression report."""
        report = self.get_report()
        heatmap = self.get_heatmap()

        output = [
            "=" * 70,
            "Delta Compression Report",
            "=" * 70,
            f"Total Objects: {report['total_objects']}",
            f"Total Original: {report['total_original_bytes']:,} bytes",
            f"Total Compressed: {report['total_compressed_bytes']:,} bytes",
            f"Overall Ratio: {report['overall_compression_ratio']:.1%}",
            f"Bytes Saved: {report['total_bytes_saved']:,} ({report['compression_percentage']:.1f}%)",
            "",
            heatmap,
            "",
            "Type Breakdown:",
        ]

        for otype, stats in sorted(report["type_statistics"].items()):
            output.append(f"  {otype}:")
            output.append(f"    Count: {stats['count']}")
            output.append(f"    Compression: {stats['avg_compression_ratio']:.1%}")
            output.append(f"    Saved: {stats['total_bytes_saved']:,} bytes")
            output.append(f"    Delta adoption: {stats['delta_adoption_rate']:.0f}%")

        output.extend(["", "Recommendations:"])
        for rec in report["recommendations"]:
            output.append(f"  - {rec}")

        output.append("=" * 70)

        full_output = "\n".join(output)
        if logger:
            logger.info(full_output)
        else:
            print(full_output)
```
memvcs/core/delta.py
ADDED
@@ -0,0 +1,258 @@
```python
"""
Delta encoding for pack files.

Compress similar objects using delta encoding. For objects with similar content,
store the first in full and subsequent ones as deltas (differences).

This can achieve 5-10x compression improvement for highly similar content
(common in agent episodic logs, semantic consolidations, etc).
"""

import hashlib
from typing import List, Tuple, Dict, Optional


def levenshtein_distance(s1: bytes, s2: bytes) -> int:
    """
    Compute Levenshtein distance between two byte sequences.
    Returns edit distance (insertions, deletions, substitutions).
    """
    if len(s1) < len(s2):
        s1, s2 = s2, s1

    if len(s2) == 0:
        return len(s1)

    prev = list(range(len(s2) + 1))
    for i, c1 in enumerate(s1):
        curr = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = prev[j + 1] + 1
            deletions = curr[j] + 1
            substitutions = prev[j] + (c1 != c2)
            curr.append(min(insertions, deletions, substitutions))
        prev = curr

    return prev[-1]


def content_similarity(data1: bytes, data2: bytes) -> float:
    """
    Calculate similarity between two byte sequences (0.0 to 1.0).
    Based on Levenshtein distance normalized by max length.
    """
    if not data1 or not data2:
        return 0.0

    distance = levenshtein_distance(data1, data2)
    max_len = max(len(data1), len(data2))

    if max_len == 0:
        return 1.0

    return 1.0 - (distance / max_len)


def find_similar_objects(
    objects: Dict[str, bytes],
    similarity_threshold: float = 0.7,
    min_size: int = 100,
) -> List[List[str]]:
    """
    Group objects by similarity.

    Returns list of groups, where each group is a list of object hashes
    sorted by size (smallest first - best compression base).
    Only includes objects >= min_size.

    Args:
        objects: dict of hash_id -> content
        similarity_threshold: minimum similarity (0.0-1.0) to group
        min_size: minimum object size to consider for delta

    Returns:
        List of similarity groups, each sorted by size ascending
    """
    candidates = {h: content for h, content in objects.items() if len(content) >= min_size}

    if not candidates:
        return []

    grouped = {}
    used = set()

    for hash_id, content in candidates.items():
        if hash_id in used:
            continue

        group = [hash_id]
        used.add(hash_id)

        for other_id, other_content in candidates.items():
            if other_id in used:
                continue

            similarity = content_similarity(content, other_content)
            if similarity >= similarity_threshold:
                group.append(other_id)
                used.add(other_id)

        if len(group) > 1:
            # Sort by size ascending (smallest first = best base)
            group.sort(key=lambda h: len(candidates[h]))
            grouped[group[0]] = group

    return list(grouped.values())
```
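
A quick illustration of the grouping helpers above (a sketch with fabricated byte strings; the hash keys are just labels): two near-identical episodic payloads land in one group, while a dissimilar object joins no group and would be stored in full.

```python
# Fabricated objects; only the module path is taken from this diff.
from memvcs.core.delta import content_similarity, find_similar_objects

a = b"event: tool_call result=ok duration=120ms\n" * 12
b = b"event: tool_call result=ok duration=125ms\n" * 12
c = b"a completely unrelated blob of other text.\n" * 12

print(round(content_similarity(a, b), 3))  # 0.977: one byte differs per repeat

groups = find_similar_objects(
    {"ha": a, "hb": b, "hc": c},
    similarity_threshold=0.7,
    min_size=100,
)
print(groups)  # [['ha', 'hb']] -- 'hc' stays ungrouped
```

The file continues with the delta encoder, decoder, and cache: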
```python
def compute_delta(base: bytes, target: bytes) -> bytes:
    """
    Compute delta from base to target using simple run-length + offset encoding.

    Format:
    - 0x00: Copy op - next 4 bytes = offset in base, next 4 bytes = length
    - 0x01: Insert op - next 4 bytes = length, then <length> bytes of data
    - 0x02: End marker

    This is NOT the most efficient delta algorithm but simple and effective
    for similar objects. Production code could use bsdiff, xdelta3, etc.
    """
    from difflib import SequenceMatcher

    matcher = SequenceMatcher(None, base, target)
    matching_blocks = matcher.get_matching_blocks()

    delta = bytearray()
    target_pos = 0

    for block in matching_blocks:
        base_start, target_start, size = block.a, block.b, block.size

        # Insert any unmapped target bytes before this block
        if target_start > target_pos:
            insert_len = target_start - target_pos
            insert_data = target[target_pos:target_start]
            delta.append(0x01)  # Insert op
            delta.extend(insert_len.to_bytes(4, "big"))
            delta.extend(insert_data)

        # Copy block from base
        if size > 0:
            delta.append(0x00)  # Copy op
            delta.extend(base_start.to_bytes(4, "big"))
            delta.extend(size.to_bytes(4, "big"))

        target_pos = target_start + size

    # Insert any remaining target bytes
    if target_pos < len(target):
        insert_len = len(target) - target_pos
        insert_data = target[target_pos:]
        delta.append(0x01)  # Insert op
        delta.extend(insert_len.to_bytes(4, "big"))
        delta.extend(insert_data)

    delta.append(0x02)  # End marker

    return bytes(delta)


def apply_delta(base: bytes, delta: bytes) -> bytes:
    """Apply delta to base to reconstruct target."""
    result = bytearray()
    pos = 0

    while pos < len(delta):
        op = delta[pos]
        pos += 1

        if op == 0x00:  # Copy op
            if pos + 8 > len(delta):
                break
            offset = int.from_bytes(delta[pos : pos + 4], "big")
            length = int.from_bytes(delta[pos + 4 : pos + 8], "big")
            pos += 8
            result.extend(base[offset : offset + length])

        elif op == 0x01:  # Insert op
            if pos + 4 > len(delta):
                break
            length = int.from_bytes(delta[pos : pos + 4], "big")
            pos += 4
            if pos + length > len(delta):
                break
            result.extend(delta[pos : pos + length])
            pos += length

        elif op == 0x02:  # End marker
            break

    return bytes(result)


def estimate_delta_compression(base: bytes, target: bytes, delta: bytes) -> Tuple[int, float]:
    """
    Estimate compression achieved by delta.

    Returns (original_size, ratio) where ratio = 1.0 is no compression,
    ratio = 0.5 means delta is 50% of original target size.
    """
    original_size = len(target)
    delta_size = len(delta)

    if original_size == 0:
        return (0, 0.0)

    ratio = delta_size / original_size
    return (original_size, ratio)


class DeltaCache:
    """
    Cache deltas between similar objects.

    Tracks base->target relationships and stores pre-computed deltas
    to avoid recomputation.
    """

    def __init__(self):
        self.deltas: Dict[Tuple[str, str], bytes] = {}  # (base_hash, target_hash) -> delta
        self.bases: Dict[str, str] = {}  # target_hash -> base_hash (reconstruction path)

    def add_delta(self, base_hash: str, target_hash: str, delta: bytes):
        """Register a delta relationship."""
        self.deltas[(base_hash, target_hash)] = delta
        self.bases[target_hash] = base_hash

    def get_delta(self, base_hash: str, target_hash: str) -> Optional[bytes]:
        """Retrieve cached delta."""
        return self.deltas.get((base_hash, target_hash))

    def get_base(self, target_hash: str) -> Optional[str]:
        """Get the base hash for a target."""
        return self.bases.get(target_hash)

    def estimate_total_savings(self, objects: Dict[str, int]) -> Tuple[int, int]:
        """
        Estimate total size savings from all deltas.

        Returns (original_total, compressed_total).

        Args:
            objects: dict of hash_id -> original_size
        """
        original_total = sum(objects.values())
        compressed_total = 0

        for (base_hash, target_hash), delta in self.deltas.items():
            # Target stored as delta instead of full copy
            compressed_total += len(delta)

        # Add all non-delta objects
        all_objects = set(objects.keys())
        delta_targets = set(self.bases.keys())
        non_delta = all_objects - delta_targets
        for obj_hash in non_delta:
            compressed_total += objects.get(obj_hash, 0)

        return (original_total, compressed_total)
```