tokenshrink 0.1.0-py3-none-any.whl → 0.2.0-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- tokenshrink/__init__.py +9 -5
- tokenshrink/cli.py +59 -2
- tokenshrink/pipeline.py +282 -5
- {tokenshrink-0.1.0.dist-info → tokenshrink-0.2.0.dist-info}/METADATA +76 -2
- tokenshrink-0.2.0.dist-info/RECORD +8 -0
- tokenshrink-0.1.0.dist-info/RECORD +0 -8
- {tokenshrink-0.1.0.dist-info → tokenshrink-0.2.0.dist-info}/WHEEL +0 -0
- {tokenshrink-0.1.0.dist-info → tokenshrink-0.2.0.dist-info}/entry_points.txt +0 -0
- {tokenshrink-0.1.0.dist-info → tokenshrink-0.2.0.dist-info}/licenses/LICENSE +0 -0
tokenshrink/__init__.py
CHANGED
@@ -3,6 +3,9 @@ TokenShrink: Cut your AI costs 50-80%.
 
 FAISS semantic retrieval + LLMLingua compression for token-efficient context loading.
 
+v0.2.0: REFRAG-inspired adaptive compression, cross-passage deduplication,
+importance scoring. See README for details.
+
 Usage:
     from tokenshrink import TokenShrink
 
@@ -10,8 +13,9 @@ Usage:
     ts.index("./docs")
 
     result = ts.query("What are the API limits?")
-    print(result.context)
-    print(result.savings)
+    print(result.context)       # Compressed, relevant context
+    print(result.savings)       # "Saved 72% (1200 → 336 tokens, 2 redundant chunks removed)"
+    print(result.chunk_scores)  # Per-chunk importance scores
 
 CLI:
     tokenshrink index ./docs
@@ -19,7 +23,7 @@ CLI:
     tokenshrink stats
 """
 
-from tokenshrink.pipeline import TokenShrink, ShrinkResult
+from tokenshrink.pipeline import TokenShrink, ShrinkResult, ChunkScore
 
-__version__ = "0.1.0"
-__all__ = ["TokenShrink", "ShrinkResult"]
+__version__ = "0.2.0"
+__all__ = ["TokenShrink", "ShrinkResult", "ChunkScore"]
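
For reference, a minimal usage sketch of the 0.2.0 surface exported here. The per-query `adaptive`/`dedup` overrides and the `ChunkScore` fields come from the `pipeline.py` diff below; the index path and question are placeholders.

```python
from tokenshrink import TokenShrink, ChunkScore  # ChunkScore is newly exported in 0.2.0

ts = TokenShrink()                      # adaptive compression and dedup default to on
ts.index("./docs")                      # placeholder path

# Per-query overrides: None keeps the constructor default, True/False force a setting.
result = ts.query("What are the API limits?", adaptive=True, dedup=False)

print(result.savings)                   # e.g. "Saved NN% (X → Y tokens, ...)"
for cs in result.chunk_scores:          # list[ChunkScore]
    print(cs.source, cs.similarity, cs.density, cs.importance, cs.compression_ratio)
```
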
tokenshrink/cli.py
CHANGED
@@ -74,6 +74,27 @@ def main():
         default=2000,
         help="Target token limit (default: 2000)",
     )
+    query_parser.add_argument(
+        "--adaptive",
+        action="store_true",
+        default=None,
+        help="Enable REFRAG-inspired adaptive compression (default: on)",
+    )
+    query_parser.add_argument(
+        "--no-adaptive",
+        action="store_true",
+        help="Disable adaptive compression",
+    )
+    query_parser.add_argument(
+        "--no-dedup",
+        action="store_true",
+        help="Disable cross-passage deduplication",
+    )
+    query_parser.add_argument(
+        "--scores",
+        action="store_true",
+        help="Show per-chunk importance scores",
+    )
 
     # search (alias for query without compression)
     search_parser = subparsers.add_parser("search", help="Search without compression")
@@ -128,25 +149,61 @@ def main():
         elif args.no_compress:
             compress = False
 
+        adaptive_flag = None
+        if getattr(args, 'adaptive', None):
+            adaptive_flag = True
+        elif getattr(args, 'no_adaptive', False):
+            adaptive_flag = False
+
+        dedup_flag = None
+        if getattr(args, 'no_dedup', False):
+            dedup_flag = False
+
         result = ts.query(
             args.question,
             k=args.k,
             max_tokens=args.max_tokens,
             compress=compress,
+            adaptive=adaptive_flag,
+            dedup=dedup_flag,
         )
 
         if args.json:
-            print(json.dumps({
+            output = {
                 "context": result.context,
                 "sources": result.sources,
                 "original_tokens": result.original_tokens,
                 "compressed_tokens": result.compressed_tokens,
                 "savings_pct": result.savings_pct,
-            }, indent=2))
+                "dedup_removed": result.dedup_removed,
+            }
+            if getattr(args, 'scores', False) and result.chunk_scores:
+                output["chunk_scores"] = [
+                    {
+                        "source": Path(cs.source).name,
+                        "similarity": round(cs.similarity, 3),
+                        "density": round(cs.density, 3),
+                        "importance": round(cs.importance, 3),
+                        "compression_ratio": round(cs.compression_ratio, 3),
+                        "deduplicated": cs.deduplicated,
+                    }
+                    for cs in result.chunk_scores
+                ]
+            print(json.dumps(output, indent=2))
         else:
             if result.sources:
                 print(f"Sources: {', '.join(Path(s).name for s in result.sources)}")
             print(f"Stats: {result.savings}")
+
+            if getattr(args, 'scores', False) and result.chunk_scores:
+                print("\nChunk Importance Scores:")
+                for cs in result.chunk_scores:
+                    status = " [DEDUP]" if cs.deduplicated else ""
+                    print(f"  {Path(cs.source).name}: "
+                          f"sim={cs.similarity:.2f} density={cs.density:.2f} "
+                          f"importance={cs.importance:.2f} ratio={cs.compression_ratio:.2f}"
+                          f"{status}")
+
             print()
             print(result.context)
     else:
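
The new toggles are tri-state by design: passing neither `--adaptive` nor `--no-adaptive` leaves `adaptive_flag` as `None`, so `ts.query()` falls back to the pipeline default, while either flag forces the setting. A small isolated sketch of that resolution logic (the argument names mirror the parser above; the sample argv is made up):

```python
import argparse

# Mirror of the three new boolean flags added to the query subcommand above.
parser = argparse.ArgumentParser()
parser.add_argument("--adaptive", action="store_true", default=None)
parser.add_argument("--no-adaptive", action="store_true")
parser.add_argument("--no-dedup", action="store_true")

args = parser.parse_args(["--no-adaptive"])   # hypothetical command line

adaptive_flag = None            # None → query() uses the TokenShrink constructor default
if args.adaptive:
    adaptive_flag = True
elif args.no_adaptive:
    adaptive_flag = False

dedup_flag = False if args.no_dedup else None

print(adaptive_flag, dedup_flag)              # prints: False None
```
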
tokenshrink/pipeline.py
CHANGED
@@ -1,12 +1,15 @@
 """
 TokenShrink core: FAISS retrieval + LLMLingua compression.
+
+v0.2.0: REFRAG-inspired adaptive compression, deduplication, importance scoring.
 """
 
 import os
 import json
 import hashlib
+import math
 from pathlib import Path
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import Optional
 
 import faiss
@@ -21,6 +24,19 @@ except ImportError:
     HAS_COMPRESSION = False
 
 
+@dataclass
+class ChunkScore:
+    """Per-chunk scoring metadata (REFRAG-inspired)."""
+    index: int
+    text: str
+    source: str
+    similarity: float           # Cosine similarity to query
+    density: float              # Information density (entropy proxy)
+    importance: float           # Combined importance score
+    compression_ratio: float    # Adaptive ratio assigned to this chunk
+    deduplicated: bool = False  # Flagged as redundant
+
+
 @dataclass
 class ShrinkResult:
     """Result from a query."""
@@ -29,17 +45,122 @@ class ShrinkResult:
     original_tokens: int
     compressed_tokens: int
     ratio: float
+    chunk_scores: list[ChunkScore] = field(default_factory=list)
+    dedup_removed: int = 0
 
     @property
     def savings(self) -> str:
         pct = (1 - self.ratio) * 100
-        return f"Saved {pct:.0f}% ({self.original_tokens} → {self.compressed_tokens} tokens)"
+        extra = ""
+        if self.dedup_removed > 0:
+            extra = f", {self.dedup_removed} redundant chunks removed"
+        return f"Saved {pct:.0f}% ({self.original_tokens} → {self.compressed_tokens} tokens{extra})"
 
     @property
     def savings_pct(self) -> float:
         return (1 - self.ratio) * 100
 
 
+# ---------------------------------------------------------------------------
+# REFRAG-inspired utilities
+# ---------------------------------------------------------------------------
+
+def _information_density(text: str) -> float:
+    """
+    Estimate information density of text via character-level entropy.
+    Higher entropy ≈ more information-dense (code, data, technical content).
+    Lower entropy ≈ more redundant (boilerplate, filler).
+    Returns 0.0-1.0 normalized score.
+    """
+    if not text:
+        return 0.0
+
+    freq = {}
+    for ch in text.lower():
+        freq[ch] = freq.get(ch, 0) + 1
+
+    total = len(text)
+    entropy = 0.0
+    for count in freq.values():
+        p = count / total
+        if p > 0:
+            entropy -= p * math.log2(p)
+
+    # Normalize: English text entropy is ~4.0-4.5 bits/char
+    # Code/data is ~5.0-6.0, very repetitive text is ~2.0-3.0
+    # Map to 0-1 range with midpoint at ~4.5
+    normalized = min(1.0, max(0.0, (entropy - 2.0) / 4.0))
+    return normalized
+
+
+def _compute_importance(similarity: float, density: float,
+                        sim_weight: float = 0.7, density_weight: float = 0.3) -> float:
+    """
+    Combined importance score from similarity and density.
+    REFRAG insight: not all retrieved chunks contribute equally.
+    High similarity + high density = most important (compress less).
+    Low similarity + low density = least important (compress more or drop).
+    """
+    return sim_weight * similarity + density_weight * density
+
+
+def _adaptive_ratio(importance: float, base_ratio: float = 0.5,
+                    min_ratio: float = 0.2, max_ratio: float = 0.9) -> float:
+    """
+    Map importance score to compression ratio.
+    High importance → keep more (higher ratio, less compression).
+    Low importance → compress harder (lower ratio).
+
+    ratio=1.0 means keep everything, ratio=0.2 means keep 20%.
+    """
+    # Linear interpolation: low importance → min_ratio, high → max_ratio
+    ratio = min_ratio + importance * (max_ratio - min_ratio)
+    return min(max_ratio, max(min_ratio, ratio))
+
+
+def _deduplicate_chunks(chunks: list[dict], embeddings: np.ndarray,
+                        threshold: float = 0.85) -> tuple[list[dict], list[int]]:
+    """
+    Remove near-duplicate chunks using embedding cosine similarity.
+    REFRAG insight: block-diagonal attention means redundant passages waste compute.
+
+    Returns: (deduplicated_chunks, removed_indices)
+    """
+    if len(chunks) <= 1:
+        return chunks, []
+
+    # Compute pairwise similarities
+    # embeddings should already be normalized (from SentenceTransformer with normalize_embeddings=True)
+    sim_matrix = embeddings @ embeddings.T
+
+    keep = []
+    removed = []
+    kept_indices = set()
+
+    # Greedy: keep highest-scored chunks, remove near-duplicates
+    # Sort by score descending
+    scored = sorted(enumerate(chunks), key=lambda x: x[1].get("score", 0), reverse=True)
+
+    for idx, chunk in scored:
+        if idx in removed:
+            continue
+
+        # Check if this chunk is too similar to any already-kept chunk
+        is_dup = False
+        for kept_idx in kept_indices:
+            if sim_matrix[idx, kept_idx] > threshold:
+                is_dup = True
+                break
+
+        if is_dup:
+            removed.append(idx)
+        else:
+            keep.append(chunk)
+            kept_indices.add(idx)
+
+    return keep, removed
+
+
 class TokenShrink:
     """
     Token-efficient context loading.
@@ -59,6 +180,9 @@ class TokenShrink:
         chunk_overlap: int = 50,
         device: str = "auto",
         compression: bool = True,
+        adaptive: bool = True,
+        dedup: bool = True,
+        dedup_threshold: float = 0.85,
     ):
         """
         Initialize TokenShrink.
@@ -70,11 +194,17 @@ class TokenShrink:
             chunk_overlap: Overlap between chunks.
             device: Device for compression (auto, mps, cuda, cpu).
             compression: Enable LLMLingua compression.
+            adaptive: Enable REFRAG-inspired adaptive compression (v0.2).
+            dedup: Enable cross-passage deduplication (v0.2).
+            dedup_threshold: Cosine similarity threshold for dedup (0-1).
         """
         self.index_dir = Path(index_dir or ".tokenshrink")
         self.chunk_size = chunk_size
         self.chunk_overlap = chunk_overlap
         self._compression_enabled = compression and HAS_COMPRESSION
+        self._adaptive = adaptive
+        self._dedup = dedup
+        self._dedup_threshold = dedup_threshold
 
         # Auto-detect device
         if device == "auto":
@@ -219,6 +349,8 @@ class TokenShrink:
         min_score: float = 0.3,
         max_tokens: int = 2000,
         compress: Optional[bool] = None,
+        adaptive: Optional[bool] = None,
+        dedup: Optional[bool] = None,
     ) -> ShrinkResult:
         """
         Get relevant, compressed context for a question.
@@ -229,9 +361,11 @@ class TokenShrink:
             min_score: Minimum similarity score (0-1).
             max_tokens: Target token limit for compression.
             compress: Override compression setting.
+            adaptive: Override adaptive compression (REFRAG-inspired).
+            dedup: Override deduplication setting.
 
         Returns:
-            ShrinkResult with context, sources, and token stats.
+            ShrinkResult with context, sources, token stats, and chunk scores.
         """
         if self._index.ntotal == 0:
             return ShrinkResult(
@@ -242,6 +376,9 @@ class TokenShrink:
                 ratio=1.0,
             )
 
+        use_adaptive = adaptive if adaptive is not None else self._adaptive
+        use_dedup = dedup if dedup is not None else self._dedup
+
         # Retrieve
         embedding = self._model.encode([question], normalize_embeddings=True)
         scores, indices = self._index.search(
@@ -250,10 +387,12 @@ class TokenShrink:
         )
 
         results = []
+        result_embeddings = []
        for score, idx in zip(scores[0], indices[0]):
             if idx >= 0 and score >= min_score:
                 chunk = self._chunks[idx].copy()
                 chunk["score"] = float(score)
+                chunk["_idx"] = int(idx)
                 results.append(chunk)
 
         if not results:
@@ -265,6 +404,60 @@ class TokenShrink:
                 ratio=1.0,
             )
 
+        # ── REFRAG Step 1: Importance scoring ──
+        chunk_scores = []
+        for i, chunk in enumerate(results):
+            density = _information_density(chunk["text"])
+            importance = _compute_importance(chunk["score"], density)
+            comp_ratio = _adaptive_ratio(importance) if use_adaptive else 0.5
+
+            chunk_scores.append(ChunkScore(
+                index=i,
+                text=chunk["text"][:100] + "..." if len(chunk["text"]) > 100 else chunk["text"],
+                source=chunk["source"],
+                similarity=chunk["score"],
+                density=density,
+                importance=importance,
+                compression_ratio=comp_ratio,
+            ))
+
+        # ── REFRAG Step 2: Cross-passage deduplication ──
+        dedup_removed = 0
+        if use_dedup and len(results) > 1:
+            # Get embeddings for dedup
+            chunk_texts = [c["text"] for c in results]
+            chunk_embs = self._model.encode(chunk_texts, normalize_embeddings=True)
+
+            deduped, removed_indices = _deduplicate_chunks(
+                results, np.array(chunk_embs, dtype=np.float32),
+                threshold=self._dedup_threshold
+            )
+
+            dedup_removed = len(removed_indices)
+
+            # Mark removed chunks in scores
+            for idx in removed_indices:
+                if idx < len(chunk_scores):
+                    chunk_scores[idx].deduplicated = True
+
+            results = deduped
+
+        # Sort remaining by importance (highest first)
+        if use_adaptive:
+            # Pair results with their scores for sorting
+            result_score_pairs = []
+            for chunk in results:
+                # Find matching score
+                for cs in chunk_scores:
+                    if not cs.deduplicated and cs.source == chunk["source"] and cs.similarity == chunk["score"]:
+                        result_score_pairs.append((chunk, cs))
+                        break
+                else:
+                    result_score_pairs.append((chunk, None))
+
+            result_score_pairs.sort(key=lambda x: x[1].importance if x[1] else 0, reverse=True)
+            results = [pair[0] for pair in result_score_pairs]
+
         # Combine chunks
         combined = "\n\n---\n\n".join(
             f"[{Path(c['source']).name}]\n{c['text']}" for c in results
@@ -274,17 +467,23 @@ class TokenShrink:
         # Estimate tokens
         original_tokens = len(combined.split())
 
-        #
+        # ── REFRAG Step 3: Adaptive compression ──
         should_compress = compress if compress is not None else self._compression_enabled
 
         if should_compress and original_tokens > 100:
-            compressed, stats = self._compress(combined, max_tokens)
+            if use_adaptive:
+                compressed, stats = self._compress_adaptive(results, chunk_scores, max_tokens)
+            else:
+                compressed, stats = self._compress(combined, max_tokens)
+
             return ShrinkResult(
                 context=compressed,
                 sources=sources,
                 original_tokens=stats["original"],
                 compressed_tokens=stats["compressed"],
                 ratio=stats["ratio"],
+                chunk_scores=chunk_scores,
+                dedup_removed=dedup_removed,
             )
 
         return ShrinkResult(
@@ -293,8 +492,86 @@ class TokenShrink:
             original_tokens=original_tokens,
             compressed_tokens=original_tokens,
             ratio=1.0,
+            chunk_scores=chunk_scores,
+            dedup_removed=dedup_removed,
         )
 
+    def _compress_adaptive(self, chunks: list[dict], scores: list[ChunkScore],
+                           max_tokens: int) -> tuple[str, dict]:
+        """
+        REFRAG-inspired adaptive compression: each chunk gets a different
+        compression ratio based on its importance score.
+
+        High-importance chunks (high similarity + high density) are kept
+        nearly intact. Low-importance chunks are compressed aggressively.
+        """
+        compressor = self._get_compressor()
+
+        # Build a map from chunk source+score to its ChunkScore
+        score_map = {}
+        for cs in scores:
+            if not cs.deduplicated:
+                score_map[(cs.source, cs.similarity)] = cs
+
+        compressed_parts = []
+        total_original = 0
+        total_compressed = 0
+
+        for chunk in chunks:
+            text = f"[{Path(chunk['source']).name}]\n{chunk['text']}"
+            cs = score_map.get((chunk["source"], chunk.get("score", 0)))
+
+            # Determine per-chunk ratio
+            if cs:
+                target_ratio = cs.compression_ratio
+            else:
+                target_ratio = 0.5  # Default fallback
+
+            est_tokens = len(text.split())
+
+            if est_tokens < 20:
+                # Too short to compress meaningfully
+                compressed_parts.append(text)
+                total_original += est_tokens
+                total_compressed += est_tokens
+                continue
+
+            try:
+                # Compress with chunk-specific ratio
+                max_chars = 1500
+                if len(text) <= max_chars:
+                    result = compressor.compress_prompt(
+                        text,
+                        rate=target_ratio,
+                        force_tokens=["\n", ".", "!", "?"],
+                    )
+                    compressed_parts.append(result["compressed_prompt"])
+                    total_original += result["origin_tokens"]
+                    total_compressed += result["compressed_tokens"]
+                else:
+                    # Sub-chunk large texts
+                    parts = [text[i:i+max_chars] for i in range(0, len(text), max_chars)]
+                    for part in parts:
+                        if not part.strip():
+                            continue
+                        r = compressor.compress_prompt(part, rate=target_ratio)
+                        compressed_parts.append(r["compressed_prompt"])
+                        total_original += r["origin_tokens"]
+                        total_compressed += r["compressed_tokens"]
+            except Exception:
+                # Fallback: use uncompressed
+                compressed_parts.append(text)
+                total_original += est_tokens
+                total_compressed += est_tokens
+
+        combined = "\n\n---\n\n".join(compressed_parts)
+
+        return combined, {
+            "original": total_original,
+            "compressed": total_compressed,
+            "ratio": total_compressed / total_original if total_original else 1.0,
+        }
+
     def _compress(self, text: str, max_tokens: int) -> tuple[str, dict]:
         """Compress text using LLMLingua-2."""
         compressor = self._get_compressor()
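
To see how the scoring utilities above fit together, here is a standalone sketch that reuses the same formulas (character-entropy density, the 0.7/0.3 importance weighting, and the linear 0.2-0.9 ratio mapping). The sample strings and similarity values are made up for illustration.

```python
import math

def information_density(text: str) -> float:
    # Character-level entropy, mapped to 0-1 with the same (entropy - 2.0) / 4.0 normalization.
    if not text:
        return 0.0
    freq: dict[str, int] = {}
    for ch in text.lower():
        freq[ch] = freq.get(ch, 0) + 1
    total = len(text)
    entropy = -sum((c / total) * math.log2(c / total) for c in freq.values())
    return min(1.0, max(0.0, (entropy - 2.0) / 4.0))

def importance(similarity: float, density: float) -> float:
    return 0.7 * similarity + 0.3 * density            # same weights as _compute_importance

def adaptive_ratio(imp: float, min_ratio: float = 0.2, max_ratio: float = 0.9) -> float:
    # Low importance → compress hard (keep ~20%); high importance → keep up to ~90%.
    return min(max_ratio, max(min_ratio, min_ratio + imp * (max_ratio - min_ratio)))

samples = [
    (0.42, "the the the quick start guide guide guide"),                    # repetitive filler
    (0.81, "POST /v1/keys rate_limit=600/min burst=50 retry=exponential"),  # dense, technical
]
for sim, text in samples:
    d = information_density(text)
    imp = importance(sim, d)
    print(f"sim={sim:.2f} density={d:.2f} importance={imp:.2f} keep_ratio={adaptive_ratio(imp):.2f}")
```

A chunk that is both more similar to the query and more information-dense ends up with a higher keep ratio, which is what `_compress_adaptive` passes to LLMLingua as `rate`.
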
{tokenshrink-0.1.0.dist-info → tokenshrink-0.2.0.dist-info}/METADATA
CHANGED

@@ -1,7 +1,7 @@
 Metadata-Version: 2.4
 Name: tokenshrink
-Version: 0.1.0
-Summary: Cut your AI costs 50-80%. FAISS retrieval + LLMLingua compression.
+Version: 0.2.0
+Summary: Cut your AI costs 50-80%. FAISS retrieval + LLMLingua compression + REFRAG-inspired adaptive optimization.
 Project-URL: Homepage, https://tokenshrink.dev
 Project-URL: Repository, https://github.com/MusashiMiyamoto1-cloud/tokenshrink
 Project-URL: Documentation, https://tokenshrink.dev/docs
@@ -194,6 +194,54 @@ template = PromptTemplate(
 2. **Search**: Finds relevant chunks via semantic similarity
 3. **Compress**: Removes redundancy while preserving meaning
 
+## REFRAG-Inspired Features (v0.2)
+
+Inspired by [REFRAG](https://arxiv.org/abs/2509.01092) (Meta, 2025) — which showed RAG contexts have sparse, block-diagonal attention patterns — TokenShrink v0.2 applies similar insights **upstream**, before tokens even reach the model:
+
+### Adaptive Compression
+
+Not all chunks are equal. v0.2 scores each chunk by **importance** (semantic similarity × information density) and compresses accordingly:
+
+- High-importance chunks (relevant + information-dense) → kept nearly intact
+- Low-importance chunks → compressed aggressively
+- Net effect: better quality context within the same token budget
+
+```python
+result = ts.query("What are the rate limits?")
+for cs in result.chunk_scores:
+    print(f"{cs.source}: importance={cs.importance:.2f}, ratio={cs.compression_ratio:.2f}")
+```
+
+### Cross-Passage Deduplication
+
+Retrieved chunks often overlap (especially from similar documents). v0.2 detects near-duplicate passages via embedding similarity and removes redundant ones before compression:
+
+```python
+ts = TokenShrink(dedup_threshold=0.85)  # Default: 0.85
+result = ts.query("How to authenticate?")
+print(f"Removed {result.dedup_removed} redundant chunks")
+```
+
+### Chunk Importance Scoring
+
+Every chunk gets a composite score combining:
+- **Similarity** (0.7 weight) — How relevant is this to the query?
+- **Information density** (0.3 weight) — How much unique information does it contain?
+
+```bash
+# See scores in CLI
+tokenshrink query "deployment steps" --scores
+```
+
+### Stacking with REFRAG
+
+TokenShrink handles **upstream** optimization (retrieval + compression). REFRAG handles **downstream** decode-time optimization. Stack them:
+
+```
+Your files → TokenShrink (retrieve + dedupe + adaptive compress) → LLM → REFRAG (decode-time)
+             ↓ 50-80% fewer tokens                                      ↓ 30x faster TTFT
+```
+
 ## Configuration
 
 ```python
@@ -204,6 +252,9 @@ ts = TokenShrink(
     chunk_overlap=50,       # Overlap between chunks
     device="auto",          # auto, mps, cuda, cpu
     compression=True,       # Enable LLMLingua
+    adaptive=True,          # REFRAG-inspired adaptive compression (v0.2)
+    dedup=True,             # Cross-passage deduplication (v0.2)
+    dedup_threshold=0.85,   # Similarity threshold for dedup (v0.2)
 )
 ```
 
@@ -250,6 +301,29 @@ pip uninstall tokenshrink
 rm -rf .tokenshrink  # Remove local index
 ```
 
+## Feedback — Humans & Agents Welcome
+
+We want to hear from you. Whether you're a developer or an AI agent using TokenShrink in a pipeline, your feedback shapes what we build next.
+
+**Quick feedback (30 seconds):**
+- 👍 Working great? [Tell us what you're using it for](https://github.com/MusashiMiyamoto1-cloud/tokenshrink/discussions/categories/show-and-tell)
+- 💡 Feature idea? [Share it](https://github.com/MusashiMiyamoto1-cloud/tokenshrink/discussions/categories/ideas)
+- 🐛 Bug? [Open an issue](https://github.com/MusashiMiyamoto1-cloud/tokenshrink/issues/new?template=bug.md)
+- ❓ Question? [Ask here](https://github.com/MusashiMiyamoto1-cloud/tokenshrink/discussions/categories/q-a)
+
+**For agents:**
+```bash
+# Report feedback programmatically
+tokenshrink feedback "compression ratio lower than expected on code files"
+
+# Or post directly via GitHub API
+curl -X POST https://api.github.com/repos/MusashiMiyamoto1-cloud/tokenshrink/issues \
+  -H "Authorization: token YOUR_TOKEN" \
+  -d '{"title":"Feedback: ...","labels":["feedback"]}'
+```
+
+Every piece of feedback gets read. We're building this in the open.
+
 ---
 
 Built by [Musashi](https://github.com/MusashiMiyamoto1-cloud) · Part of [Agent Guard](https://agentguard.co)
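
To make the `dedup_threshold` knob concrete, here is a small self-contained sketch of the greedy cosine-similarity filter the README describes (toy 2-D vectors instead of real sentence embeddings; the real pipeline does this inside `_deduplicate_chunks`):

```python
import numpy as np

def greedy_dedup(embeddings: np.ndarray, scores: list[float], threshold: float = 0.85) -> list[int]:
    # Walk chunks in descending retrieval-score order; keep one unless it is
    # too similar (cosine > threshold) to a chunk that was already kept.
    sim = embeddings @ embeddings.T                  # rows are L2-normalized → cosine similarity
    kept: list[int] = []
    for idx in sorted(range(len(scores)), key=lambda i: scores[i], reverse=True):
        if all(sim[idx, k] <= threshold for k in kept):
            kept.append(idx)
    return kept

# Three toy "embeddings": the first two are near-duplicates of each other.
vecs = np.array([[1.0, 0.0], [0.996, 0.087], [0.0, 1.0]])
vecs /= np.linalg.norm(vecs, axis=1, keepdims=True)

print(greedy_dedup(vecs, scores=[0.9, 0.8, 0.7]))    # [0, 2]: the redundant second chunk is dropped
```
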
tokenshrink-0.2.0.dist-info/RECORD
ADDED

@@ -0,0 +1,8 @@
+tokenshrink/__init__.py,sha256=ybxGRkBPQTLIBckBYSZxebHKlAilMOoWVJxOyhv1Hgw,883
+tokenshrink/cli.py,sha256=dmP1BPbMow_NBm8fFXo05vJlU4vgyhDuzxL5q1a6n20,8102
+tokenshrink/pipeline.py,sha256=H3T3UlvHOIc1VOVyNFL-HEP0Cf_v7fVlAY-BFVT4V4w,24055
+tokenshrink-0.2.0.dist-info/METADATA,sha256=quql1c1tRTp7lF4t1YOVEFGx1dT8tf3bXD04KvK0TW4,10615
+tokenshrink-0.2.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+tokenshrink-0.2.0.dist-info/entry_points.txt,sha256=vwr3PMC25J8f-ppDVngO3MmXuY_cdR2rNM_syUmT7lc,53
+tokenshrink-0.2.0.dist-info/licenses/LICENSE,sha256=LsUNAvKJnhwbhmOWCjLq-Zf0HllrifthQ9TZkv1UUig,1064
+tokenshrink-0.2.0.dist-info/RECORD,,
tokenshrink-0.1.0.dist-info/RECORD
DELETED

@@ -1,8 +0,0 @@
-tokenshrink/__init__.py,sha256=kobJJ4XI3bcxoWBH_HkJ4gK86bF9FcBAWDuKlVyKPYQ,637
-tokenshrink/cli.py,sha256=kuseTPxq1jxHcnQ7nOiqCPnI8JqQWIcynpkboQ_YFig,5879
-tokenshrink/pipeline.py,sha256=OYEa3MjYrSlwtymmbhwnDG2JCdonZnlcfhDH7Fev2YI,13149
-tokenshrink-0.1.0.dist-info/METADATA,sha256=Ee2QCeU11A0QcjVVkEaegUmQCkgyk8sbSzOXh7jveI8,7331
-tokenshrink-0.1.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-tokenshrink-0.1.0.dist-info/entry_points.txt,sha256=vwr3PMC25J8f-ppDVngO3MmXuY_cdR2rNM_syUmT7lc,53
-tokenshrink-0.1.0.dist-info/licenses/LICENSE,sha256=LsUNAvKJnhwbhmOWCjLq-Zf0HllrifthQ9TZkv1UUig,1064
-tokenshrink-0.1.0.dist-info/RECORD,,
{tokenshrink-0.1.0.dist-info → tokenshrink-0.2.0.dist-info}/WHEEL
File without changes

{tokenshrink-0.1.0.dist-info → tokenshrink-0.2.0.dist-info}/entry_points.txt
File without changes

{tokenshrink-0.1.0.dist-info → tokenshrink-0.2.0.dist-info}/licenses/LICENSE
File without changes