tokenmizer 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. tokenmizer/__init__.py +21 -0
  2. tokenmizer/agents/__init__.py +0 -0
  3. tokenmizer/analytics/__init__.py +0 -0
  4. tokenmizer/analytics/engine.py +188 -0
  5. tokenmizer/api/__init__.py +0 -0
  6. tokenmizer/api/app.py +958 -0
  7. tokenmizer/api/rate_limiter.py +110 -0
  8. tokenmizer/checkpoints/__init__.py +0 -0
  9. tokenmizer/checkpoints/manager.py +383 -0
  10. tokenmizer/cli.py +153 -0
  11. tokenmizer/compression/__init__.py +0 -0
  12. tokenmizer/compression/engine.py +669 -0
  13. tokenmizer/compression/output_trimmer.py +95 -0
  14. tokenmizer/compression/window.py +104 -0
  15. tokenmizer/config/__init__.py +0 -0
  16. tokenmizer/config/settings.py +170 -0
  17. tokenmizer/core/__init__.py +0 -0
  18. tokenmizer/core/dto.py +196 -0
  19. tokenmizer/core/errors.py +35 -0
  20. tokenmizer/core/tokenizer.py +96 -0
  21. tokenmizer/dashboard/__init__.py +0 -0
  22. tokenmizer/dashboard/page.py +267 -0
  23. tokenmizer/filters/__init__.py +0 -0
  24. tokenmizer/filters/file_intelligence.py +960 -0
  25. tokenmizer/graph_memory/__init__.py +0 -0
  26. tokenmizer/graph_memory/decision_tracker.py +225 -0
  27. tokenmizer/graph_memory/graph.py +1287 -0
  28. tokenmizer/graph_memory/helpers.py +121 -0
  29. tokenmizer/graph_memory/hybrid_extractor.py +703 -0
  30. tokenmizer/graph_memory/types.py +134 -0
  31. tokenmizer/graph_memory/validator.py +304 -0
  32. tokenmizer/graph_memory/visualization.py +228 -0
  33. tokenmizer/mcp/__init__.py +0 -0
  34. tokenmizer/mcp/server.py +368 -0
  35. tokenmizer/providers/__init__.py +0 -0
  36. tokenmizer/providers/providers.py +456 -0
  37. tokenmizer/security/__init__.py +0 -0
  38. tokenmizer/security/auth.py +95 -0
  39. tokenmizer/security/middleware.py +138 -0
  40. tokenmizer/security/redaction.py +126 -0
  41. tokenmizer/semantic_cache/__init__.py +0 -0
  42. tokenmizer/semantic_cache/cache.py +383 -0
  43. tokenmizer/state/__init__.py +0 -0
  44. tokenmizer/state/backend.py +137 -0
  45. tokenmizer/storage/__init__.py +56 -0
  46. tokenmizer-0.2.4.dist-info/METADATA +529 -0
  47. tokenmizer-0.2.4.dist-info/RECORD +50 -0
  48. tokenmizer-0.2.4.dist-info/WHEEL +4 -0
  49. tokenmizer-0.2.4.dist-info/entry_points.txt +2 -0
  50. tokenmizer-0.2.4.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,669 @@
1
+ """
2
+ Layer 1: Advanced Prompt Compression
3
+ =====================================
4
+ Strategies (applied in pipeline order):
5
+ 1. Filler phrase removal — regex-based, zero deps
6
+ 2. Duplicate line suppression — remove exact repeat lines
7
+ 3. Whitespace normalization — collapse blank lines/spaces
8
+ 4. Comment stripping — strip code comments from heavy files
9
+ 5. Repetitive history pruning — deduplicate assistant boilerplate
10
+ 6. Smart truncation — truncate low-value file blocks
11
+ 7. LLMLingua-2 — ML-based token-level compression
12
+ 8. LongLLMLingua — for >4k token documents
13
+
14
+ File-type filters (new):
15
+ - PDF/docx text extraction — don't send raw binary markers
16
+ - Large JSON flattening — remove nested nulls/empty arrays
17
+ - CSV summarization — send schema + sample, not full file
18
+ - Code deduplication — remove duplicate function bodies
19
+ - Log trimming — keep first+last N lines of logs
20
+
21
+ CORRECTNESS FIX — code blocks are now excluded from LLMLingua entirely
22
+ (see CodeBlockGuard below). LLMLingua-2 is a lossy, ML-based token
23
+ compressor — `force_tokens` only hints at preservation, it does not
24
+ guarantee it. Applied to code, this risks dropping or reordering tokens
25
+ that change program semantics: a removed `not`, a dropped `except`
26
+ clause, mangled indentation in Python (where whitespace IS syntax), a
27
+ truncated regex. A tool whose target use case is "coding sessions with
28
+ an LLM" must not silently corrupt the code it's supposed to be helping
29
+ with. Code fences (```...```) and inline code spans (`...`) are now segmented
30
+ out before LLMLingua runs and passed through untouched (only the
31
+ lossless heuristics — whitespace/dedup/optional comment-stripping — ever
32
+ touch code); only prose segments are sent to the ML compressor.
33
+ """
34
+ from __future__ import annotations
35
+
36
+ import json
37
+ import logging
38
+ import re
39
+ from dataclasses import dataclass, field
40
+ from typing import Dict, List, Optional, Tuple
41
+
42
+ from tokenmizer.core.tokenizer import count_tokens
43
+
44
+ logger = logging.getLogger(__name__)
45
+
46
+ # ─── Code-block protection ──────────────────────────────────────────────────
47
+
48
# Matches fenced code blocks: ```lang\n...\n``` or ```\n...\n```.
# An opening fence that is never closed (a truncated or cut-off message) runs
# to the end of the text, mirroring how Markdown treats an unterminated fence,
# so the code after it is protected instead of being treated as prose.
_FENCED_CODE_RE = re.compile(r'```[^\n]*\n.*?(?:```|\Z)', re.DOTALL)
# Matches inline code spans: `like this`
_INLINE_CODE_RE = re.compile(r'`[^`\n]+`')


class CodeBlockGuard:
    """
    Segments text into (is_code, segment) pairs so callers can route code
    around lossy ML compression while still compressing surrounding prose.

    Only handles fenced (```) and inline (`) code markup — the common case
    for chat-style content. A fence that is opened but never closed is
    treated as code through the end of the text. Code pasted without any
    fence markup (a raw paste with no backticks) cannot be reliably
    distinguished from prose by this guard and is returned as prose; that
    is a real, documented gap rather than a solved problem.
    """

    @staticmethod
    def segment(text: str) -> List[Tuple[bool, str]]:
        """Split ``text`` into ordered ``(is_code, segment_text)`` pairs.

        The pairs cover the entire input losslessly: concatenating every
        ``segment_text`` in order reproduces ``text`` exactly. Returns an
        empty list for empty input.
        """
        segments: List[Tuple[bool, str]] = []
        pos = 0
        # Fenced blocks first: they take priority over inline spans found
        # inside them, so backticks within a fence are never split out.
        for m in _FENCED_CODE_RE.finditer(text):
            if m.start() > pos:
                segments.extend(CodeBlockGuard._segment_inline(text[pos:m.start()]))
            segments.append((True, m.group(0)))
            pos = m.end()
        if pos < len(text):
            segments.extend(CodeBlockGuard._segment_inline(text[pos:]))
        return segments

    @staticmethod
    def _segment_inline(text: str) -> List[Tuple[bool, str]]:
        """Within non-fenced text, also protect inline `code spans`.

        Returns the same kind of lossless ``(is_code, segment_text)`` pairs
        as ``segment``.
        """
        segments: List[Tuple[bool, str]] = []
        pos = 0
        for m in _INLINE_CODE_RE.finditer(text):
            if m.start() > pos:
                segments.append((False, text[pos:m.start()]))
            segments.append((True, m.group(0)))
            pos = m.end()
        if pos < len(text):
            segments.append((False, text[pos:]))
        return segments

    @staticmethod
    def reassemble(segments: List[Tuple[bool, str]]) -> str:
        """Concatenate the text of ``segments`` in order, ignoring the flags."""
        return "".join(seg for _, seg in segments)
101
+
102
+ # ─── Filler patterns ────────────────────────────────────────────────────────
103
+
104
# Every pattern is anchored on a word boundary so it can only match whole
# words. Without the anchors, IGNORECASE matching deletes the tail of ordinary
# words ("ensure" -> "en", "within this case" -> "with", "as an aid" -> "d").
_FILLER = [
    r"\bAs an AI\b(?:\s+language model)?,?\s*",
    r"\bI(?:'d| would) be (?:happy|glad|pleased) to\s+(?:help\s+)?",
    r"\b(?:That'?s?\s+a?\s*)?(?:great|excellent|good|wonderful|fantastic)\s+question[.!]\s*",
    # Acknowledgement interjections are only filler when they open a line or a
    # sentence and stand alone before punctuation ("Sure! ...", "Certainly, ...").
    # Mid-sentence uses such as "make sure" or "I'm not sure." carry meaning and
    # are left alone.
    r"(?:^|(?<=[.!?]\s))(?:Certainly|Of course|Sure|Absolutely|Indeed)[!.,]\s*",
    r"\bIt(?:'s| is) (?:worth noting|important to note|crucial to understand) that\s+",
    r"\bIn this (?:case|context|scenario)\b,?\s*",
    r"\b(?:Essentially|Basically|Simply put|In other words)\b,?\s*",
    r"\bAs you can see(?:,| from)?\s*",
    r"\bAs (?:mentioned|noted|discussed) (?:earlier|above|previously|before)\b,?\s*",
    r"\bLet me (?:explain|clarify|elaborate|break this down)\b(?:\s+for you)?\s*",
    r"\bI hope this (?:helps|answers your question|clarifies things)[.!]\s*",
    r"\bFeel free to (?:ask|reach out)[^.]*[.!]\s*",
    r"\bPlease (?:let me know|don't hesitate)[^.]*[.!]\s*",
    r"\b(?:Thank you for|Thanks for) (?:asking|your question|reaching out)[.!]\s*",
]
# MULTILINE lets the `^` in the interjection pattern match at every line start.
_FILLER_RE = [re.compile(p, re.IGNORECASE | re.MULTILINE) for p in _FILLER]
121
+
122
+ # ─── Data classes ────────────────────────────────────────────────────────────
123
+
124
@dataclass
class CompressionResult:
    """Outcome of one compression run: token counts, both texts, and metadata."""

    original_tokens: int
    compressed_tokens: int
    original_text: str
    compressed_text: str
    strategies_applied: List[str] = field(default_factory=list)
    quality_score: float = 1.0  # 0-1, estimated

    @property
    def ratio(self) -> float:
        """Compressed-to-original token ratio; 1.0 when there were no original tokens."""
        return self.compressed_tokens / self.original_tokens if self.original_tokens else 1.0

    @property
    def savings_pct(self) -> float:
        """Percentage of tokens removed, derived from ``ratio``."""
        return (1 - self.ratio) * 100

    def __repr__(self) -> str:
        details = ", ".join(
            (
                f"orig={self.original_tokens}",
                f"compressed={self.compressed_tokens}",
                f"ratio={self.ratio:.2f}",
                f"saved={self.savings_pct:.1f}%",
                f"strategies={self.strategies_applied}",
            )
        )
        return f"CompressionResult({details})"
152
+
153
+
154
+ # ─── Heuristic strategies ────────────────────────────────────────────────────
155
+
156
class FillerRemover:
    """Remove AI filler phrases. Zero dependencies. ~10-20% reduction on verbose responses."""

    def apply(self, text: str) -> Tuple[str, str]:
        """Strip every `_FILLER_RE` pattern from ``text`` and tidy the gaps left behind.

        Returns ``(cleaned_text, "filler_removal")``. Runs of blank lines are
        collapsed to one blank line and runs of spaces inside a line are
        collapsed to a single space.
        """
        for pat in _FILLER_RE:
            text = pat.sub("", text)
        text = re.sub(r'\n{3,}', '\n\n', text)
        # Only collapse space runs that follow a non-space character. Leading
        # indentation is left untouched: this step also sees fenced code, and
        # flattening indentation there would change the meaning of
        # whitespace-significant languages such as Python.
        text = re.sub(r'(?<=\S) {2,}', ' ', text)
        return text.strip(), "filler_removal"
165
+
166
+
167
class DuplicateLineRemover:
    """Remove exact duplicate lines (common in repeated context). ~5-15% on long chats."""

    def apply(self, text: str) -> Tuple[str, str]:
        """Drop repeated non-trivial lines from ``text``.

        A line is dropped only when its stripped form is longer than 40
        characters and has already been seen; shorter or blank repeats are
        kept. Returns ``(deduplicated_text, "duplicate_removal")`` with the
        surviving lines joined by ``"\\n"``.
        """
        seen_keys: set = set()
        kept = []
        for raw in text.splitlines():
            key = raw.strip()
            is_repeat = bool(key) and len(key) > 40 and key in seen_keys
            if not is_repeat:
                seen_keys.add(key)
                kept.append(raw)
        return "\n".join(kept), "duplicate_removal"
180
+
181
+
182
class WhitespaceNormalizer:
    """Collapse excessive whitespace. ~2-5% reduction."""

    def apply(self, text: str) -> Tuple[str, str]:
        """Normalize tabs, long space runs and blank-line runs in ``text``.

        Returns ``(normalized_text, "whitespace_normalization")``. Each tab
        becomes a space, runs of four or more spaces inside a line collapse
        to one space, and three or more consecutive newlines collapse to two.
        """
        text = re.sub(r'\t', ' ', text)
        # The lookbehind restricts the collapse to runs that follow a
        # non-space character. Leading indentation is kept as-is: this step
        # also runs over fenced code, and flattening 4- and 8-space indents
        # to the same width would destroy block structure in Python.
        text = re.sub(r'(?<=\S) {4,}', ' ', text)
        text = re.sub(r'\n{3,}', '\n\n', text)
        return text.strip(), "whitespace_normalization"
190
+
191
+
192
class CommentStripper:
    """Strip comments from code blocks. ~10-30% on comment-heavy code.

    Comment markers are only honoured outside quoted strings. Strings are
    tracked with a small left-to-right scanner that understands single and
    double quotes and backslash escapes, so markers inside string literals
    (URLs, glob patterns, format strings) are left alone.

    This is a heuristic, not a real tokenizer: it does not know about
    template literals, regex literals, or triple-quoted / multi-line
    strings, and an unmatched quote (for example an apostrophe in prose)
    is treated as an open string until the end of that line, which errs on
    the side of keeping text.
    """

    # Retained for backward compatibility; block comments are now removed by
    # the string-aware `_strip_block_comments` instead of this regex.
    _BLOCK_COMMENT = re.compile(r'/\*.*?\*/', re.DOTALL)
    _DOCSTRING = re.compile(r'""".*?"""', re.DOTALL)

    @staticmethod
    def _comment_start(line: str, markers: Tuple[str, ...]) -> Optional[int]:
        """Return the index of the first comment marker outside a string, or None.

        Scans ``line`` once, tracking which quote character (if any) is open
        and skipping backslash-escaped characters inside strings.
        """
        quote: Optional[str] = None
        i = 0
        n = len(line)
        while i < n:
            ch = line[i]
            if quote is not None:
                if ch == '\\':
                    i += 2  # skip the escaped character, e.g. \" or \\
                    continue
                if ch == quote:
                    quote = None
            elif ch == '"' or ch == "'":
                quote = ch
            else:
                skipped = False
                for marker in markers:
                    if not line.startswith(marker, i):
                        continue
                    if marker == '//' and i > 0 and line[i - 1] == ':':
                        # "://" is a URL scheme separator, not a comment.
                        i += len(marker)
                        skipped = True
                        break
                    return i
                if skipped:
                    continue
            i += 1
        return None

    @staticmethod
    def _strip_line_comments(text: str, markers: Tuple[str, ...]) -> str:
        """
        String-aware line-comment stripper, shared by both Python (#) and
        JS-style (//) comments.

        For every line of ``text``, everything from the first marker found
        outside a quoted string is removed and trailing whitespace before it
        is trimmed. Leading and trailing comments are both handled. Lines
        without such a marker are returned unchanged.

        Compared with counting quote characters before the marker, the
        scanner handles escaped quotes (``"\\" # not a comment"``) and a
        quote of one kind inside a string of the other (``"it's" # c``).
        A ``//`` directly preceded by ``:`` is treated as part of a URL.
        """
        out_lines = []
        for line in text.split('\n'):
            cut = CommentStripper._comment_start(line, markers)
            out_lines.append(line if cut is None else line[:cut].rstrip())
        return '\n'.join(out_lines)

    @staticmethod
    def _strip_block_comments(text: str) -> str:
        """Remove ``/* ... */`` comments that start outside a quoted string.

        String state is reset at each newline (strings are assumed not to
        span lines). An opening ``/*`` with no closing ``*/`` is left in
        place. This keeps string contents such as ``"src/**/*.py"`` intact,
        where a plain regex would delete the ``/**/`` in the middle.
        """
        out = []
        quote: Optional[str] = None
        i = 0
        n = len(text)
        while i < n:
            ch = text[i]
            if quote is not None:
                if ch == '\\' and i + 1 < n:
                    out.append(text[i:i + 2])
                    i += 2
                    continue
                if ch == quote or ch == '\n':
                    quote = None
            elif ch == '"' or ch == "'":
                quote = ch
            elif text.startswith('/*', i):
                end = text.find('*/', i + 2)
                if end == -1:
                    # Unterminated comment: keep the remainder untouched.
                    out.append(text[i:])
                    break
                i = end + 2
                continue
            out.append(ch)
            i += 1
        return ''.join(out)

    def apply(self, text: str, strip_docstrings: bool = False) -> Tuple[str, str]:
        """Strip ``#`` and ``//`` line comments and ``/* */`` block comments.

        When ``strip_docstrings`` is true, ``\"\"\"...\"\"\"`` blocks are removed
        as well. Runs of three or more newlines left behind are collapsed to
        two. Returns ``(stripped_text, "comment_stripping")``.
        """
        result = self._strip_line_comments(text, ('#', '//'))
        result = self._strip_block_comments(result)
        if strip_docstrings:
            result = self._DOCSTRING.sub('', result)
        result = re.sub(r'\n{3,}', '\n\n', result)
        return result.strip(), "comment_stripping"
265
+
266
+
267
class RepetitiveHistoryPruner:
    """
    Detect and collapse repetitive assistant message patterns.
    e.g. 3+ messages all starting with "Here is the code:" get deduplicated.
    ~10-20% on long coding sessions.
    """

    def apply(self, messages: List[Dict]) -> Tuple[List[Dict], str]:
        """Shorten assistant messages whose opening text keeps repeating.

        Conversations with fewer than 6 messages are returned untouched with
        the label ``"history_pruning_skipped"``. Otherwise, the third and
        later assistant messages sharing the same first-60-character prefix
        (case-insensitive, stripped) are cut to their first and last 100
        characters, provided they are longer than 200 characters.

        Messages whose ``content`` is not a string (``None`` on tool-call
        messages, or a list of content parts) are passed through unchanged
        instead of raising. Input dicts are never mutated; shortened
        messages are new dicts.

        Returns ``(messages, "history_pruning")``.
        """
        if len(messages) < 6:
            return messages, "history_pruning_skipped"

        result = []
        prefix_count: Dict[str, int] = {}

        for msg in messages:
            content = msg.get("content", "")
            # Only plain-text assistant messages can be prefix-matched and sliced.
            if msg.get("role") == "assistant" and isinstance(content, str):
                # First 60 chars act as the "prefix signature".
                prefix = content[:60].strip().lower()
                prefix_count[prefix] = prefix_count.get(prefix, 0) + 1
                # Third or later occurrence of this prefix: keep head + tail only.
                if prefix_count[prefix] > 2 and len(content) > 200:
                    compressed = content[:100] + "\n...[compressed]...\n" + content[-100:]
                    result.append({**msg, "content": compressed})
                    continue
            result.append(msg)

        return result, "history_pruning"
296
+
297
+
298
+ # ─── File-type filters (NEW) ──────────────────────────────────────────────────
299
+
300
class FileContentFilter:
    """
    Smart filters for heavy file types.
    Prevents sending raw binary artifacts, huge CSVs, full logs, etc.
    """

    MAX_CSV_ROWS = 10
    MAX_LOG_LINES = 50
    MAX_JSON_DEPTH = 3

    def filter_csv(self, content: str) -> str:
        """Send schema + first N rows instead of full CSV.

        Blank lines are ignored when counting. Content with at most
        ``MAX_CSV_ROWS`` data rows is returned unchanged; otherwise the
        result is a summary line, the header, the first ``MAX_CSV_ROWS``
        rows and an "omitted" marker.
        """
        lines = [line for line in content.splitlines() if line.strip()]
        if len(lines) <= self.MAX_CSV_ROWS + 1:
            return content
        header = lines[0]
        sample = lines[1:self.MAX_CSV_ROWS + 1]
        total_rows = len(lines) - 1
        return (
            f"[CSV — {total_rows} rows, showing first {self.MAX_CSV_ROWS}]\n"
            + header + "\n"
            + "\n".join(sample)
            + f"\n...[{total_rows - self.MAX_CSV_ROWS} rows omitted]"
        )

    def filter_json(self, content: str) -> str:
        """Flatten deep JSON, remove nulls/empty arrays.

        Returns the cleaned document prefixed with a size note when it is
        shorter than ``content``; otherwise, or when ``content`` cannot be
        processed as JSON, returns ``content`` unchanged.
        """
        try:
            data = json.loads(content)
            cleaned = self._clean_json(data, depth=0)
            # ensure_ascii=False keeps non-ASCII text as-is; the default would
            # expand each such character into a longer \uXXXX escape.
            result = json.dumps(cleaned, indent=2, ensure_ascii=False)
        except Exception:
            # Deliberately best-effort: anything that is not cleanly
            # processable as JSON is passed through untouched.
            return content
        if len(result) < len(content):
            return f"[JSON cleaned — {len(content)} → {len(result)} chars]\n{result}"
        return content

    def _clean_json(self, obj, depth: int):
        """Recursively drop empty values and cap nesting and list length.

        Scalars are always returned as-is. Dicts and lists nested deeper
        than ``MAX_JSON_DEPTH`` are replaced by a marker string. Dict
        entries whose value is ``None``, ``[]`` or ``{}`` are dropped, and
        lists longer than 20 items keep their first 5 plus a "more" marker.
        """
        if not isinstance(obj, (dict, list)):
            # Never swap a scalar for the (longer, lossy) depth marker.
            return obj
        if depth > self.MAX_JSON_DEPTH:
            return f"...[depth limit {self.MAX_JSON_DEPTH}]"
        if isinstance(obj, dict):
            return {
                k: self._clean_json(v, depth + 1)
                for k, v in obj.items()
                if v is not None and v != [] and v != {}
            }
        if len(obj) > 20:
            trimmed = [self._clean_json(x, depth + 1) for x in obj[:5]]
            return trimmed + [f"...[{len(obj)-5} more]"]
        return [self._clean_json(x, depth + 1) for x in obj]

    def filter_log(self, content: str) -> str:
        """Keep first + last N lines of logs (errors are usually at end).

        Content with at most ``MAX_LOG_LINES`` lines is returned unchanged.
        Otherwise ``MAX_LOG_LINES // 2`` lines are kept from each end, with
        a marker stating how many lines were actually dropped.
        """
        lines = content.splitlines()
        if len(lines) <= self.MAX_LOG_LINES:
            return content
        half = self.MAX_LOG_LINES // 2
        head = lines[:half]
        # lines[-0:] would be the whole list, so guard the half == 0 case.
        tail = lines[-half:] if half else []
        # Count what was really dropped, which differs from
        # len(lines) - MAX_LOG_LINES when MAX_LOG_LINES is odd.
        omitted = len(lines) - len(head) - len(tail)
        return (
            "\n".join(head)
            + f"\n\n...[{omitted} lines omitted]...\n\n"
            + "\n".join(tail)
        )

    def filter_by_extension(self, content: str, filename: str) -> Tuple[str, str]:
        """Auto-detect file type and apply appropriate filter.

        Dispatches on the lower-cased extension of ``filename``: ``csv``,
        ``json``, and ``log``/``txt`` (only when the content has more than
        100 lines). Returns ``(filtered_content, strategy_label)``; the label
        is ``"no_filter"`` when nothing applied.
        """
        ext = filename.lower().rsplit(".", 1)[-1] if "." in filename else ""
        if ext == "csv":
            return self.filter_csv(content), "csv_filter"
        if ext == "json":
            return self.filter_json(content), "json_filter"
        if ext in ("log", "txt") and len(content.splitlines()) > 100:
            return self.filter_log(content), "log_filter"
        return content, "no_filter"
378
+
379
+
380
+ # ─── LLMLingua wrapper ────────────────────────────────────────────────────────
381
+
382
+ class LLMLinguaEngine:
383
+ """
384
+ LLMLingua-2 / LongLLMLingua wrapper with graceful fallback.
385
+ Auto-selects LongLLMLingua for documents > 4k tokens.
386
+ """
387
+
388
+ LONG_THRESHOLD = 4000 # tokens — use LongLLMLingua above this
389
+
390
+ def __init__(self, ratio: float = 0.5, device: str = "cpu"):
391
+ self.ratio = ratio
392
+ self.device = device
393
+ self._short = None
394
+ self._long = None
395
+ self._available = False
396
+ self._load()
397
+
398
+ def _load(self) -> None:
399
+ try:
400
+ from llmlingua import PromptCompressor # type: ignore
401
+ self._short = PromptCompressor(
402
+ model_name="microsoft/llmlingua-2-bert-base-multilingual-cased-meetingbank",
403
+ use_llmlingua2=True,
404
+ device_map=self.device,
405
+ )
406
+ # LongLLMLingua for long docs
407
+ self._long = PromptCompressor(
408
+ model_name="microsoft/llmlingua-2-bert-base-multilingual-cased-meetingbank",
409
+ use_llmlingua2=True,
410
+ device_map=self.device,
411
+ )
412
+ self._available = True
413
+ logger.info("LLMLingua-2 loaded")
414
+ except ImportError:
415
+ logger.warning(
416
+ "llmlingua not installed — using heuristic compression only. "
417
+ "pip install tokenmizer[compression] for full ML compression."
418
+ )
419
+
420
+ @property
421
+ def available(self) -> bool:
422
+ return self._available
423
+
424
+ def compress(self, text: str, ratio: Optional[float] = None) -> CompressionResult:
425
+ """
426
+ Compress text via LLMLingua-2, EXCLUDING code segments.
427
+
428
+ FIXED: previously the entire text — including any fenced/inline
429
+ code — went straight into the ML compressor with only a soft
430
+ `force_tokens` hint asking it to try to preserve a few literal
431
+ tokens like "```" and "def ". That hint does not guarantee
432
+ preservation of everything inside a code block; LLMLingua is a
433
+ lossy compressor by design, and applying it to code risks
434
+ corrupting program semantics (dropped tokens, mangled
435
+ indentation in whitespace-significant languages, truncated
436
+ strings/regexes). Since this tool's primary use case is coding
437
+ sessions, that's not a hypothetical edge case.
438
+
439
+ Now: text is segmented into code vs. prose (CodeBlockGuard), only
440
+ prose segments are sent to LLMLingua, and code segments are
441
+ reattached completely unmodified — guaranteed identical to the
442
+ input for any text wrapped in ``` fences or single backticks.
443
+ """
444
+ target = ratio or self.ratio
445
+ orig_tokens = count_tokens(text)
446
+
447
+ if not self._available or orig_tokens < 100:
448
+ return CompressionResult(
449
+ original_tokens=orig_tokens,
450
+ compressed_tokens=orig_tokens,
451
+ original_text=text,
452
+ compressed_text=text,
453
+ strategies_applied=["llmlingua_skipped_short"],
454
+ quality_score=1.0,
455
+ )
456
+
457
+ segments = CodeBlockGuard.segment(text)
458
+ code_segment_count = sum(1 for is_code, _ in segments if is_code)
459
+
460
+ try:
461
+ out_parts: List[str] = []
462
+ any_compressed = False
463
+ for is_code, seg in segments:
464
+ if is_code or count_tokens(seg) < 20:
465
+ # Too short to bother, or protected code — pass through.
466
+ out_parts.append(seg)
467
+ continue
468
+ engine = self._long if count_tokens(seg) > self.LONG_THRESHOLD else self._short
469
+ result = engine.compress_prompt(
470
+ seg,
471
+ rate=target,
472
+ force_tokens=["\n", ".", "?", "!"],
473
+ )
474
+ out_parts.append(result["compressed_prompt"])
475
+ any_compressed = True
476
+
477
+ compressed = "".join(out_parts)
478
+ comp_tokens = count_tokens(compressed)
479
+ label = "llmlingua2_code_protected" if code_segment_count else "llmlingua2"
480
+ if not any_compressed:
481
+ label = "llmlingua_skipped_all_code"
482
+
483
+ return CompressionResult(
484
+ original_tokens=orig_tokens,
485
+ compressed_tokens=comp_tokens,
486
+ original_text=text,
487
+ compressed_text=compressed,
488
+ strategies_applied=[label],
489
+ quality_score=comp_tokens / max(orig_tokens, 1),
490
+ )
491
+ except Exception as e:
492
+ logger.warning(f"LLMLingua failed: {e} — falling back")
493
+ return CompressionResult(
494
+ original_tokens=orig_tokens,
495
+ compressed_tokens=orig_tokens,
496
+ original_text=text,
497
+ compressed_text=text,
498
+ strategies_applied=["llmlingua_failed"],
499
+ quality_score=1.0,
500
+ )
501
+
502
+
503
+ # ─── Master pipeline ─────────────────────────────────────────────────────────
504
+
505
+ class CompressionPipeline:
506
+ """
507
+ Orchestrates all compression strategies in the right order.
508
+ Heuristics run first (fast, no deps), ML last (slowest, best quality).
509
+ """
510
+
511
+ def __init__(
512
+ self,
513
+ ratio: float = 0.5,
514
+ strip_comments: bool = False,
515
+ enable_ml: bool = True,
516
+ device: str = "cpu",
517
+ ):
518
+ self.ratio = ratio
519
+ self.strip_comments = strip_comments
520
+ # compression_ratio = output_tokens / input_tokens (lower = more compressed)
521
+ # If ratio > threshold, ML compression had no effect — keep heuristic result
522
+ self._quality_threshold = 0.95
523
+ self.filler = FillerRemover()
524
+ self.dedup = DuplicateLineRemover()
525
+ self.whitespace = WhitespaceNormalizer()
526
+ self.comments = CommentStripper()
527
+ self.history_pruner = RepetitiveHistoryPruner()
528
+ self.file_filter = FileContentFilter()
529
+ self.lingua = LLMLinguaEngine(ratio=ratio) if enable_ml else None
530
+
531
+ def compress_text(
532
+ self,
533
+ text: str,
534
+ filename: Optional[str] = None,
535
+ min_tokens: int = 100,
536
+ ) -> CompressionResult:
537
+ """Compress a single text block through the full pipeline."""
538
+
539
+ original = text
540
+ orig_tokens = count_tokens(text)
541
+ strategies: List[str] = []
542
+
543
+ if orig_tokens < min_tokens:
544
+ return CompressionResult(
545
+ original_tokens=orig_tokens,
546
+ compressed_tokens=orig_tokens,
547
+ original_text=original,
548
+ compressed_text=text,
549
+ strategies_applied=["skipped_too_short"],
550
+ )
551
+
552
+ # File-type filter first
553
+ if filename:
554
+ text, strat = self.file_filter.filter_by_extension(text, filename)
555
+ if strat != "no_filter":
556
+ strategies.append(strat)
557
+
558
+ # Heuristics (order matters)
559
+ text, s = self.whitespace.apply(text)
560
+ strategies.append(s)
561
+
562
+ text, s = self.filler.apply(text)
563
+ strategies.append(s)
564
+
565
+ text, s = self.dedup.apply(text)
566
+ strategies.append(s)
567
+
568
+ if self.strip_comments:
569
+ text, s = self.comments.apply(text)
570
+ strategies.append(s)
571
+
572
+ # Save the heuristic-only result BEFORE running ML compression so we can
573
+ # actually revert to it if the quality gate below rejects the ML output.
574
+ #
575
+ # FIXED — this was a real bug, not cosmetic: the previous code assigned
576
+ # `text = result.compressed_text` immediately, THEN computed
577
+ # compression_ratio from that same already-overwritten `text`. That
578
+ # meant the "keep heuristic result" comment was describing something
579
+ # the code never actually did — by the time the ratio check ran, the
580
+ # heuristic-only text was already gone. The warning fired correctly;
581
+ # the revert it claimed to perform never happened.
582
+ heuristic_text = text
583
+ heuristic_tokens = count_tokens(text)
584
+
585
+ # ML compression
586
+ comp_tokens = heuristic_tokens
587
+ quality = 0.9
588
+ if self.lingua and self.lingua.available:
589
+ result = self.lingua.compress(heuristic_text, ratio=self.ratio)
590
+ ml_tokens = count_tokens(result.compressed_text)
591
+ compression_ratio = ml_tokens / max(orig_tokens, 1)
592
+ quality_threshold = getattr(self, "_quality_threshold", 0.95)
593
+
594
+ if compression_ratio > quality_threshold:
595
+ # ML barely compressed anything — genuinely revert to heuristic text.
596
+ logger.warning(
597
+ f"Compression ratio {compression_ratio:.2f} > threshold "
598
+ f"{quality_threshold} — ML compression had no effect, "
599
+ f"reverting to heuristic-only result"
600
+ )
601
+ strategies.append("llmlingua_reverted_quality_gate")
602
+ # text/comp_tokens already hold the heuristic-only values — no-op
603
+ else:
604
+ text = result.compressed_text
605
+ comp_tokens = ml_tokens
606
+ strategies.extend(result.strategies_applied)
607
+ quality = result.quality_score
608
+
609
+ return CompressionResult(
610
+ original_tokens=orig_tokens,
611
+ compressed_tokens=comp_tokens,
612
+ original_text=original,
613
+ compressed_text=text,
614
+ strategies_applied=strategies,
615
+ quality_score=quality,
616
+ )
617
+
618
+ def compress_messages(
619
+ self,
620
+ messages: List[Dict],
621
+ protect_recent: int = 3,
622
+ ) -> Tuple[List[Dict], int]:
623
+ """
624
+ Compress all messages except the most recent N.
625
+ Returns (compressed_messages, total_tokens_saved).
626
+ """
627
+ if len(messages) <= protect_recent:
628
+ return messages, 0
629
+
630
+ # First pass: prune repetitive history
631
+ messages, _ = self.history_pruner.apply(messages)
632
+
633
+ total_saved = 0
634
+ result = []
635
+
636
+ for i, msg in enumerate(messages):
637
+ # Don't touch recent messages or system messages
638
+ if i >= len(messages) - protect_recent:
639
+ result.append(msg)
640
+ continue
641
+ if msg.get("role") == "system":
642
+ result.append(msg)
643
+ continue
644
+
645
+ content = msg.get("content", "")
646
+ cr = self.compress_text(content, min_tokens=200)
647
+ total_saved += cr.original_tokens - cr.compressed_tokens
648
+ result.append({**msg, "content": cr.compressed_text})
649
+
650
+ return result, total_saved
651
+
652
+ def terse_system_prompt(self, level: str = "full") -> str:
653
+ """Return terse-output instruction to inject into system prompt."""
654
+ levels = {
655
+ "lite": (
656
+ "Be concise. No preamble (e.g., 'Sure!', 'Great question!'). "
657
+ "No closing remarks. Start answer immediately."
658
+ ),
659
+ "full": (
660
+ "Respond like a senior engineer: no filler, no preamble, no 'I'd be happy to', "
661
+ "no closing fluff. Use fragments when clear. Preserve code/paths/URLs exactly. "
662
+ "Technical accuracy 100%. Start with the answer."
663
+ ),
664
+ "ultra": (
665
+ "Ultra-terse. Fragments only. No articles if obvious. "
666
+ "No preamble or closing. Code/paths exact. Maximum compression."
667
+ ),
668
+ }
669
+ return levels.get(level, levels["full"])