python-token-killer 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,167 @@
1
+ """Dict/JSON minimizer — biggest token wins live here."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from typing import Any
7
+
8
+ from ptk._base import Minimizer, strip_nullish
9
+
10
+
11
class DictMinimizer(Minimizer):
    """Compress dicts via null-stripping, key shortening, and compact serialization.

    Strategies applied in order:
    1. Strip nullish values (None, "", [], {})
    2. Flatten single-child nesting (aggressive)
    3. Shorten keys to abbreviated forms (aggressive)
    4. Serialize with minimal separators

    The `format` kwarg controls output encoding:
    - "json" (default) — compact json.dumps
    - "kv" — key:value lines (great for flat dicts)
    - "tabular" — header-once tabular format (auto for list-of-dicts values)
    """

    def _minimize(self, obj: Any, *, aggressive: bool = False, **kw: Any) -> str:
        # Non-dict inputs are wrapped so every downstream helper sees a dict.
        data = obj if isinstance(obj, dict) else {"_": obj}
        data = strip_nullish(data)

        if aggressive:
            # flatten first so dotted keys exist, then shorten their segments,
            # then shorten the remaining plain keys
            data = _shorten_keys(_shorten_dotted_keys(_flatten_single_children(data)))

        renderers = {"kv": _to_kv, "tabular": _to_tabular}
        render = renderers.get(kw.get("format", "json"))
        if render is not None:
            return render(data)
        return json.dumps(data, separators=(",", ":"), default=str)
43
+
44
+
45
+ # ── internal helpers ────────────────────────────────────────────────────
46
+
47
+
48
+ def _flatten_single_children(d: dict[str, Any], _depth: int = 0) -> dict[str, Any]:
49
+ """Collapse {"a": {"b": val}} → {"a.b": val} up to 4 levels."""
50
+ if _depth > 4:
51
+ return d
52
+ out: dict[str, Any] = {}
53
+ for k, v in d.items():
54
+ if isinstance(v, dict) and len(v) == 1:
55
+ inner_k, inner_v = next(iter(v.items()))
56
+ flattened = _flatten_single_children({f"{k}.{inner_k}": inner_v}, _depth + 1)
57
+ out.update(flattened)
58
+ else:
59
+ out[k] = v
60
+ return out
61
+
62
+
63
+ # common verbose key → short form (extensible)
64
+ _KEY_MAP: dict[str, str] = {
65
+ "description": "desc",
66
+ "message": "msg",
67
+ "timestamp": "ts",
68
+ "created_at": "ts",
69
+ "updated_at": "upd",
70
+ "configuration": "cfg",
71
+ "config": "cfg",
72
+ "environment": "env",
73
+ "database": "db",
74
+ "information": "info",
75
+ "response": "resp",
76
+ "request": "req",
77
+ "function": "fn",
78
+ "parameters": "params",
79
+ "arguments": "args",
80
+ "exception": "exc",
81
+ "traceback": "tb",
82
+ "status_code": "code",
83
+ "content_type": "ctype",
84
+ "application": "app",
85
+ "transaction": "txn",
86
+ "identifier": "id",
87
+ "metadata": "meta",
88
+ "properties": "props",
89
+ "connection": "conn",
90
+ "password": "pw",
91
+ "username": "user",
92
+ "directory": "dir",
93
+ "reference": "ref",
94
+ "implementation": "impl",
95
+ "notifications": "notifs",
96
+ "repository": "repo",
97
+ }
98
+
99
+
100
+ def _shorten_keys(d: dict[str, Any]) -> dict[str, Any]:
101
+ """Recursively shorten known verbose keys.
102
+
103
+ If two keys map to the same short form (e.g. 'timestamp' and 'created_at'
104
+ both → 'ts'), the FIRST key wins and the second is kept unshortened to
105
+ prevent silent data loss.
106
+ """
107
+ out: dict[str, Any] = {}
108
+ used_shorts: set[str] = set()
109
+ for k, v in d.items():
110
+ short = _KEY_MAP.get(k, k)
111
+ # avoid collision: if short form already used, keep original key
112
+ if short in used_shorts and short != k:
113
+ short = k
114
+ used_shorts.add(short)
115
+ if isinstance(v, dict):
116
+ out[short] = _shorten_keys(v)
117
+ elif isinstance(v, list):
118
+ out[short] = [_shorten_keys(i) if isinstance(i, dict) else i for i in v]
119
+ else:
120
+ out[short] = v
121
+ return out
122
+
123
+
124
+ def _shorten_dotted_keys(d: dict[str, Any]) -> dict[str, Any]:
125
+ """Shorten individual segments of dotted keys (from flattening)."""
126
+ out: dict[str, Any] = {}
127
+ for k, v in d.items():
128
+ if isinstance(k, str) and "." in k:
129
+ parts = [_KEY_MAP.get(p, p) for p in k.split(".")]
130
+ out[".".join(parts)] = v
131
+ else:
132
+ out[k] = v
133
+ return out
134
+
135
+
136
+ def _to_kv(d: dict[str, Any], _prefix: str = "") -> str:
137
+ """Flat key:value format — one line per leaf."""
138
+ lines: list[str] = []
139
+ for k, v in d.items():
140
+ full = f"{_prefix}{k}"
141
+ if isinstance(v, dict):
142
+ lines.append(_to_kv(v, f"{full}."))
143
+ else:
144
+ lines.append(f"{full}:{v}")
145
+ return "\n".join(lines)
146
+
147
+
148
+ def _to_tabular(d: dict[str, Any]) -> str:
149
+ """Render any list-of-dicts values as header-once tabular rows.
150
+
151
+ Non-list values render as kv pairs above the table.
152
+ """
153
+ kv_lines: list[str] = []
154
+ table_lines: list[str] = []
155
+
156
+ for k, v in d.items():
157
+ if isinstance(v, list) and v and isinstance(v[0], dict):
158
+ # collect union of all keys (preserving order)
159
+ fields: list[str] = list(dict.fromkeys(f for row in v for f in row))
160
+ table_lines.append(f"{k}[{len(v)}]{{{','.join(fields)}}}:")
161
+ for row in v:
162
+ vals = (str(row.get(f, "")) for f in fields)
163
+ table_lines.append(f" {','.join(vals)}")
164
+ else:
165
+ kv_lines.append(f"{k}:{v}")
166
+
167
+ return "\n".join(kv_lines + table_lines)
@@ -0,0 +1,83 @@
1
+ """Diff minimizer — fold unchanged context, keep only meaningful changes."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ from ptk._base import Minimizer
8
+
9
+
10
class DiffMinimizer(Minimizer):
    """Compress git diffs by folding unchanged context lines.

    Strategies:
    1. Keep hunk headers (@@) and file headers (---, +++)
    2. Keep added (+) and removed (-) lines
    3. Collapse context (space-prefixed) lines to `... N lines ...`
    4. Aggressive: strip file mode changes, index lines, trailing whitespace diffs
    """

    # max unchanged context lines to keep around changes
    CONTEXT_LINES = 2

    def _minimize(self, obj: Any, *, aggressive: bool = False, **kw: Any) -> str:
        text = obj if isinstance(obj, str) else str(obj)
        # aggressive mode drops all context unless the caller overrides it
        fallback = 0 if aggressive else self.CONTEXT_LINES
        context = kw.get("context_lines", fallback)
        return _fold_diff(text, context=context, aggressive=aggressive)
27
+
28
+
29
def _fold_diff(text: str, *, context: int, aggressive: bool) -> str:
    """Fold runs of unchanged context lines, keeping `context` on each end."""
    folded: list[str] = []
    pending: list[str] = []  # current run of context lines awaiting a flush

    for raw in text.split("\n"):
        # noise (index/mode/rename/binary lines) is dropped outright
        if aggressive and _is_noise(raw):
            continue

        if not _is_significant(raw):
            pending.append(raw)
            continue

        # a meaningful line terminates the current context run
        if pending:
            _flush_context(folded, pending, keep=context)
            pending = []
        folded.append(raw)

    # trailing context at end of input
    if pending:
        _flush_context(folded, pending, keep=context)

    return "\n".join(folded)
53
+
54
+
55
+ def _is_significant(line: str) -> bool:
56
+ """Lines that carry meaningful diff information."""
57
+ return (
58
+ line.startswith(("+", "-", "@@", "diff ", "\\ ")) and not line.startswith(("+++", "---"))
59
+ ) or line.startswith(("+++", "---"))
60
+
61
+
62
+ def _is_noise(line: str) -> bool:
63
+ """Lines that are almost never useful to an LLM."""
64
+ return (
65
+ line.startswith(("index ", "old mode", "new mode", "similarity"))
66
+ or line.startswith("rename ")
67
+ or (line.startswith("Binary files") and "differ" in line)
68
+ )
69
+
70
+
71
+ def _flush_context(result: list[str], buf: list[str], *, keep: int) -> None:
72
+ """Collapse a block of context lines, keeping at most `keep` on each end."""
73
+ if len(buf) <= keep * 2 + 1:
74
+ result.extend(buf)
75
+ return
76
+
77
+ if keep > 0:
78
+ result.extend(buf[:keep])
79
+ folded = len(buf) - keep * 2
80
+ if folded > 0:
81
+ result.append(f" ... {folded} lines ...")
82
+ if keep > 0:
83
+ result.extend(buf[-keep:])
@@ -0,0 +1,87 @@
1
+ """List/array minimizer — dedup, sampling, schema-once tabular."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from collections import Counter
7
+ from typing import Any
8
+
9
+ from ptk._base import Minimizer, strip_nullish
10
+
11
+
12
class ListMinimizer(Minimizer):
    """Compress lists via dedup, sampling, and tabular encoding.

    Strategies:
    1. Strip nullish items
    2. Deduplicate exact-match items with counts
    3. For uniform list-of-dicts: schema-once tabular output
    4. For large arrays: statistical sampling (aggressive)
    """

    # max items before sampling kicks in (aggressive mode)
    SAMPLE_THRESHOLD = 50

    def _minimize(self, obj: Any, *, aggressive: bool = False, **kw: Any) -> str:
        if not isinstance(obj, (list, tuple)):
            # non-sequence input — just serialize compactly
            return json.dumps(obj, separators=(",", ":"), default=str)

        cleaned = [
            strip_nullish(item) if isinstance(item, dict) else item
            for item in obj
            if item is not None
        ]
        if not cleaned:
            return "[]"

        limit = self.SAMPLE_THRESHOLD
        oversized = aggressive and len(cleaned) > limit

        # ── uniform list-of-dicts → schema-once tabular ─────────────
        if all(isinstance(item, dict) for item in cleaned):
            return _tabular(_sample(cleaned, limit) if oversized else cleaned)

        # ── mixed/primitive list → dedup with counts ────────────────
        return _dedup_list(cleaned[:limit] if oversized else cleaned)
45
+
46
+
47
+ def _tabular(rows: list[dict[str, Any]]) -> str:
48
+ """Schema-once tabular: declare fields in header, one CSV-ish row per item."""
49
+ fields: list[str] = list(dict.fromkeys(f for row in rows for f in row))
50
+ header = f"[{len(rows)}]{{{','.join(fields)}}}:"
51
+ body = "\n".join(",".join(str(row.get(f, "")) for f in fields) for row in rows)
52
+ return f"{header}\n{body}"
53
+
54
+
55
+ def _dedup_list(items: list[Any]) -> str:
56
+ """Collapse duplicate primitives: [a, a, a, b] → a (x3)\nb"""
57
+ # json-serialize each item for stable hashing
58
+ serialized = [json.dumps(i, separators=(",", ":"), default=str) for i in items]
59
+ counts = Counter(serialized)
60
+
61
+ # preserve first-seen order
62
+ seen: set[str] = set()
63
+ lines: list[str] = []
64
+ for s in serialized:
65
+ if s in seen:
66
+ continue
67
+ seen.add(s)
68
+ c = counts[s]
69
+ # strip quotes from simple strings for readability
70
+ display = s.strip('"') if s.startswith('"') and s.endswith('"') else s
71
+ lines.append(f"{display} (x{c})" if c > 1 else display)
72
+
73
+ return "\n".join(lines)
74
+
75
+
76
+ def _sample(items: list[dict[str, Any]], n: int) -> list[dict[str, Any]]:
77
+ """Deterministic even-spaced sampling — keeps first and last."""
78
+ if n <= 0:
79
+ return []
80
+ if len(items) <= n:
81
+ return items
82
+ if n == 1:
83
+ return [items[0]]
84
+ step = (len(items) - 1) / (n - 1)
85
+ indices = {round(i * step) for i in range(n)}
86
+ sampled = [items[i] for i in sorted(indices)]
87
+ return sampled
ptk/minimizers/_log.py ADDED
@@ -0,0 +1,94 @@
1
+ """Log minimizer — dedup repeated lines, error-only filtering."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from typing import Any
7
+
8
+ from ptk._base import Minimizer, dedup_lines
9
+
10
# precompiled
# Severity keywords that mark a log line as important (case-insensitive,
# whole-word match).
_LOG_LEVEL = re.compile(
    r"\b(ERROR|WARN(?:ING)?|CRITICAL|FATAL|EXCEPTION|SEVERE|PANIC)\b",
    re.IGNORECASE,
)
# Leading ISO-ish timestamp at line start: date, time, optional fractional
# seconds, optional UTC offset or "Z". Stripped in aggressive mode.
_TIMESTAMP = re.compile(
    r"^\d{4}[-/]\d{2}[-/]\d{2}[T ]\d{2}:\d{2}:\d{2}[.,]?\d*\s*(?:[+-]\d{2}:?\d{2}|Z)?\s*",
    re.MULTILINE,
)
# stack trace markers — lines matching these are always preserved
# (Python traceback headers / frame lines / raise sites, exception class
# prefixes, and "at pkg.fn(" style frames)
_STACKTRACE_RE = re.compile(
    r'^\s*(Traceback \(most recent|File "|\s+raise |'
    r"\w+Error:|\w+Exception:|\w+Warning:|at \S+\.\S+\()",
    re.MULTILINE,
)
# test-runner pass markers — never kept as context in errors_only mode
# NOTE(review): matching is case-sensitive substring containment; " ok" also
# occurs inside TAP-style "not ok" failure lines, which would then be
# excluded as context — confirm that is intended.
_PASS_MARKERS: frozenset[str] = frozenset(
    {
        "PASSED",
        " PASS",
        "--- PASS",
        "... ok",
        " ok",
        "test result: ok",
        "✓",
    }
)
37
+
38
+
39
class LogMinimizer(Minimizer):
    """Compress log output via deduplication and error filtering.

    Strategies:
    1. Strip timestamps (aggressive) — timestamps are rarely useful to an LLM
    2. Collapse consecutive duplicate lines with counts
    3. Keep only error/warning lines (aggressive)
    """

    def _minimize(self, obj: Any, *, aggressive: bool = False, **kw: Any) -> str:
        text = obj if isinstance(obj, str) else str(obj)

        # blank/whitespace-only input — nothing to compress
        if not text.strip():
            return ""

        stripped = _TIMESTAMP.sub("", text) if aggressive else text
        deduped = dedup_lines(stripped)

        wants_errors_only = aggressive or bool(kw.get("errors_only", False))
        filtered = _errors_only(deduped) if wants_errors_only else deduped
        return filtered.strip()
64
+
65
+
66
def _errors_only(text: str) -> str:
    """Keep error/warning lines, stack traces, 'failed' keyword lines, + context.

    Pass-marker lines (PASSED, --- PASS, ... ok, etc.) are never kept even
    as context around errors — they are pure noise in an error report.
    """
    lines = text.split("\n")
    kept: set[int] = set()

    # substrings (checked lowercased) that flag a line as important:
    # pytest "failed", go test "--- FAIL:", standalone "FAIL", build/compiler
    # "error", rust "panicked", assertion failures
    keywords = ("failed", "fail:", " fail ", "error", "panicked", "assertion")

    def important(raw: str) -> bool:
        if _LOG_LEVEL.search(raw) or _STACKTRACE_RE.match(raw):
            return True
        lowered = raw.lower()
        return any(word in lowered for word in keywords)

    for idx, raw in enumerate(lines):
        if not important(raw):
            continue
        # keep the important line plus one line of context on each side,
        # excluding pure pass-marker lines from the window
        for neighbor in range(max(0, idx - 1), min(len(lines), idx + 2)):
            if not any(marker in lines[neighbor] for marker in _PASS_MARKERS):
                kept.add(neighbor)

    if not kept:
        return text  # no errors found — return deduped version
    return "\n".join(lines[i] for i in sorted(kept))
@@ -0,0 +1,182 @@
1
+ """Text minimizer — whitespace normalization, stopword removal, abbreviation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from typing import Any
7
+
8
+ from ptk._base import Minimizer
9
+
10
# ── precompiled ─────────────────────────────────────────────────────────

# Runs of spaces/tabs collapse to one space; 3+ newlines collapse to 2.
_MULTI_SPACE = re.compile(r"[ \t]+")
_MULTI_NEWLINE = re.compile(r"\n{3,}")

# high-frequency English stopwords that add tokens but rarely carry meaning
# in LLM context (intentionally conservative — we never want to destroy meaning)
_STOPWORDS: frozenset[str] = frozenset(
    {
        "the",
        "a",
        "an",
        "is",
        "are",
        "was",
        "were",
        "be",
        "been",
        "being",
        "have",
        "has",
        "had",
        "do",
        "does",
        "did",
        "will",
        "would",
        "shall",
        "should",
        "may",
        "might",
        "must",
        "can",
        "could",
        "that",
        "which",
        "who",
        "whom",
        "this",
        "these",
        "those",
        "am",
        "its",
        "very",
        "just",
        "also",
        "really",
        "quite",
        "rather",
    }
)

# common long phrases → shorter equivalents
# NOTE: replacements are literal, case-sensitive, and applied in tuple order.
_PHRASE_ABBREVIATIONS: tuple[tuple[str, str], ...] = (
    ("in order to", "to"),
    ("as well as", "&"),
    ("due to the fact that", "because"),
    ("in the event that", "if"),
    ("at this point in time", "now"),
    ("for the purpose of", "to"),
    ("in addition to", "also"),
    ("with regard to", "re:"),
    ("a large number of", "many"),
    ("the majority of", "most"),
    ("in the process of", "while"),
    ("on the other hand", "however"),
    ("take into account", "consider"),
    ("it is important to note that", "note:"),
    ("please note that", "note:"),
    ("it should be noted that", "note:"),
)

# single-word abbreviations (applied via regex word boundaries)
_WORD_ABBREVIATIONS: dict[str, str] = {
    "implementation": "impl",
    "implementations": "impls",
    "configuration": "config",
    "configurations": "configs",
    "production": "prod",
    "development": "dev",
    "environment": "env",
    "environments": "envs",
    "application": "app",
    "applications": "apps",
    "infrastructure": "infra",
    "authentication": "auth",
    "authorization": "authz",
    "repository": "repo",
    "repositories": "repos",
    "documentation": "docs",
    "specification": "spec",
    "specifications": "specs",
    "requirements": "reqs",
    "approximately": "~",
    "notification": "notif",
    "notifications": "notifs",
}

# filler phrases stripped entirely (claw-compactor Abbrev pattern)
# NOTE: the variant with a trailing space precedes the bare-comma variant so
# the following space is consumed when present.
_FILLER_PHRASES: tuple[str, ...] = (
    "Furthermore, ",
    "Furthermore,",
    "In addition, ",
    "In addition,",
    "Moreover, ",
    "Moreover,",
    "Additionally, ",
    "Additionally,",
    "Having said that, ",
    "Having said that,",
    "It is worth noting that ",
    "As mentioned earlier, ",
    "As previously stated, ",
)

# Case-insensitive alternation over every abbreviation key, anchored on
# word boundaries so substrings inside longer words never match.
_WORD_ABBREV_RE = re.compile(
    r"\b(" + "|".join(re.escape(w) for w in _WORD_ABBREVIATIONS) + r")\b",
    re.IGNORECASE,
)
129
+
130
+
131
class TextMinimizer(Minimizer):
    """Compress natural language text for LLM consumption.

    Strategies:
    1. Normalize whitespace (always)
    2. Apply phrase abbreviations (always)
    3. Remove stopwords (aggressive) — safe for context/instructions, not prose
    """

    def _minimize(self, obj: Any, *, aggressive: bool = False, **kw: Any) -> str:
        text = obj if isinstance(obj, str) else str(obj)

        # collapse runs of spaces/tabs, then runs of 3+ newlines
        text = _MULTI_NEWLINE.sub("\n\n", _MULTI_SPACE.sub(" ", text))

        # drop filler phrases, then shrink verbose phrases
        for needle in _FILLER_PHRASES:
            text = text.replace(needle, "")
        for verbose, concise in _PHRASE_ABBREVIATIONS:
            text = text.replace(verbose, concise)

        # word-level abbreviations, preserving the original case style
        text = _WORD_ABBREV_RE.sub(_word_abbrev_replace, text)

        if aggressive:
            text = _remove_stopwords(text)

        return text.strip()
162
+
163
+
164
def _word_abbrev_replace(m: re.Match[str]) -> str:
    """Replace matched word with its abbreviation, preserving case style."""
    word = m.group(0)
    abbrev = _WORD_ABBREVIATIONS[word.lower()]
    if word.isupper():
        # ALL-CAPS input → ALL-CAPS abbreviation
        return abbrev.upper()
    # Title-case input → capitalized abbreviation; otherwise lowercase form
    return abbrev.capitalize() if word[0].isupper() else abbrev
173
+
174
+
175
def _remove_stopwords(text: str) -> str:
    """Remove stopwords while preserving line structure."""
    cleaned: list[str] = []
    for line in text.split("\n"):
        # compare lowercased, with common trailing punctuation stripped
        survivors = (
            word for word in line.split() if word.lower().strip(".,;:!?") not in _STOPWORDS
        )
        cleaned.append(" ".join(survivors))
    return "\n".join(cleaned)
+ return "\n".join(lines)
ptk/py.typed ADDED
File without changes