python-token-killer 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ptk/__init__.py +166 -0
- ptk/_base.py +137 -0
- ptk/_types.py +126 -0
- ptk/minimizers/__init__.py +17 -0
- ptk/minimizers/_code.py +156 -0
- ptk/minimizers/_dict.py +167 -0
- ptk/minimizers/_diff.py +83 -0
- ptk/minimizers/_list.py +87 -0
- ptk/minimizers/_log.py +94 -0
- ptk/minimizers/_text.py +182 -0
- ptk/py.typed +0 -0
- python_token_killer-0.1.0.dist-info/METADATA +269 -0
- python_token_killer-0.1.0.dist-info/RECORD +15 -0
- python_token_killer-0.1.0.dist-info/WHEEL +4 -0
- python_token_killer-0.1.0.dist-info/licenses/LICENSE +21 -0
ptk/minimizers/_dict.py
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
"""Dict/JSON minimizer — biggest token wins live here."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from ptk._base import Minimizer, strip_nullish
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class DictMinimizer(Minimizer):
    """Compress dicts via null-stripping, key shortening, and compact serialization.

    Strategies applied in order:
    1. Strip nullish values (None, "", [], {})
    2. Flatten single-child nesting (aggressive)
    3. Shorten keys to abbreviated forms (aggressive)
    4. Serialize with minimal separators

    The `format` kwarg controls output encoding:
    - "json" (default) — compact json.dumps
    - "kv" — key:value lines (great for flat dicts)
    - "tabular" — header-once tabular format (auto for list-of-dicts values)
    """

    def _minimize(self, obj: Any, *, aggressive: bool = False, **kw: Any) -> str:
        # Non-dict payloads are wrapped so every strategy below applies uniformly.
        data = obj if isinstance(obj, dict) else {"_": obj}
        data = strip_nullish(data)

        if aggressive:
            # order matters: flatten first so dotted-segment shortening sees
            # the composite keys, then shorten any remaining plain keys
            for transform in (_flatten_single_children, _shorten_dotted_keys, _shorten_keys):
                data = transform(data)

        # dispatch on the requested output encoding; compact JSON is the fallback
        encoding = kw.get("format", "json")
        if encoding == "kv":
            return _to_kv(data)
        if encoding == "tabular":
            return _to_tabular(data)
        return json.dumps(data, separators=(",", ":"), default=str)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
# ── internal helpers ────────────────────────────────────────────────────
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _flatten_single_children(d: dict[str, Any], _depth: int = 0) -> dict[str, Any]:
|
|
49
|
+
"""Collapse {"a": {"b": val}} → {"a.b": val} up to 4 levels."""
|
|
50
|
+
if _depth > 4:
|
|
51
|
+
return d
|
|
52
|
+
out: dict[str, Any] = {}
|
|
53
|
+
for k, v in d.items():
|
|
54
|
+
if isinstance(v, dict) and len(v) == 1:
|
|
55
|
+
inner_k, inner_v = next(iter(v.items()))
|
|
56
|
+
flattened = _flatten_single_children({f"{k}.{inner_k}": inner_v}, _depth + 1)
|
|
57
|
+
out.update(flattened)
|
|
58
|
+
else:
|
|
59
|
+
out[k] = v
|
|
60
|
+
return out
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
# common verbose key → short form (extensible)
|
|
64
|
+
_KEY_MAP: dict[str, str] = {
|
|
65
|
+
"description": "desc",
|
|
66
|
+
"message": "msg",
|
|
67
|
+
"timestamp": "ts",
|
|
68
|
+
"created_at": "ts",
|
|
69
|
+
"updated_at": "upd",
|
|
70
|
+
"configuration": "cfg",
|
|
71
|
+
"config": "cfg",
|
|
72
|
+
"environment": "env",
|
|
73
|
+
"database": "db",
|
|
74
|
+
"information": "info",
|
|
75
|
+
"response": "resp",
|
|
76
|
+
"request": "req",
|
|
77
|
+
"function": "fn",
|
|
78
|
+
"parameters": "params",
|
|
79
|
+
"arguments": "args",
|
|
80
|
+
"exception": "exc",
|
|
81
|
+
"traceback": "tb",
|
|
82
|
+
"status_code": "code",
|
|
83
|
+
"content_type": "ctype",
|
|
84
|
+
"application": "app",
|
|
85
|
+
"transaction": "txn",
|
|
86
|
+
"identifier": "id",
|
|
87
|
+
"metadata": "meta",
|
|
88
|
+
"properties": "props",
|
|
89
|
+
"connection": "conn",
|
|
90
|
+
"password": "pw",
|
|
91
|
+
"username": "user",
|
|
92
|
+
"directory": "dir",
|
|
93
|
+
"reference": "ref",
|
|
94
|
+
"implementation": "impl",
|
|
95
|
+
"notifications": "notifs",
|
|
96
|
+
"repository": "repo",
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def _shorten_keys(d: dict[str, Any]) -> dict[str, Any]:
|
|
101
|
+
"""Recursively shorten known verbose keys.
|
|
102
|
+
|
|
103
|
+
If two keys map to the same short form (e.g. 'timestamp' and 'created_at'
|
|
104
|
+
both → 'ts'), the FIRST key wins and the second is kept unshortened to
|
|
105
|
+
prevent silent data loss.
|
|
106
|
+
"""
|
|
107
|
+
out: dict[str, Any] = {}
|
|
108
|
+
used_shorts: set[str] = set()
|
|
109
|
+
for k, v in d.items():
|
|
110
|
+
short = _KEY_MAP.get(k, k)
|
|
111
|
+
# avoid collision: if short form already used, keep original key
|
|
112
|
+
if short in used_shorts and short != k:
|
|
113
|
+
short = k
|
|
114
|
+
used_shorts.add(short)
|
|
115
|
+
if isinstance(v, dict):
|
|
116
|
+
out[short] = _shorten_keys(v)
|
|
117
|
+
elif isinstance(v, list):
|
|
118
|
+
out[short] = [_shorten_keys(i) if isinstance(i, dict) else i for i in v]
|
|
119
|
+
else:
|
|
120
|
+
out[short] = v
|
|
121
|
+
return out
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def _shorten_dotted_keys(d: dict[str, Any]) -> dict[str, Any]:
    """Shorten individual segments of dotted keys (from flattening)."""
    shortened: dict[str, Any] = {}
    for key, value in d.items():
        new_key = key
        if isinstance(key, str) and "." in key:
            new_key = ".".join(_KEY_MAP.get(seg, seg) for seg in key.split("."))
        shortened[new_key] = value
    return shortened
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def _to_kv(d: dict[str, Any], _prefix: str = "") -> str:
|
|
137
|
+
"""Flat key:value format — one line per leaf."""
|
|
138
|
+
lines: list[str] = []
|
|
139
|
+
for k, v in d.items():
|
|
140
|
+
full = f"{_prefix}{k}"
|
|
141
|
+
if isinstance(v, dict):
|
|
142
|
+
lines.append(_to_kv(v, f"{full}."))
|
|
143
|
+
else:
|
|
144
|
+
lines.append(f"{full}:{v}")
|
|
145
|
+
return "\n".join(lines)
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def _to_tabular(d: dict[str, Any]) -> str:
|
|
149
|
+
"""Render any list-of-dicts values as header-once tabular rows.
|
|
150
|
+
|
|
151
|
+
Non-list values render as kv pairs above the table.
|
|
152
|
+
"""
|
|
153
|
+
kv_lines: list[str] = []
|
|
154
|
+
table_lines: list[str] = []
|
|
155
|
+
|
|
156
|
+
for k, v in d.items():
|
|
157
|
+
if isinstance(v, list) and v and isinstance(v[0], dict):
|
|
158
|
+
# collect union of all keys (preserving order)
|
|
159
|
+
fields: list[str] = list(dict.fromkeys(f for row in v for f in row))
|
|
160
|
+
table_lines.append(f"{k}[{len(v)}]{{{','.join(fields)}}}:")
|
|
161
|
+
for row in v:
|
|
162
|
+
vals = (str(row.get(f, "")) for f in fields)
|
|
163
|
+
table_lines.append(f" {','.join(vals)}")
|
|
164
|
+
else:
|
|
165
|
+
kv_lines.append(f"{k}:{v}")
|
|
166
|
+
|
|
167
|
+
return "\n".join(kv_lines + table_lines)
|
ptk/minimizers/_diff.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
"""Diff minimizer — fold unchanged context, keep only meaningful changes."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from ptk._base import Minimizer
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class DiffMinimizer(Minimizer):
    """Compress git diffs by folding unchanged context lines.

    Strategies:
    1. Keep hunk headers (@@) and file headers (---, +++)
    2. Keep added (+) and removed (-) lines
    3. Collapse context (space-prefixed) lines to `... N lines ...`
    4. Aggressive: strip file mode changes, index lines, trailing whitespace diffs
    """

    # max unchanged context lines to keep around changes
    CONTEXT_LINES = 2

    def _minimize(self, obj: Any, *, aggressive: bool = False, **kw: Any) -> str:
        text = obj if isinstance(obj, str) else str(obj)
        # aggressive mode keeps no context at all unless the caller overrides
        default_ctx = 0 if aggressive else self.CONTEXT_LINES
        ctx = kw.get("context_lines", default_ctx)
        return _fold_diff(text, context=ctx, aggressive=aggressive)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _fold_diff(text: str, *, context: int, aggressive: bool) -> str:
    """Fold runs of unchanged context in *text*, keeping at most *context* per edge."""
    out: list[str] = []
    pending: list[str] = []

    def flush() -> None:
        # collapse buffered context lines, keeping only the edges
        if pending:
            _flush_context(out, pending, keep=context)
            pending.clear()

    for line in text.split("\n"):
        # always skip noise in aggressive mode
        if aggressive and _is_noise(line):
            continue
        if _is_significant(line):
            flush()
            out.append(line)
        else:
            pending.append(line)

    flush()  # trailing context
    return "\n".join(out)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _is_significant(line: str) -> bool:
|
|
56
|
+
"""Lines that carry meaningful diff information."""
|
|
57
|
+
return (
|
|
58
|
+
line.startswith(("+", "-", "@@", "diff ", "\\ ")) and not line.startswith(("+++", "---"))
|
|
59
|
+
) or line.startswith(("+++", "---"))
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _is_noise(line: str) -> bool:
|
|
63
|
+
"""Lines that are almost never useful to an LLM."""
|
|
64
|
+
return (
|
|
65
|
+
line.startswith(("index ", "old mode", "new mode", "similarity"))
|
|
66
|
+
or line.startswith("rename ")
|
|
67
|
+
or (line.startswith("Binary files") and "differ" in line)
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _flush_context(result: list[str], buf: list[str], *, keep: int) -> None:
|
|
72
|
+
"""Collapse a block of context lines, keeping at most `keep` on each end."""
|
|
73
|
+
if len(buf) <= keep * 2 + 1:
|
|
74
|
+
result.extend(buf)
|
|
75
|
+
return
|
|
76
|
+
|
|
77
|
+
if keep > 0:
|
|
78
|
+
result.extend(buf[:keep])
|
|
79
|
+
folded = len(buf) - keep * 2
|
|
80
|
+
if folded > 0:
|
|
81
|
+
result.append(f" ... {folded} lines ...")
|
|
82
|
+
if keep > 0:
|
|
83
|
+
result.extend(buf[-keep:])
|
ptk/minimizers/_list.py
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
"""List/array minimizer — dedup, sampling, schema-once tabular."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from collections import Counter
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from ptk._base import Minimizer, strip_nullish
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class ListMinimizer(Minimizer):
    """Compress lists via dedup, sampling, and tabular encoding.

    Strategies:
    1. Strip nullish items
    2. Deduplicate exact-match items with counts
    3. For uniform list-of-dicts: schema-once tabular output
    4. For large arrays: statistical sampling (aggressive)
    """

    # max items before sampling kicks in (aggressive mode)
    SAMPLE_THRESHOLD = 50

    def _minimize(self, obj: Any, *, aggressive: bool = False, **kw: Any) -> str:
        if not isinstance(obj, (list, tuple)):
            return json.dumps(obj, separators=(",", ":"), default=str)

        cleaned = [
            strip_nullish(item) if isinstance(item, dict) else item
            for item in obj
            if item is not None
        ]
        if not cleaned:
            return "[]"

        limit = self.SAMPLE_THRESHOLD
        oversized = aggressive and len(cleaned) > limit

        # uniform list-of-dicts → schema-once tabular encoding
        # (cleaned is non-empty here, so all() alone decides uniformity)
        if all(isinstance(item, dict) for item in cleaned):
            if oversized:
                cleaned = _sample(cleaned, limit)
            return _tabular(cleaned)

        # primitive/mixed list → dedup with counts (truncated when oversized)
        if oversized:
            cleaned = cleaned[:limit]
        return _dedup_list(cleaned)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _tabular(rows: list[dict[str, Any]]) -> str:
|
|
48
|
+
"""Schema-once tabular: declare fields in header, one CSV-ish row per item."""
|
|
49
|
+
fields: list[str] = list(dict.fromkeys(f for row in rows for f in row))
|
|
50
|
+
header = f"[{len(rows)}]{{{','.join(fields)}}}:"
|
|
51
|
+
body = "\n".join(",".join(str(row.get(f, "")) for f in fields) for row in rows)
|
|
52
|
+
return f"{header}\n{body}"
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _dedup_list(items: list[Any]) -> str:
|
|
56
|
+
"""Collapse duplicate primitives: [a, a, a, b] → a (x3)\nb"""
|
|
57
|
+
# json-serialize each item for stable hashing
|
|
58
|
+
serialized = [json.dumps(i, separators=(",", ":"), default=str) for i in items]
|
|
59
|
+
counts = Counter(serialized)
|
|
60
|
+
|
|
61
|
+
# preserve first-seen order
|
|
62
|
+
seen: set[str] = set()
|
|
63
|
+
lines: list[str] = []
|
|
64
|
+
for s in serialized:
|
|
65
|
+
if s in seen:
|
|
66
|
+
continue
|
|
67
|
+
seen.add(s)
|
|
68
|
+
c = counts[s]
|
|
69
|
+
# strip quotes from simple strings for readability
|
|
70
|
+
display = s.strip('"') if s.startswith('"') and s.endswith('"') else s
|
|
71
|
+
lines.append(f"{display} (x{c})" if c > 1 else display)
|
|
72
|
+
|
|
73
|
+
return "\n".join(lines)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _sample(items: list[dict[str, Any]], n: int) -> list[dict[str, Any]]:
|
|
77
|
+
"""Deterministic even-spaced sampling — keeps first and last."""
|
|
78
|
+
if n <= 0:
|
|
79
|
+
return []
|
|
80
|
+
if len(items) <= n:
|
|
81
|
+
return items
|
|
82
|
+
if n == 1:
|
|
83
|
+
return [items[0]]
|
|
84
|
+
step = (len(items) - 1) / (n - 1)
|
|
85
|
+
indices = {round(i * step) for i in range(n)}
|
|
86
|
+
sampled = [items[i] for i in sorted(indices)]
|
|
87
|
+
return sampled
|
ptk/minimizers/_log.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
"""Log minimizer — dedup repeated lines, error-only filtering."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from ptk._base import Minimizer, dedup_lines
|
|
9
|
+
|
|
10
|
+
# precompiled
# severity tokens anywhere in a line; IGNORECASE so "error" and "ERROR" both hit
_LOG_LEVEL = re.compile(
    r"\b(ERROR|WARN(?:ING)?|CRITICAL|FATAL|EXCEPTION|SEVERE|PANIC)\b",
    re.IGNORECASE,
)
# leading ISO-ish timestamp: date, time, optional fractional seconds and
# UTC offset / Z suffix; MULTILINE so sub() strips it at every line start
_TIMESTAMP = re.compile(
    r"^\d{4}[-/]\d{2}[-/]\d{2}[T ]\d{2}:\d{2}:\d{2}[.,]?\d*\s*(?:[+-]\d{2}:?\d{2}|Z)?\s*",
    re.MULTILINE,
)
# stack trace markers — lines matching these are always preserved
# (Python traceback headers / File lines / raise sites, *Error:/*Exception:/
# *Warning: heads, and "at pkg.Class(" style frames)
_STACKTRACE_RE = re.compile(
    r'^\s*(Traceback \(most recent|File "|\s+raise |'
    r"\w+Error:|\w+Exception:|\w+Warning:|at \S+\.\S+\()",
    re.MULTILINE,
)
# test-runner pass markers — never kept as context in errors_only mode
# (pytest "PASSED", Go "--- PASS", unittest "... ok", cargo "test result: ok",
# and check-mark style reporters)
_PASS_MARKERS: frozenset[str] = frozenset(
    {
        "PASSED",
        " PASS",
        "--- PASS",
        "... ok",
        " ok",
        "test result: ok",
        "✓",
    }
)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class LogMinimizer(Minimizer):
    """Compress log output via deduplication and error filtering.

    Strategies:
    1. Strip timestamps (aggressive) — timestamps are rarely useful to an LLM
    2. Collapse consecutive duplicate lines with counts
    3. Keep only error/warning lines (aggressive)
    """

    def _minimize(self, obj: Any, *, aggressive: bool = False, **kw: Any) -> str:
        text = obj if isinstance(obj, str) else str(obj)

        # bail early on blank input
        if not text.strip():
            return ""

        if aggressive:
            text = _TIMESTAMP.sub("", text)

        deduped = dedup_lines(text)

        wants_errors_only = aggressive or kw.get("errors_only", False)
        filtered = _errors_only(deduped) if wants_errors_only else deduped
        return filtered.strip()
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _errors_only(text: str) -> str:
    """Keep error/warning lines, stack traces, 'failed' keyword lines, + context.

    Pass-marker lines (PASSED, --- PASS, ... ok, etc.) are never kept even
    as context around errors — they are pure noise in an error report.
    """
    lines = text.split("\n")
    total = len(lines)
    kept: set[int] = set()

    def important(line: str) -> bool:
        if _LOG_LEVEL.search(line) or _STACKTRACE_RE.match(line):
            return True
        lowered = line.lower()
        needles = (
            "failed",
            "fail:",       # go test: --- FAIL: TestName
            " fail ",      # standalone FAIL line
            "error",       # build errors, compiler output
            "panicked",    # rust panic
            "assertion",   # assertion errors
        )
        return any(needle in lowered for needle in needles)

    for i, line in enumerate(lines):
        if not important(line):
            continue
        # keep the important line + 1 line of context before/after,
        # but exclude pure pass-marker lines from the context window
        for j in range(max(0, i - 1), min(total, i + 2)):
            if not any(marker in lines[j] for marker in _PASS_MARKERS):
                kept.add(j)

    if not kept:
        return text  # no errors found — return deduped version
    return "\n".join(lines[i] for i in sorted(kept))
|
ptk/minimizers/_text.py
ADDED
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
"""Text minimizer — whitespace normalization, stopword removal, abbreviation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from ptk._base import Minimizer
|
|
9
|
+
|
|
10
|
+
# ── precompiled ─────────────────────────────────────────────────────────

# runs of spaces/tabs collapse to one space; 3+ newlines collapse to a blank line
_MULTI_SPACE = re.compile(r"[ \t]+")
_MULTI_NEWLINE = re.compile(r"\n{3,}")

# high-frequency English stopwords that add tokens but rarely carry meaning
# in LLM context (intentionally conservative — we never want to destroy meaning)
_STOPWORDS: frozenset[str] = frozenset(
    {
        "the",
        "a",
        "an",
        "is",
        "are",
        "was",
        "were",
        "be",
        "been",
        "being",
        "have",
        "has",
        "had",
        "do",
        "does",
        "did",
        "will",
        "would",
        "shall",
        "should",
        "may",
        "might",
        "must",
        "can",
        "could",
        "that",
        "which",
        "who",
        "whom",
        "this",
        "these",
        "those",
        "am",
        "its",
        "very",
        "just",
        "also",
        "really",
        "quite",
        "rather",
    }
)

# common long phrases → shorter equivalents
# (applied by exact, case-sensitive str.replace in TextMinimizer)
_PHRASE_ABBREVIATIONS: tuple[tuple[str, str], ...] = (
    ("in order to", "to"),
    ("as well as", "&"),
    ("due to the fact that", "because"),
    ("in the event that", "if"),
    ("at this point in time", "now"),
    ("for the purpose of", "to"),
    ("in addition to", "also"),
    ("with regard to", "re:"),
    ("a large number of", "many"),
    ("the majority of", "most"),
    ("in the process of", "while"),
    ("on the other hand", "however"),
    ("take into account", "consider"),
    ("it is important to note that", "note:"),
    ("please note that", "note:"),
    ("it should be noted that", "note:"),
)

# single-word abbreviations (applied via regex word boundaries)
_WORD_ABBREVIATIONS: dict[str, str] = {
    "implementation": "impl",
    "implementations": "impls",
    "configuration": "config",
    "configurations": "configs",
    "production": "prod",
    "development": "dev",
    "environment": "env",
    "environments": "envs",
    "application": "app",
    "applications": "apps",
    "infrastructure": "infra",
    "authentication": "auth",
    "authorization": "authz",
    "repository": "repo",
    "repositories": "repos",
    "documentation": "docs",
    "specification": "spec",
    "specifications": "specs",
    "requirements": "reqs",
    "approximately": "~",
    "notification": "notif",
    "notifications": "notifs",
}

# filler phrases stripped entirely (claw-compactor Abbrev pattern);
# the spaced variant precedes the bare-comma variant so the space is consumed
_FILLER_PHRASES: tuple[str, ...] = (
    "Furthermore, ",
    "Furthermore,",
    "In addition, ",
    "In addition,",
    "Moreover, ",
    "Moreover,",
    "Additionally, ",
    "Additionally,",
    "Having said that, ",
    "Having said that,",
    "It is worth noting that ",
    "As mentioned earlier, ",
    "As previously stated, ",
)

# one alternation over every abbreviatable word, bounded by \b on both sides;
# IGNORECASE so the replacer can restore the original word's case style
_WORD_ABBREV_RE = re.compile(
    r"\b(" + "|".join(re.escape(w) for w in _WORD_ABBREVIATIONS) + r")\b",
    re.IGNORECASE,
)
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
class TextMinimizer(Minimizer):
    """Compress natural language text for LLM consumption.

    Strategies:
    1. Normalize whitespace (always)
    2. Apply phrase abbreviations (always)
    3. Remove stopwords (aggressive) — safe for context/instructions, not prose
    """

    def _minimize(self, obj: Any, *, aggressive: bool = False, **kw: Any) -> str:
        text = obj if isinstance(obj, str) else str(obj)

        # normalize whitespace: squeeze spaces/tabs, then cap blank runs
        text = _MULTI_NEWLINE.sub("\n\n", _MULTI_SPACE.sub(" ", text))

        # drop filler phrases outright, then swap verbose phrases for short ones
        for filler in _FILLER_PHRASES:
            text = text.replace(filler, "")
        for verbose, concise in _PHRASE_ABBREVIATIONS:
            text = text.replace(verbose, concise)

        # abbreviate common words (case-preserving)
        text = _WORD_ABBREV_RE.sub(_word_abbrev_replace, text)

        return (_remove_stopwords(text) if aggressive else text).strip()
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def _word_abbrev_replace(m: re.Match[str]) -> str:
    """Replace matched word with its abbreviation, preserving case style."""
    original = m.group(0)
    # lookup is lowercase because the regex matched case-insensitively
    replacement = _WORD_ABBREVIATIONS[original.lower()]
    if original.isupper():
        return replacement.upper()
    return replacement.capitalize() if original[0].isupper() else replacement
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def _remove_stopwords(text: str) -> str:
    """Remove stopwords while preserving line structure."""
    kept_lines: list[str] = []
    for line in text.split("\n"):
        # edge punctuation is ignored for the stopword test only; the word is
        # emitted unmodified when kept
        survivors = (
            word
            for word in line.split()
            if word.lower().strip(".,;:!?") not in _STOPWORDS
        )
        kept_lines.append(" ".join(survivors))
    return "\n".join(kept_lines)
|
ptk/py.typed
ADDED
|
File without changes
|