python-token-killer 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ptk/__init__.py +166 -0
- ptk/_base.py +137 -0
- ptk/_types.py +126 -0
- ptk/minimizers/__init__.py +17 -0
- ptk/minimizers/_code.py +156 -0
- ptk/minimizers/_dict.py +167 -0
- ptk/minimizers/_diff.py +83 -0
- ptk/minimizers/_list.py +87 -0
- ptk/minimizers/_log.py +94 -0
- ptk/minimizers/_text.py +182 -0
- ptk/py.typed +0 -0
- python_token_killer-0.1.0.dist-info/METADATA +269 -0
- python_token_killer-0.1.0.dist-info/RECORD +15 -0
- python_token_killer-0.1.0.dist-info/WHEEL +4 -0
- python_token_killer-0.1.0.dist-info/licenses/LICENSE +21 -0
ptk/__init__.py
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
"""ptk — Python Token Killer.
|
|
2
|
+
|
|
3
|
+
Minimize LLM tokens from Python objects in one call.
|
|
4
|
+
|
|
5
|
+
import ptk
|
|
6
|
+
ptk.minimize({"users": [{"name": "Alice", "age": 30}]})
|
|
7
|
+
ptk(some_dict) # shorthand
|
|
8
|
+
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
from ptk._base import MinResult, _serialize
|
|
16
|
+
from ptk._types import ContentType, detect
|
|
17
|
+
from ptk.minimizers import (
|
|
18
|
+
CodeMinimizer,
|
|
19
|
+
DictMinimizer,
|
|
20
|
+
DiffMinimizer,
|
|
21
|
+
ListMinimizer,
|
|
22
|
+
LogMinimizer,
|
|
23
|
+
TextMinimizer,
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
__version__ = "0.1.0"
__all__ = [
    "minimize",
    "stats",
    "detect_type",
    "MinResult",
    "ContentType",
    # minimizer classes (for direct use / subclassing)
    "DictMinimizer",
    "ListMinimizer",
    "CodeMinimizer",
    "LogMinimizer",
    "DiffMinimizer",
    "TextMinimizer",
]

# ── singleton minimizer instances (created once, reused) ────────────────

# Dispatch table: one minimizer instance per content type. `minimize()`
# and `stats()` route through this after resolving the content type.
_ROUTER: dict[ContentType, Any] = {
    ContentType.DICT: DictMinimizer(),
    ContentType.LIST: ListMinimizer(),
    ContentType.CODE: CodeMinimizer(),
    ContentType.LOG: LogMinimizer(),
    ContentType.DIFF: DiffMinimizer(),
    ContentType.TEXT: TextMinimizer(),
}
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
# ── public API ──────────────────────────────────────────────────────────
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def minimize(
    obj: Any,
    *,
    aggressive: bool = False,
    content_type: ContentType | str | None = None,
    **kw: Any,
) -> str:
    """Minimize tokens from any Python object.

    Args:
        obj: dict, list, str (code/log/diff/text), or anything with __str__.
        aggressive: Apply maximum compression (may lose some fidelity).
        content_type: Force a content type instead of auto-detecting.
            Accepts a ContentType enum member or its name as a string
            ("dict", "code", "log", etc.)
        **kw: Forwarded to the minimizer (format, mode, errors_only, etc.)

    Returns:
        Minimized string representation.
    """
    resolved = _resolve_type(obj, content_type)
    minimizer = _ROUTER[resolved]
    return minimizer.run(obj, aggressive=aggressive, **kw).output
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def stats(
    obj: Any,
    *,
    aggressive: bool = False,
    content_type: ContentType | str | None = None,
    **kw: Any,
) -> dict[str, Any]:
    """Minimize *obj* and return the output together with compression metrics.

    Returns:
        {
            "output": <minimized string>,
            "original_len": int,            # character count
            "minimized_len": int,           # character count
            "savings_pct": float,           # e.g. 73.2
            "content_type": str,            # e.g. "dict"
            "original_tokens": int,         # tiktoken count, or ~len/4 heuristic
            "minimized_tokens": int,
        }
    """
    ct = _resolve_type(obj, content_type)
    res = _ROUTER[ct].run(obj, aggressive=aggressive, **kw)

    # Token estimates compare the raw serialized form against the output.
    original_tokens, minimized_tokens = _estimate_tokens(_serialize(obj), res.output)

    return {
        "output": res.output,
        "original_len": res.original_len,
        "minimized_len": res.minimized_len,
        "savings_pct": res.savings_pct,
        "content_type": ct.name.lower(),
        "original_tokens": original_tokens,
        "minimized_tokens": minimized_tokens,
    }
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def detect_type(obj: Any) -> str:
    """Auto-detect *obj*'s content type and return it as a lowercase string."""
    ct = detect(obj)
    return ct.name.lower()
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
# ── callable module trick ───────────────────────────────────────────────
|
|
124
|
+
# Allows `import ptk; ptk(obj)` as shorthand for `ptk.minimize(obj)`.
|
|
125
|
+
|
|
126
|
+
import sys as _sys # noqa: E402
|
|
127
|
+
import types as _types # noqa: E402
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
class _CallableModule(_types.ModuleType):
    """Module that's also callable — `ptk(obj)` works.

    Instances of this class behave like normal modules, but calling the
    module object itself delegates to `minimize`.
    """

    def __call__(self, obj: Any, **kw: Any) -> str:
        # `ptk(obj, **kw)` is shorthand for `ptk.minimize(obj, **kw)`.
        return minimize(obj, **kw)

    def __repr__(self) -> str:
        return f"<module 'ptk' v{__version__}>"


# Swap this module's class so `ptk(...)` works.
# Reassigning __class__ on the live module object in sys.modules makes
# every existing and future `import ptk` reference callable. Must run at
# import time, after the definitions above.
_self = _sys.modules[__name__]
_self.__class__ = _CallableModule
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
# ── private helpers ─────────────────────────────────────────────────────
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def _resolve_type(obj: Any, hint: ContentType | str | None) -> ContentType:
    """Resolve which ContentType to use for *obj*.

    Args:
        obj: The object being minimized (consulted only when auto-detecting).
        hint: An explicit ContentType member, its name as a
            case-insensitive string, or None to auto-detect from *obj*.

    Returns:
        The resolved ContentType.

    Raises:
        KeyError: If *hint* is a string that names no ContentType member.
    """
    if hint is None:
        return detect(obj)
    if isinstance(hint, ContentType):
        return hint
    # string lookup — re-raise with the valid options so a typo like
    # content_type="jsn" produces an actionable message instead of
    # a bare KeyError('JSN').
    try:
        return ContentType[hint.upper()]
    except KeyError:
        valid = ", ".join(ct.name.lower() for ct in ContentType)
        raise KeyError(
            f"unknown content_type {hint!r}; expected one of: {valid}"
        ) from None
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def _estimate_tokens(original: str, minimized: str) -> tuple[int | None, int | None]:
|
|
158
|
+
"""Try tiktoken for accurate counts, fall back to len//4 heuristic."""
|
|
159
|
+
try:
|
|
160
|
+
import tiktoken
|
|
161
|
+
|
|
162
|
+
enc = tiktoken.get_encoding("cl100k_base")
|
|
163
|
+
return len(enc.encode(original)), len(enc.encode(minimized))
|
|
164
|
+
except ImportError:
|
|
165
|
+
# fast heuristic: ~4 chars per token for English
|
|
166
|
+
return len(original) // 4, len(minimized) // 4
|
ptk/_base.py
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
"""Base minimizer protocol + shared utilities."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from abc import ABC, abstractmethod
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass(frozen=True, slots=True)
|
|
11
|
+
class MinResult:
|
|
12
|
+
"""Immutable result from a minimizer pass."""
|
|
13
|
+
|
|
14
|
+
output: str
|
|
15
|
+
original_len: int
|
|
16
|
+
minimized_len: int
|
|
17
|
+
|
|
18
|
+
@property
|
|
19
|
+
def savings_pct(self) -> float:
|
|
20
|
+
if self.original_len == 0:
|
|
21
|
+
return 0.0
|
|
22
|
+
return round((1 - self.minimized_len / self.original_len) * 100, 1)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class Minimizer(ABC):
    """Abstract base for every minimizer strategy.

    Subclasses implement only `_minimize`; the public `run()` method
    does the length bookkeeping and wraps everything in a `MinResult`,
    degrading gracefully when a strategy cannot handle its input.
    """

    @abstractmethod
    def _minimize(self, obj: Any, *, aggressive: bool = False, **kw: Any) -> str:
        """Return the minimized string representation."""

    def run(self, obj: Any, *, aggressive: bool = False, **kw: Any) -> MinResult:
        """Minimize *obj* and return a MinResult with before/after lengths."""
        source = _serialize(obj)
        try:
            reduced = self._minimize(obj, aggressive=aggressive, **kw)
        except (RecursionError, ValueError, TypeError, OverflowError):
            # Graceful degradation: str() is always a valid, if
            # uncompressed, rendering of the input.
            reduced = str(obj)
        return MinResult(
            output=reduced,
            original_len=len(source),
            minimized_len=len(reduced),
        )
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
# ── shared helpers (used across minimizers) ─────────────────────────────
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _serialize(obj: Any) -> str:
|
|
54
|
+
"""Cheaply serialize an object to string for length measurement.
|
|
55
|
+
|
|
56
|
+
Must NEVER raise — this is used for metrics, not output.
|
|
57
|
+
Handles circular refs, non-serializable types, tuple keys, etc.
|
|
58
|
+
"""
|
|
59
|
+
if isinstance(obj, str):
|
|
60
|
+
return obj
|
|
61
|
+
if isinstance(obj, (dict, list, tuple)):
|
|
62
|
+
import json
|
|
63
|
+
|
|
64
|
+
try:
|
|
65
|
+
return json.dumps(obj, separators=(",", ":"), default=str)
|
|
66
|
+
except (ValueError, TypeError, OverflowError):
|
|
67
|
+
# circular reference, non-string keys, or other json failure
|
|
68
|
+
return str(obj)
|
|
69
|
+
return str(obj)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _is_nullish(v: object) -> bool:
|
|
73
|
+
"""Check if a value is 'empty' — None, "", [], or {}.
|
|
74
|
+
|
|
75
|
+
Type-checks first to avoid hashing unhashable types.
|
|
76
|
+
"""
|
|
77
|
+
if v is None:
|
|
78
|
+
return True
|
|
79
|
+
if isinstance(v, str):
|
|
80
|
+
return v == ""
|
|
81
|
+
if isinstance(v, list):
|
|
82
|
+
return len(v) == 0
|
|
83
|
+
if isinstance(v, dict):
|
|
84
|
+
return len(v) == 0
|
|
85
|
+
return False
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def strip_nullish(d: dict[str, Any]) -> dict[str, Any]:
|
|
89
|
+
"""Recursively strip None, empty string, empty list, empty dict values."""
|
|
90
|
+
out: dict[str, Any] = {}
|
|
91
|
+
for k, v in d.items():
|
|
92
|
+
if _is_nullish(v):
|
|
93
|
+
continue
|
|
94
|
+
if isinstance(v, dict):
|
|
95
|
+
cleaned = strip_nullish(v)
|
|
96
|
+
if cleaned:
|
|
97
|
+
out[k] = cleaned
|
|
98
|
+
elif isinstance(v, list):
|
|
99
|
+
cleaned_list = [
|
|
100
|
+
strip_nullish(i) if isinstance(i, dict) else i for i in v if not _is_nullish(i)
|
|
101
|
+
]
|
|
102
|
+
if cleaned_list:
|
|
103
|
+
out[k] = cleaned_list
|
|
104
|
+
else:
|
|
105
|
+
out[k] = v
|
|
106
|
+
return out
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def dedup_lines(text: str, *, threshold: int = 2) -> str:
    """Collapse runs of identical consecutive lines into `<line> (xN)`.

    Single pass over the input; runs shorter than *threshold* are
    emitted verbatim. O(n) time.
    """
    lines = text.split("\n")
    if len(lines) <= 1:
        return text

    out: list[str] = []
    current = lines[0]
    run = 1
    for candidate in lines[1:]:
        if candidate == current:
            run += 1
            continue
        _flush(out, current, run, threshold)
        current, run = candidate, 1
    # emit the trailing run
    _flush(out, current, run, threshold)
    return "\n".join(out)


def _flush(result: list[str], line: str, count: int, threshold: int) -> None:
    """Append one completed run: annotated if count >= threshold, else verbatim."""
    if count < threshold:
        result.extend([line] * count)
    else:
        result.append(f"{line} (x{count})")
|
ptk/_types.py
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
"""Content type detection — pure builtins, no deps."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from enum import Enum, auto
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class ContentType(Enum):
    """Kinds of content ptk knows how to minimize.

    Values match what auto() previously assigned (definition order).
    """

    DICT = 1
    LIST = 2
    CODE = 3
    LOG = 4
    DIFF = 5
    TEXT = 6
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# ── fast heuristics (order matters — first match wins) ──────────────────
|
|
18
|
+
|
|
19
|
+
# Keywords that, at the start of a whitespace-stripped line, identify
# source code. Consumed by detect() via startswith on the first ~30 lines.
# Spans multiple languages (Python, JS/TS, Java/C#, Go, Rust, shell shebang).
_CODE_MARKERS = frozenset(
    {
        "def ",
        "class ",
        "import ",
        "from ",
        "function ",
        "const ",
        "let ",
        "var ",
        "public ",
        "private ",
        "async ",
        "await ",
        "return ",
        "if __name__",
        "#!/",
        "package ",
        "func ",
        "fn ",
        "impl ",
        "module ",
        "export ",
        "interface ",
        "struct ",
    }
)

# Substrings that identify log or test-runner output anywhere in the
# first ~2KB of a string. detect() checks these BEFORE code markers,
# since log markers are the more specific signal.
_LOG_PATTERNS = frozenset(
    {
        # structured log levels
        "[INFO]",
        "[WARN]",
        "[ERROR]",
        "[DEBUG]",
        "[TRACE]",
        " INFO ",
        " WARN ",
        " ERROR ",
        " DEBUG ",
        " TRACE ",
        "INFO:",
        "WARN:",
        "ERROR:",
        "DEBUG:",
        "TRACE:",
        "WARNING:",
        "CRITICAL:",
        # test runner output (pytest, cargo test, go test, jest)
        "PASSED",
        "FAILED",
        "ERRORS",
        "--- PASS:",
        "--- FAIL:",  # go test
        "test result: ",  # cargo test
        " passed",
        " failed",  # pytest summary
        "✓",
        "✗",
        "✕",  # jest / vitest
    }
)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _looks_like_diff(head: str) -> bool:
|
|
84
|
+
"""Detect unified diff format. Requires @@ hunk header + file headers or diff --git."""
|
|
85
|
+
if "@@" not in head:
|
|
86
|
+
return False
|
|
87
|
+
# strong signal: diff --git header
|
|
88
|
+
if head.startswith("diff --git") or "\ndiff --git" in head:
|
|
89
|
+
return True
|
|
90
|
+
# also accept: --- line followed by +++ line (unified diff without git header)
|
|
91
|
+
has_minus = head.startswith("--- ") or "\n--- " in head
|
|
92
|
+
has_plus = "\n+++ " in head
|
|
93
|
+
return has_minus and has_plus
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def detect(obj: object) -> ContentType:
    """Auto-detect the content type of *obj*. O(1) for non-str types."""
    if isinstance(obj, dict):
        return ContentType.DICT
    if isinstance(obj, (list, tuple)):
        return ContentType.LIST
    if not isinstance(obj, str):
        # anything else gets stringified downstream and handled as text
        return ContentType.TEXT

    # ── string heuristics: inspect only the first ~2KB for speed ────
    head = obj[:2048]

    # Diff first — demands real unified-diff structure, not just --- or @@.
    if _looks_like_diff(head):
        return ContentType.DIFF

    # Logs before code: log markers are more specific. Test output
    # (e.g. pytest) often contains 'def test_foo():', which would trip
    # code detection, but PASSED/FAILED markers classify it as log first.
    for marker in _LOG_PATTERNS:
        if marker in head:
            return ContentType.LOG

    # Code: any code keyword opening a line within the first ~30 lines.
    markers = tuple(_CODE_MARKERS)
    for line in head.split("\n", 30):
        if line.lstrip().startswith(markers):
            return ContentType.CODE

    return ContentType.TEXT
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""Minimizer registry — auto-imports all strategies."""
|
|
2
|
+
|
|
3
|
+
from ptk.minimizers._code import CodeMinimizer
|
|
4
|
+
from ptk.minimizers._dict import DictMinimizer
|
|
5
|
+
from ptk.minimizers._diff import DiffMinimizer
|
|
6
|
+
from ptk.minimizers._list import ListMinimizer
|
|
7
|
+
from ptk.minimizers._log import LogMinimizer
|
|
8
|
+
from ptk.minimizers._text import TextMinimizer
|
|
9
|
+
|
|
10
|
+
# Public surface of ptk.minimizers: one minimizer class per content type.
__all__ = [
    "DictMinimizer",
    "ListMinimizer",
    "CodeMinimizer",
    "LogMinimizer",
    "DiffMinimizer",
    "TextMinimizer",
]
|
ptk/minimizers/_code.py
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
"""Code minimizer — comment stripping, whitespace normalization, signature extraction."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from ptk._base import Minimizer
|
|
9
|
+
|
|
10
|
+
# ── precompiled regexes (compiled once at import time) ──────────────────
|
|
11
|
+
|
|
12
|
+
# C-style block comments /* ... */, non-greedy, spanning lines.
_BLOCK_COMMENT = re.compile(r"/\*.*?\*/", re.DOTALL)
# Match strings first (to skip), then comments. Group 1 = string (keep), Group 2 = comment (strip).
_STRING_OR_COMMENT_C = re.compile(
    r"""("(?:[^"\\]|\\.)*"|'(?:[^'\\]|\\.)*')"""  # group 1: quoted string
    r"|"  # OR
    r"(//.*$)",  # group 2: C-style inline comment
    re.MULTILINE,
)
_STRING_OR_COMMENT_PY = re.compile(
    r"""("(?:[^"\\]|\\.)*"|'(?:[^'\\]|\\.)*')"""  # group 1: quoted string
    r"|"  # OR
    r"(#.*$)",  # group 2: Python inline comment
    re.MULTILINE,
)
# Triple-quoted (''' or """) docstrings, non-greedy across lines.
_DOCSTRING = re.compile(r"(\"\"\"[\s\S]*?\"\"\"|\'\'\'[\s\S]*?\'\'\')")
# Three or more consecutive newlines (2+ blank lines) — collapsed to one blank line in _clean().
_BLANK_LINES = re.compile(r"\n{3,}")
# Trailing spaces/tabs at end of line.
_TRAILING_WS = re.compile(r"[ \t]+$", re.MULTILINE)

# pragma/directive comments that must be preserved
# (substring match via _has_pragma; case-sensitive)
_PRAGMA_KEYWORDS: frozenset[str] = frozenset(
    {
        "noqa",
        "type: ignore",
        "type:ignore",
        "TODO",
        "FIXME",
        "HACK",
        "XXX",
        "pragma",
        "pylint:",
        "fmt:",
        "eslint-disable",
        "eslint-enable",
        "@ts-ignore",
        "@ts-expect-error",
        "noinspection",
    }
)

# signature patterns for common languages
# Python: (async) def/class header line ending in ':'
_PY_SIG = re.compile(
    r"^([ \t]*(?:async\s+)?(?:def|class)\s+\w+.*?:)\s*$",
    re.MULTILINE,
)
# JS/TS: function declarations and const/let/var arrow-function bindings
_JS_SIG = re.compile(
    r"^([ \t]*(?:export\s+)?(?:async\s+)?(?:function\s+\w+|(?:const|let|var)\s+\w+\s*=\s*(?:async\s+)?(?:\([^)]*\)|[^=])\s*=>)[^{]*)",
    re.MULTILINE,
)
# Rust: fn headers, optionally pub/async
_RUST_SIG = re.compile(
    r"^([ \t]*(?:pub\s+)?(?:async\s+)?fn\s+\w+[^{]*)",
    re.MULTILINE,
)
# Go: func headers, with optional method receiver
_GO_SIG = re.compile(
    r"^([ \t]*func\s+(?:\([^)]*\)\s+)?\w+[^{]*)",
    re.MULTILINE,
)

# Tried in order by _extract_signatures; all patterns run on every input.
_SIG_PATTERNS = [_PY_SIG, _JS_SIG, _RUST_SIG, _GO_SIG]
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
class CodeMinimizer(Minimizer):
    """Compress source code.

    Modes (selected via the `mode` kwarg):
      - "clean" (default): strip comments + normalize whitespace
      - "signatures": keep only function/class signatures (huge savings)

    When `aggressive=True` and no explicit mode is given, "signatures"
    becomes the default.
    """

    def _minimize(self, obj: Any, *, aggressive: bool = False, **kw: Any) -> str:
        source = obj if isinstance(obj, str) else str(obj)
        default_mode = "signatures" if aggressive else "clean"
        mode = kw.get("mode", default_mode)
        if mode == "signatures":
            return _extract_signatures(source)
        return _clean(source)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _has_pragma(comment: str) -> bool:
    """Return True when *comment* carries a directive (noqa, TODO, ...) that must survive cleanup."""
    for keyword in _PRAGMA_KEYWORDS:
        if keyword in comment:
            return True
    return False
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _strip_string_or_comment_c(m: re.Match[str]) -> str:
    """Regex callback: keep quoted strings, drop // comments unless pragma-bearing."""
    quoted = m.group(1)
    if quoted:
        # matched a string literal — leave it untouched
        return quoted
    comment = m.group(2)  # matched a // comment
    return comment if _has_pragma(comment) else ""
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def _strip_string_or_comment_py(m: re.Match[str]) -> str:
    """Regex callback: keep quoted strings, drop # comments unless pragma-bearing."""
    quoted = m.group(1)
    if quoted:
        # matched a string literal — leave it untouched
        return quoted
    comment = m.group(2)  # matched a # comment
    return comment if _has_pragma(comment) else ""
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def _strip_block_comment_if_safe(m: re.Match[str]) -> str:
    """Regex callback: drop a /* ... */ block comment unless it carries a pragma."""
    text = m.group(0)
    if _has_pragma(text):
        return text
    return ""
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def _collapse_docstring(m: re.Match[str]) -> str:
|
|
116
|
+
"""Collapse a multi-line docstring to its first line only."""
|
|
117
|
+
full = m.group(0)
|
|
118
|
+
# detect the quote style
|
|
119
|
+
quote = full[:3]
|
|
120
|
+
inner = full[3:-3].strip()
|
|
121
|
+
lines = inner.split("\n")
|
|
122
|
+
first_line = lines[0].strip() if lines else ""
|
|
123
|
+
if not first_line:
|
|
124
|
+
return ""
|
|
125
|
+
# single-line docstrings or summaries — keep as one-liner
|
|
126
|
+
return f"{quote}{first_line}{quote}"
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def _clean(code: str) -> str:
    """Strip comments and normalize whitespace — language-agnostic.

    Preserves pragma comments (noqa, type: ignore, TODO, eslint-disable, etc.)
    and collapses multi-line docstrings to first line.

    Pass order is significant: block comments, then docstrings, then the
    two string-aware inline-comment passes, then whitespace cleanup.
    """
    out = _BLOCK_COMMENT.sub(_strip_block_comment_if_safe, code)
    # strip docstrings BEFORE inline comments so triple-quotes are handled first
    out = _DOCSTRING.sub(_collapse_docstring, out)
    # use string-aware patterns to avoid stripping // or # inside string literals
    out = _STRING_OR_COMMENT_C.sub(_strip_string_or_comment_c, out)
    # NOTE(review): both comment passes run regardless of source language,
    # so a '#' that is not a comment in the input language (e.g. a C
    # preprocessor directive like #include) may be stripped here unless it
    # happens to contain a pragma keyword — confirm this trade-off is intended.
    out = _STRING_OR_COMMENT_PY.sub(_strip_string_or_comment_py, out)
    out = _TRAILING_WS.sub("", out)
    out = _BLANK_LINES.sub("\n\n", out)
    return out.strip()
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def _extract_signatures(code: str) -> str:
    """Pull out only function/class signatures — works across Python, JS, Rust, Go."""
    found = [
        match.group(1).strip()
        for pattern in _SIG_PATTERNS
        for match in pattern.finditer(code)
    ]
    if found:
        return "\n".join(found)
    # nothing recognized as a signature — fall back to cleaned full source
    return _clean(code)
|