python-token-killer 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ptk/__init__.py ADDED
@@ -0,0 +1,166 @@
1
+ """ptk — Python Token Killer.
2
+
3
+ Minimize LLM tokens from Python objects in one call.
4
+
5
+ import ptk
6
+ ptk.minimize({"users": [{"name": "Alice", "age": 30}]})
7
+ ptk(some_dict) # shorthand
8
+
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from typing import Any
14
+
15
+ from ptk._base import MinResult, _serialize
16
+ from ptk._types import ContentType, detect
17
+ from ptk.minimizers import (
18
+ CodeMinimizer,
19
+ DictMinimizer,
20
+ DiffMinimizer,
21
+ ListMinimizer,
22
+ LogMinimizer,
23
+ TextMinimizer,
24
+ )
25
+
26
+ __version__ = "0.1.0"
27
+ __all__ = [
28
+ "minimize",
29
+ "stats",
30
+ "detect_type",
31
+ "MinResult",
32
+ "ContentType",
33
+ # minimizer classes (for direct use / subclassing)
34
+ "DictMinimizer",
35
+ "ListMinimizer",
36
+ "CodeMinimizer",
37
+ "LogMinimizer",
38
+ "DiffMinimizer",
39
+ "TextMinimizer",
40
+ ]
41
+
42
# ── singleton minimizer instances (created once, reused) ────────────────

# Dispatch table: one stateless minimizer instance per detected content
# type, built once at import time so `minimize()`/`stats()` pay no
# per-call construction cost.
_ROUTER: dict[ContentType, Any] = {
    ContentType.DICT: DictMinimizer(),
    ContentType.LIST: ListMinimizer(),
    ContentType.CODE: CodeMinimizer(),
    ContentType.LOG: LogMinimizer(),
    ContentType.DIFF: DiffMinimizer(),
    ContentType.TEXT: TextMinimizer(),
}
52
+
53
+
54
+ # ── public API ──────────────────────────────────────────────────────────
55
+
56
+
57
def minimize(
    obj: Any,
    *,
    aggressive: bool = False,
    content_type: ContentType | str | None = None,
    **kw: Any,
) -> str:
    """Compress any Python object into a token-lean string.

    Args:
        obj: dict, list, str (code/log/diff/text), or anything with __str__.
        aggressive: Apply maximum compression (may lose some fidelity).
        content_type: Force a content type instead of auto-detecting.
            Accepts a ContentType enum or a string ("dict", "code", "log", ...).
        **kw: Forwarded verbatim to the minimizer (format, mode, errors_only, ...).

    Returns:
        The minimized string representation.
    """
    minimizer = _ROUTER[_resolve_type(obj, content_type)]
    return minimizer.run(obj, aggressive=aggressive, **kw).output
79
+
80
+
81
def stats(
    obj: Any,
    *,
    aggressive: bool = False,
    content_type: ContentType | str | None = None,
    **kw: Any,
) -> dict[str, Any]:
    """Minimize *obj* and report compression metrics alongside the output.

    Returns:
        Dict with keys:
            "output"           — the minimized string
            "original_len"     — character count before minimization
            "minimized_len"    — character count after minimization
            "savings_pct"      — e.g. 73.2
            "content_type"     — lowercase name, e.g. "dict"
            "original_tokens"  — int | None (exact if tiktoken available)
            "minimized_tokens" — int | None
    """
    ct = _resolve_type(obj, content_type)
    res = _ROUTER[ct].run(obj, aggressive=aggressive, **kw)

    orig_tok, min_tok = _estimate_tokens(_serialize(obj), res.output)

    return {
        "output": res.output,
        "original_len": res.original_len,
        "minimized_len": res.minimized_len,
        "savings_pct": res.savings_pct,
        "content_type": ct.name.lower(),
        "original_tokens": orig_tok,
        "minimized_tokens": min_tok,
    }
116
+
117
+
118
def detect_type(obj: Any) -> str:
    """Auto-detect *obj*'s content type and return it as a lowercase string."""
    ct = detect(obj)
    return ct.name.lower()
121
+
122
+
123
# ── callable module trick ───────────────────────────────────────────────
# Allows `import ptk; ptk(obj)` as shorthand for `ptk.minimize(obj)`.

import sys as _sys  # noqa: E402
import types as _types  # noqa: E402


class _CallableModule(_types.ModuleType):
    """Module that's also callable — `ptk(obj)` works."""

    def __call__(self, obj: Any, **kw: Any) -> str:
        # Delegate straight to the public API so `ptk(obj)` and
        # `ptk.minimize(obj)` are interchangeable.
        return minimize(obj, **kw)

    def __repr__(self) -> str:
        return f"<module 'ptk' v{__version__}>"


# Swap this module's class so `ptk(...)` works.
# CPython permits reassigning a module object's __class__ to a ModuleType
# subclass; attribute access is unaffected, but the instance gains the
# __call__/__repr__ defined above.
_self = _sys.modules[__name__]
_self.__class__ = _CallableModule
143
+
144
+
145
+ # ── private helpers ─────────────────────────────────────────────────────
146
+
147
+
148
def _resolve_type(obj: Any, hint: ContentType | str | None) -> ContentType:
    """Turn an optional user hint into a ContentType, auto-detecting when absent.

    A string hint is looked up case-insensitively by enum member name
    (raises KeyError for unknown names, matching ContentType.__getitem__).
    """
    if isinstance(hint, ContentType):
        return hint
    if hint is None:
        return detect(obj)
    return ContentType[hint.upper()]
155
+
156
+
157
+ def _estimate_tokens(original: str, minimized: str) -> tuple[int | None, int | None]:
158
+ """Try tiktoken for accurate counts, fall back to len//4 heuristic."""
159
+ try:
160
+ import tiktoken
161
+
162
+ enc = tiktoken.get_encoding("cl100k_base")
163
+ return len(enc.encode(original)), len(enc.encode(minimized))
164
+ except ImportError:
165
+ # fast heuristic: ~4 chars per token for English
166
+ return len(original) // 4, len(minimized) // 4
ptk/_base.py ADDED
@@ -0,0 +1,137 @@
1
+ """Base minimizer protocol + shared utilities."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from abc import ABC, abstractmethod
6
+ from dataclasses import dataclass
7
+ from typing import Any
8
+
9
+
10
+ @dataclass(frozen=True, slots=True)
11
+ class MinResult:
12
+ """Immutable result from a minimizer pass."""
13
+
14
+ output: str
15
+ original_len: int
16
+ minimized_len: int
17
+
18
+ @property
19
+ def savings_pct(self) -> float:
20
+ if self.original_len == 0:
21
+ return 0.0
22
+ return round((1 - self.minimized_len / self.original_len) * 100, 1)
23
+
24
+
25
class Minimizer(ABC):
    """Common interface shared by every minimizer strategy.

    Subclasses implement only `_minimize`; the public `run()` measures the
    input, invokes the strategy, and wraps everything in a MinResult.
    """

    @abstractmethod
    def _minimize(self, obj: Any, *, aggressive: bool = False, **kw: Any) -> str:
        """Return the minimized string representation."""

    def run(self, obj: Any, *, aggressive: bool = False, **kw: Any) -> MinResult:
        before = _serialize(obj)
        try:
            after = self._minimize(obj, aggressive=aggressive, **kw)
        except (RecursionError, ValueError, TypeError, OverflowError):
            # Graceful degradation: a strategy that chokes on its input
            # must not crash the caller — fall back to plain str().
            after = str(obj)
        return MinResult(
            output=after,
            original_len=len(before),
            minimized_len=len(after),
        )
48
+
49
+
50
+ # ── shared helpers (used across minimizers) ─────────────────────────────
51
+
52
+
53
+ def _serialize(obj: Any) -> str:
54
+ """Cheaply serialize an object to string for length measurement.
55
+
56
+ Must NEVER raise — this is used for metrics, not output.
57
+ Handles circular refs, non-serializable types, tuple keys, etc.
58
+ """
59
+ if isinstance(obj, str):
60
+ return obj
61
+ if isinstance(obj, (dict, list, tuple)):
62
+ import json
63
+
64
+ try:
65
+ return json.dumps(obj, separators=(",", ":"), default=str)
66
+ except (ValueError, TypeError, OverflowError):
67
+ # circular reference, non-string keys, or other json failure
68
+ return str(obj)
69
+ return str(obj)
70
+
71
+
72
+ def _is_nullish(v: object) -> bool:
73
+ """Check if a value is 'empty' — None, "", [], or {}.
74
+
75
+ Type-checks first to avoid hashing unhashable types.
76
+ """
77
+ if v is None:
78
+ return True
79
+ if isinstance(v, str):
80
+ return v == ""
81
+ if isinstance(v, list):
82
+ return len(v) == 0
83
+ if isinstance(v, dict):
84
+ return len(v) == 0
85
+ return False
86
+
87
+
88
def strip_nullish(d: dict[str, Any]) -> dict[str, Any]:
    """Return a copy of *d* without None, "", [], {} values.

    Recurses into nested dicts and into dicts held inside lists; keys whose
    values become empty after cleaning are dropped as well.
    """
    result: dict[str, Any] = {}
    for key, value in d.items():
        if _is_nullish(value):
            continue
        if isinstance(value, dict):
            nested = strip_nullish(value)
            if nested:
                result[key] = nested
        elif isinstance(value, list):
            kept: list[Any] = []
            for item in value:
                if _is_nullish(item):
                    continue
                kept.append(strip_nullish(item) if isinstance(item, dict) else item)
            if kept:
                result[key] = kept
        else:
            result[key] = value
    return result
107
+
108
+
109
def dedup_lines(text: str, *, threshold: int = 2) -> str:
    """Collapse runs of identical consecutive lines into `<line> (xN)`.

    Runs shorter than *threshold* are replayed verbatim. Single pass,
    O(n) time with O(1) state per run.
    """
    lines = text.split("\n")
    if len(lines) <= 1:
        return text

    out: list[str] = []

    def emit(line: str, count: int) -> None:
        # Annotate long runs; replay short ones unchanged.
        if count >= threshold:
            out.append(f"{line} (x{count})")
        else:
            out.extend([line] * count)

    current = lines[0]
    run = 1
    for line in lines[1:]:
        if line == current:
            run += 1
        else:
            emit(current, run)
            current = line
            run = 1
    emit(current, run)
    return "\n".join(out)
131
+
132
+
133
+ def _flush(result: list[str], line: str, count: int, threshold: int) -> None:
134
+ if count >= threshold:
135
+ result.append(f"{line} (x{count})")
136
+ else:
137
+ result.extend([line] * count)
ptk/_types.py ADDED
@@ -0,0 +1,126 @@
1
+ """Content type detection — pure builtins, no deps."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from enum import Enum, auto
6
+
7
+
8
class ContentType(Enum):
    """Closed set of content categories the minimizer router dispatches on."""

    DICT = auto()  # mapping objects
    LIST = auto()  # list / tuple sequences
    CODE = auto()  # source code (any language)
    LOG = auto()   # log / test-runner output
    DIFF = auto()  # unified diff text
    TEXT = auto()  # fallback: plain text and everything else
15
+
16
+
17
# ── fast heuristics (order matters — first match wins) ──────────────────

# Line-start keywords that suggest source code. Covers Python, JS/TS,
# Java/C#, Go, Rust, shell shebangs. The trailing space on most entries
# avoids matching identifiers that merely start with a keyword.
_CODE_MARKERS = frozenset(
    {
        "def ",
        "class ",
        "import ",
        "from ",
        "function ",
        "const ",
        "let ",
        "var ",
        "public ",
        "private ",
        "async ",
        "await ",
        "return ",
        "if __name__",
        "#!/",
        "package ",
        "func ",
        "fn ",
        "impl ",
        "module ",
        "export ",
        "interface ",
        "struct ",
    }
)

# Substrings (checked anywhere in the head, not line-anchored) that
# identify log or test-runner output. Checked BEFORE code markers because
# runner output often embeds code-like lines.
_LOG_PATTERNS = frozenset(
    {
        # structured log levels
        "[INFO]",
        "[WARN]",
        "[ERROR]",
        "[DEBUG]",
        "[TRACE]",
        " INFO ",
        " WARN ",
        " ERROR ",
        " DEBUG ",
        " TRACE ",
        "INFO:",
        "WARN:",
        "ERROR:",
        "DEBUG:",
        "TRACE:",
        "WARNING:",
        "CRITICAL:",
        # test runner output (pytest, cargo test, go test, jest)
        "PASSED",
        "FAILED",
        "ERRORS",
        "--- PASS:",
        "--- FAIL:",  # go test
        "test result: ",  # cargo test
        " passed",
        " failed",  # pytest summary
        "✓",
        "✗",
        "✕",  # jest / vitest
    }
)
81
+
82
+
83
+ def _looks_like_diff(head: str) -> bool:
84
+ """Detect unified diff format. Requires @@ hunk header + file headers or diff --git."""
85
+ if "@@" not in head:
86
+ return False
87
+ # strong signal: diff --git header
88
+ if head.startswith("diff --git") or "\ndiff --git" in head:
89
+ return True
90
+ # also accept: --- line followed by +++ line (unified diff without git header)
91
+ has_minus = head.startswith("--- ") or "\n--- " in head
92
+ has_plus = "\n+++ " in head
93
+ return has_minus and has_plus
94
+
95
+
96
def detect(obj: object) -> ContentType:
    """Classify a Python object into a ContentType. O(1) for non-str inputs."""
    if isinstance(obj, dict):
        return ContentType.DICT
    if isinstance(obj, (list, tuple)):
        return ContentType.LIST
    if not isinstance(obj, str):
        # anything else will be stringified and treated as plain text
        return ContentType.TEXT

    # string heuristics only inspect a ~2KB prefix for speed
    head = obj[:2048]

    # diff first — requires real unified-diff structure, not just --- or @@
    if _looks_like_diff(head):
        return ContentType.DIFF

    # logs BEFORE code: runner output (e.g. pytest) often contains
    # code-like lines such as 'def test_foo():', but PASSED/FAILED-style
    # markers identify it as a log first.
    for marker in _LOG_PATTERNS:
        if marker in head:
            return ContentType.LOG

    # code: a code keyword at the start of one of the first ~30 lines
    for line in head.split("\n", 30):
        bare = line.lstrip()
        if any(bare.startswith(marker) for marker in _CODE_MARKERS):
            return ContentType.CODE

    return ContentType.TEXT
@@ -0,0 +1,17 @@
1
+ """Minimizer registry — auto-imports all strategies."""
2
+
3
+ from ptk.minimizers._code import CodeMinimizer
4
+ from ptk.minimizers._dict import DictMinimizer
5
+ from ptk.minimizers._diff import DiffMinimizer
6
+ from ptk.minimizers._list import ListMinimizer
7
+ from ptk.minimizers._log import LogMinimizer
8
+ from ptk.minimizers._text import TextMinimizer
9
+
10
+ __all__ = [
11
+ "DictMinimizer",
12
+ "ListMinimizer",
13
+ "CodeMinimizer",
14
+ "LogMinimizer",
15
+ "DiffMinimizer",
16
+ "TextMinimizer",
17
+ ]
@@ -0,0 +1,156 @@
1
"""Code minimizer — comment stripping, whitespace normalization, signature extraction."""

from __future__ import annotations

import re
from typing import Any

from ptk._base import Minimizer

# ── precompiled regexes (compiled once at import time) ──────────────────

# C-style /* ... */ block comments, non-greedy across lines.
_BLOCK_COMMENT = re.compile(r"/\*.*?\*/", re.DOTALL)
# Match strings first (to skip), then comments. Group 1 = string (keep), Group 2 = comment (strip).
_STRING_OR_COMMENT_C = re.compile(
    r"""("(?:[^"\\]|\\.)*"|'(?:[^'\\]|\\.)*')"""  # group 1: quoted string
    r"|"  # OR
    r"(//.*$)",  # group 2: C-style inline comment
    re.MULTILINE,
)
_STRING_OR_COMMENT_PY = re.compile(
    r"""("(?:[^"\\]|\\.)*"|'(?:[^'\\]|\\.)*')"""  # group 1: quoted string
    r"|"  # OR
    r"(#.*$)",  # group 2: Python inline comment
    re.MULTILINE,
)
# NOTE(review): matches ANY triple-quoted literal, not only docstrings —
# triple-quoted data strings assigned to variables would be collapsed
# too. Confirm this is acceptable for a lossy minimizer.
_DOCSTRING = re.compile(r"(\"\"\"[\s\S]*?\"\"\"|\'\'\'[\s\S]*?\'\'\')")
# 3+ consecutive newlines == 2+ blank lines; squeezed to a single blank line.
_BLANK_LINES = re.compile(r"\n{3,}")
_TRAILING_WS = re.compile(r"[ \t]+$", re.MULTILINE)

# pragma/directive comments that must be preserved
_PRAGMA_KEYWORDS: frozenset[str] = frozenset(
    {
        "noqa",
        "type: ignore",
        "type:ignore",
        "TODO",
        "FIXME",
        "HACK",
        "XXX",
        "pragma",
        "pylint:",
        "fmt:",
        "eslint-disable",
        "eslint-enable",
        "@ts-ignore",
        "@ts-expect-error",
        "noinspection",
    }
)

# signature patterns for common languages; group 1 captures the signature text
_PY_SIG = re.compile(
    r"^([ \t]*(?:async\s+)?(?:def|class)\s+\w+.*?:)\s*$",
    re.MULTILINE,
)
_JS_SIG = re.compile(
    r"^([ \t]*(?:export\s+)?(?:async\s+)?(?:function\s+\w+|(?:const|let|var)\s+\w+\s*=\s*(?:async\s+)?(?:\([^)]*\)|[^=])\s*=>)[^{]*)",
    re.MULTILINE,
)
_RUST_SIG = re.compile(
    r"^([ \t]*(?:pub\s+)?(?:async\s+)?fn\s+\w+[^{]*)",
    re.MULTILINE,
)
_GO_SIG = re.compile(
    r"^([ \t]*func\s+(?:\([^)]*\)\s+)?\w+[^{]*)",
    re.MULTILINE,
)

# NOTE(review): signatures are collected per-pattern, so in a mixed-language
# input matches appear grouped by language rather than in source order.
_SIG_PATTERNS = [_PY_SIG, _JS_SIG, _RUST_SIG, _GO_SIG]
70
+
71
+
72
class CodeMinimizer(Minimizer):
    """Shrink source code by removing comments/whitespace or reducing to signatures.

    Modes (via `mode` kwarg):
      - "clean" (default)  — strip comments + normalize whitespace
      - "signatures"       — keep only function/class signatures (huge savings)
    """

    def _minimize(self, obj: Any, *, aggressive: bool = False, **kw: Any) -> str:
        text = obj if isinstance(obj, str) else str(obj)
        # aggressive flips the default mode, but an explicit kwarg wins
        default_mode = "signatures" if aggressive else "clean"
        if kw.get("mode", default_mode) == "signatures":
            return _extract_signatures(text)
        return _clean(text)
87
+
88
+
89
def _has_pragma(comment: str) -> bool:
    """True when *comment* carries a directive (noqa, TODO, ...) that must survive."""
    for keyword in _PRAGMA_KEYWORDS:
        if keyword in comment:
            return True
    return False
92
+
93
+
94
def _strip_string_or_comment_c(m: re.Match[str]) -> str:
    """Regex callback: pass quoted strings through, drop `//` comments sans pragma."""
    quoted = m.group(1)
    if quoted:
        # a string literal — must be preserved verbatim
        return quoted
    comment = m.group(2)
    return comment if _has_pragma(comment) else ""
100
+
101
+
102
def _strip_string_or_comment_py(m: re.Match[str]) -> str:
    """Regex callback: pass quoted strings through, drop `#` comments sans pragma."""
    quoted = m.group(1)
    if quoted:
        # a string literal — must be preserved verbatim
        return quoted
    comment = m.group(2)
    return comment if _has_pragma(comment) else ""
108
+
109
+
110
def _strip_block_comment_if_safe(m: re.Match[str]) -> str:
    """Drop a /* ... */ block comment, keeping it only when it holds a pragma."""
    text = m.group(0)
    if _has_pragma(text):
        return text
    return ""
113
+
114
+
115
+ def _collapse_docstring(m: re.Match[str]) -> str:
116
+ """Collapse a multi-line docstring to its first line only."""
117
+ full = m.group(0)
118
+ # detect the quote style
119
+ quote = full[:3]
120
+ inner = full[3:-3].strip()
121
+ lines = inner.split("\n")
122
+ first_line = lines[0].strip() if lines else ""
123
+ if not first_line:
124
+ return ""
125
+ # single-line docstrings or summaries — keep as one-liner
126
+ return f"{quote}{first_line}{quote}"
127
+
128
+
129
def _clean(code: str) -> str:
    """Strip comments and normalize whitespace — language-agnostic.

    Preserves pragma comments (noqa, type: ignore, TODO, eslint-disable, etc.)
    and collapses multi-line docstrings to their first line.

    NOTE(review): both comment styles are applied to every input regardless
    of language, so Python floor division (`a // b`) can be mis-read as a
    C comment and C/C++ preprocessor lines (`#include`, `#define`) or
    shebangs as Python comments — confirm this trade-off is acceptable
    for a lossy minimizer.
    """
    # block comments first, so their contents can't confuse later passes
    out = _BLOCK_COMMENT.sub(_strip_block_comment_if_safe, code)
    # strip docstrings BEFORE inline comments so triple-quotes are handled first
    out = _DOCSTRING.sub(_collapse_docstring, out)
    # use string-aware patterns to avoid stripping // or # inside string literals
    out = _STRING_OR_COMMENT_C.sub(_strip_string_or_comment_c, out)
    out = _STRING_OR_COMMENT_PY.sub(_strip_string_or_comment_py, out)
    out = _TRAILING_WS.sub("", out)  # drop trailing spaces/tabs left behind
    out = _BLANK_LINES.sub("\n\n", out)  # squeeze runs of blank lines to one
    return out.strip()
144
+
145
+
146
def _extract_signatures(code: str) -> str:
    """Reduce *code* to its function/class signatures (Python, JS, Rust, Go).

    Falls back to `_clean(code)` when no signature pattern matches, so the
    caller always gets something useful back.
    """
    found: list[str] = []
    for pattern in _SIG_PATTERNS:
        for m in pattern.finditer(code):
            found.append(m.group(1).strip())
    return "\n".join(found) if found else _clean(code)