agentslim-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentslim/__init__.py +27 -0
- agentslim/code.py +307 -0
- agentslim/compressor.py +242 -0
- agentslim/memory.py +201 -0
- agentslim/tools.py +176 -0
- agentslim/utils.py +130 -0
- agentslim-0.1.0.dist-info/METADATA +300 -0
- agentslim-0.1.0.dist-info/RECORD +10 -0
- agentslim-0.1.0.dist-info/WHEEL +4 -0
- agentslim-0.1.0.dist-info/licenses/LICENSE +21 -0
agentslim/__init__.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""
|
|
2
|
+
agentslim — Make your AI agents leaner, faster, and cheaper.
|
|
3
|
+
|
|
4
|
+
Core modules:
|
|
5
|
+
- memory: Smart context window with auto-summarization
|
|
6
|
+
- compressor: Text / HTML / JSON compressor before sending to LLM
|
|
7
|
+
- tools: Tool/function-call schema minifier
|
|
8
|
+
- code: Code-aware context extractor (send only relevant chunks)
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from agentslim.memory import AgentMemory
|
|
12
|
+
from agentslim.compressor import Compressor
|
|
13
|
+
from agentslim.tools import ToolMinifier
|
|
14
|
+
from agentslim.code import CodeContext
|
|
15
|
+
from agentslim.utils import count_tokens, estimate_cost
|
|
16
|
+
|
|
17
|
+
# Version string of this release of the package.
__version__ = "0.1.0"
# Attribution string for the package.
__author__ = "agentslim contributors"

# Names exported by ``from agentslim import *``; each one is imported above
# from its implementing submodule.
__all__ = [
    "AgentMemory",
    "Compressor",
    "ToolMinifier",
    "CodeContext",
    "count_tokens",
    "estimate_cost",
]
|
agentslim/code.py
ADDED
|
@@ -0,0 +1,307 @@
|
|
|
1
|
+
"""
|
|
2
|
+
code.py — Code-aware context extractor for coding agents.
|
|
3
|
+
|
|
4
|
+
Instead of sending an entire file (hundreds of tokens), send only the
|
|
5
|
+
relevant "chunk" — the function or class the agent needs, plus configurable
|
|
6
|
+
surrounding context lines.
|
|
7
|
+
|
|
8
|
+
Supports Python, JavaScript / TypeScript, and generic line-based extraction.
|
|
9
|
+
|
|
10
|
+
Quickstart::
|
|
11
|
+
|
|
12
|
+
from agentslim import CodeContext
|
|
13
|
+
|
|
14
|
+
ctx = CodeContext.extract_function("my_file.py", "process_payment")
|
|
15
|
+
print(ctx)
|
|
16
|
+
# → def process_payment(order_id: str) -> dict:
|
|
17
|
+
# ...
|
|
18
|
+
|
|
19
|
+
outline = CodeContext.outline("my_file.py")
|
|
20
|
+
# → ['class PaymentService (L12)', 'def process_payment (L34)', ...]
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
import ast
|
|
26
|
+
import re
|
|
27
|
+
from pathlib import Path
|
|
28
|
+
from typing import Optional, Union
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class CodeContext:
    """
    Code-aware context extractor — send only what the agent needs.

    All methods are static; no instantiation required.  Every public method
    accepts either a path to a file or a raw source-code string.
    """

    # ──────────────────────────────────────────────────────────────────────────
    # Public API
    # ──────────────────────────────────────────────────────────────────────────

    @staticmethod
    def extract_function(
        source: Union[str, Path],
        function_name: str,
        context_lines: int = 3,
    ) -> Optional[str]:
        """
        Extract a single function or method from a Python source file.

        Args:
            source: File path or raw source code string.
            function_name: Name of the function / method to find.
            context_lines: Lines of surrounding context to include.

        Returns:
            Source of the function (including its decorators) with
            surrounding context and a ``# File: …`` header, or ``None``
            if not found.  If the source cannot be parsed, a regex-based
            fallback is used and its result is returned without a header.

        Example::

            snippet = CodeContext.extract_function("app.py", "handle_request")
        """
        code = _read_source(source)
        try:
            tree = ast.parse(code)
        except SyntaxError:
            return _fallback_extract(code, function_name, context_lines)

        lines = code.splitlines()
        for node in ast.walk(tree):
            if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                continue
            if node.name != function_name:
                continue
            # ``lineno`` points at the ``def`` keyword; start at the first
            # decorator so the snippet is never cut in the middle of one.
            first_line = CodeContext._first_line(node)
            start = max(0, first_line - 1 - context_lines)
            end = min(len(lines), node.end_lineno + context_lines)  # type: ignore[attr-defined]
            snippet = "\n".join(lines[start:end])
            label = CodeContext._label(source)
            return f"# File: {label} (L{start + 1}–L{end})\n{snippet}"
        return None

    @staticmethod
    def extract_class(
        source: Union[str, Path],
        class_name: str,
        methods_only: bool = False,
    ) -> Optional[str]:
        """
        Extract a class definition from Python source.

        Args:
            source: File path or raw source code string.
            class_name: Name of the class to find.
            methods_only: If ``True``, return only method signatures (no bodies).

        Returns:
            Class source (including decorators) or skeleton, or ``None`` if
            the class is not found or the source cannot be parsed.
        """
        code = _read_source(source)
        try:
            tree = ast.parse(code)
        except SyntaxError:
            return None

        lines = code.splitlines()
        for node in ast.walk(tree):
            if not (isinstance(node, ast.ClassDef) and node.name == class_name):
                continue
            if methods_only:
                return _class_skeleton(node, lines)
            start = CodeContext._first_line(node) - 1
            end = node.end_lineno  # type: ignore[attr-defined]
            snippet = "\n".join(lines[start:end])
            label = CodeContext._label(source)
            return f"# File: {label} (L{start + 1}–L{end})\n{snippet}"
        return None

    @staticmethod
    def extract_lines(
        source: Union[str, Path],
        start_line: int,
        end_line: int,
        context_lines: int = 0,
    ) -> str:
        """
        Extract specific line range from a file (1-indexed).

        Args:
            source: File path or raw source code string.
            start_line: First line to include (1-indexed).
            end_line: Last line to include (1-indexed, inclusive).
            context_lines: Extra lines of context around the range.

        Returns:
            The requested lines prefixed with a ``# File: …`` header.  The
            range is clamped to the bounds of the source.
        """
        code = _read_source(source)
        lines = code.splitlines()
        start = max(0, start_line - 1 - context_lines)
        end = min(len(lines), end_line + context_lines)
        snippet = "\n".join(lines[start:end])
        label = CodeContext._label(source)
        return f"# File: {label} (L{start + 1}–L{end})\n{snippet}"

    @staticmethod
    def outline(source: Union[str, Path], language: str = "auto") -> list[str]:
        """
        Return a high-level outline of the file: class and function names
        with line numbers.

        Useful for giving an agent an overview without sending full source.

        Args:
            source: File path or raw source code string.
            language: ``"python"``, ``"js"``/``"ts"``, or ``"auto"`` (detect).

        Returns:
            List of strings like ``['class PaymentService (L12)', ...]``.

        Example::

            for item in CodeContext.outline("api/routes.py"):
                print(item)
        """
        code = _read_source(source)
        lang = _detect_language(source, language)

        if lang == "python":
            return _python_outline(code)
        if lang in ("js", "ts"):
            return _js_outline(code)
        return _generic_outline(code)

    @staticmethod
    def folded(source: Union[str, Path], language: str = "auto") -> str:
        """
        Return the source with function/method bodies replaced by ``...``.

        Dramatically reduces token count while preserving structure.  Only
        Python is folded; for any other language the outline is returned,
        one entry per line.

        Example::

            folded = CodeContext.folded("large_module.py")
            # class MyClass:
            #     def method_one(self): ...
            #     def method_two(self, x: int) -> str: ...
        """
        code = _read_source(source)
        lang = _detect_language(source, language)

        if lang == "python":
            return _python_folded(code)
        # For other languages, return outline as string
        return "\n".join(CodeContext.outline(source, language))

    # ──────────────────────────────────────────────────────────────────────────
    # Private helpers
    # ──────────────────────────────────────────────────────────────────────────

    @staticmethod
    def _label(source: Union[str, Path]) -> str:
        """Return a short name for the snippet header.

        When ``source`` is raw code rather than a file on disk, interpolating
        it into the header would duplicate the whole input, so a fixed
        placeholder is used instead.
        """
        try:
            is_file = Path(source).is_file()
        except (OSError, ValueError):
            # Raw source text is usually not a valid path (too long, NUL bytes).
            is_file = False
        return str(source) if is_file else "<source>"

    @staticmethod
    def _first_line(node: ast.AST) -> int:
        """Return the 1-based first line of a def/class, decorators included."""
        decorators = getattr(node, "decorator_list", [])
        return min([node.lineno] + [d.lineno for d in decorators])  # type: ignore[attr-defined]
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
# ──────────────────────────────────────────────────────────────────────────────
|
|
192
|
+
# Internal helpers
|
|
193
|
+
# ──────────────────────────────────────────────────────────────────────────────
|
|
194
|
+
|
|
195
|
+
def _read_source(source: Union[str, Path]) -> str:
    """Return the text to analyse for ``source``.

    Args:
        source: Either a path to an existing file or a raw source string.

    Returns:
        The file's contents (UTF-8, undecodable bytes replaced) when
        ``source`` names a regular file; otherwise ``str(source)`` unchanged.
    """
    try:
        path = Path(source)
        # ``is_file`` rather than ``exists``: an empty string resolves to the
        # current directory, which exists but cannot be read as text.
        if path.is_file():
            return path.read_text(encoding="utf-8", errors="replace")
    except (OSError, ValueError):
        # Raw source code is rarely a valid path (too long, embedded NULs);
        # that is expected, so fall through and treat it as code.
        pass
    return str(source)  # Treat as raw source code
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def _detect_language(source: Union[str, Path], hint: str) -> str:
    """Resolve the language key used to pick an outline/fold strategy.

    Args:
        source: File path or raw source string; only its suffix is inspected.
        hint: Explicit language name, or ``"auto"`` to detect from the suffix.
            Matching is case-insensitive and common aliases are accepted.

    Returns:
        ``"python"``, ``"js"``, ``"ts"``, ``"generic"`` for an unknown
        suffix, or the lower-cased hint when it is not a known alias.
    """
    aliases = {
        "py": "python",
        "python3": "python",
        "javascript": "js",
        "jsx": "js",
        "typescript": "ts",
        "tsx": "ts",
    }
    requested = hint.strip().lower()
    if requested != "auto":
        # Normalise so callers that check for "js"/"ts" also see long names.
        return aliases.get(requested, requested)

    by_suffix = {
        ".py": "python",
        ".pyi": "python",
        ".js": "js",
        ".jsx": "js",
        ".mjs": "js",
        ".cjs": "js",
        ".ts": "ts",
        ".tsx": "ts",
    }
    suffix = Path(str(source)).suffix.lower()
    return by_suffix.get(suffix, "generic")
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def _fallback_extract(code: str, name: str, context_lines: int) -> Optional[str]:
    """Regex-based fallback for non-parsable files.

    Finds the first ``def``/``async def`` header for ``name`` and takes every
    following line that is indented deeper than the header as its body.

    Args:
        code: Source text that failed to parse.
        name: Function name to look for.
        context_lines: Extra lines to include before and after the function.

    Returns:
        The matched lines joined with newlines, or ``None`` if no header for
        ``name`` is found.
    """
    lines = code.splitlines()
    header = re.compile(rf"\s*(?:async\s+)?def\s+{re.escape(name)}\s*\(")
    for i, line in enumerate(lines):
        if not header.match(line):
            continue
        indent = len(line) - len(line.lstrip())
        last = i  # index of the last non-blank line that belongs to the body
        for j in range(i + 1, len(lines)):
            candidate = lines[j]
            if not candidate.strip():
                continue  # blank lines never end the body
            # The line right after the header is always taken as body, so a
            # broken or multi-line signature does not end the match at once.
            if j > i + 1 and len(candidate) - len(candidate.lstrip()) <= indent:
                break
            last = j
        start = max(0, i - context_lines)
        end = min(len(lines), last + 1 + context_lines)
        return "\n".join(lines[start:end])
    return None
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def _python_outline(code: str) -> list[str]:
    """List every class and function in Python ``code``, ordered by line.

    Entries look like ``"class Name (L3)"``, ``"def name (L7)"`` or
    ``"async def name (L9)"``.  Source that does not parse is handed to the
    generic line-based outline instead.
    """
    try:
        module = ast.parse(code)
    except SyntaxError:
        return _generic_outline(code)

    entries = []
    for node in ast.walk(module):
        if isinstance(node, ast.ClassDef):
            keyword = "class"
        elif isinstance(node, ast.AsyncFunctionDef):
            keyword = "async def"
        elif isinstance(node, ast.FunctionDef):
            keyword = "def"
        else:
            continue
        entries.append((node.lineno, f"{keyword} {node.name} (L{node.lineno})"))

    # Tuples sort by line number first, then by label text.
    return [label for _, label in sorted(entries)]
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def _js_outline(code: str) -> list[str]:
    """Outline JavaScript / TypeScript source with line-based regexes.

    Recognises class declarations, ``function`` declarations and arrow
    functions assigned to ``const``/``let``/``var``, each optionally preceded
    by ``export``.  At most one entry is produced per line.

    Returns:
        Entries such as ``"class Foo (L1)"``, ``"function bar (L4)"`` or
        ``"arrow fn baz (L9)"`` in source order.
    """
    # The identifier is captured by name: with positional groups an optional
    # ``async`` group can be the last one matched and be reported as the name.
    patterns = [
        (re.compile(r"^(?:export\s+)?(?:default\s+)?class\s+(?P<name>[\w$]+)"), "class"),
        (
            re.compile(r"^(?:export\s+)?(?:default\s+)?(?:async\s+)?function\s+(?P<name>[\w$]+)"),
            "function",
        ),
        (
            re.compile(r"^(?:export\s+)?(?:const|let|var)\s+(?P<name>[\w$]+)\s*=\s*(?:async\s+)?\("),
            "arrow fn",
        ),
    ]
    items = []
    for number, line in enumerate(code.splitlines(), 1):
        text = line.strip()
        for pattern, kind in patterns:
            match = pattern.match(text)
            if match:
                items.append(f"{kind} {match.group('name')} (L{number})")
                break
    return items
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
def _generic_outline(code: str) -> list[str]:
    """Language-agnostic outline: keep lines that open with a definition keyword.

    Each hit is reported as ``"L<line>: <text>"`` with the text stripped and
    cut to 80 characters.
    """
    opener = re.compile(r"^\s*(def |class |function |const |export )")
    return [
        f"L{number}: {text.strip()[:80]}"
        for number, text in enumerate(code.splitlines(), 1)
        if opener.match(text)
    ]
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
def _class_skeleton(node: ast.ClassDef, lines: list[str]) -> str:
    """Return class header + method signatures only, bodies as '...'.

    Args:
        node: Parsed class definition.
        lines: The source the class was parsed from, split into lines.

    Returns:
        The class header followed by one indented ``<signature> ...`` line
        per method defined directly in the class body.
    """
    # Prefer the real header so base classes survive; fall back to a bare
    # header when it spans several lines or carries trailing text.
    header = lines[node.lineno - 1].strip()
    if not header.endswith(":"):
        header = f"class {node.name}:"

    parts = [header]
    for item in node.body:
        if not isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef)):
            continue
        # A signature may wrap over several lines; gather them up to the line
        # that closes it, never reading past the first body statement.
        last_sig_line = max(item.lineno, item.body[0].lineno - 1)
        pieces = []
        for raw in lines[item.lineno - 1 : last_sig_line]:
            pieces.append(raw.strip())
            if pieces[-1].endswith(":"):
                break
        parts.append(f"    {' '.join(pieces)} ...")
    return "\n".join(parts)
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
def _python_folded(code: str) -> str:
    """Replace function bodies with '...' using AST.

    Only the outermost function at each position is folded: functions nested
    inside another function disappear together with the enclosing body.
    Methods are folded individually because a class is not a function.

    Args:
        code: Python source text.

    Returns:
        The folded source joined with ``"\\n"``, or ``code`` unchanged if it
        does not parse.
    """
    try:
        tree = ast.parse(code)
    except SyntaxError:
        return code

    lines = code.splitlines()
    function_types = (ast.FunctionDef, ast.AsyncFunctionDef)

    # Collect functions without descending into them.  Folding an inner and
    # an outer function would give overlapping line ranges, and the outer
    # range would be stale once the inner one had shortened ``lines``.
    targets = []
    pending = [tree]
    while pending:
        parent = pending.pop()
        for child in ast.iter_child_nodes(parent):
            if isinstance(child, function_types):
                targets.append(child)
            else:
                pending.append(child)

    # Apply replacements from bottom to top to preserve line numbers
    targets.sort(key=lambda fn: fn.lineno, reverse=True)
    for fn in targets:
        first = fn.body[0]
        decorators = getattr(first, "decorator_list", [])
        # A decorated first statement starts at its first decorator.
        body_start = min([first.lineno] + [d.lineno for d in decorators]) - 1
        body_end = fn.end_lineno - 1  # type: ignore[attr-defined]

        sig_text = lines[fn.lineno - 1]
        indent = " " * (len(sig_text) - len(sig_text.lstrip()) + 4)

        # ``col_offset`` counts UTF-8 bytes, so slice the encoded line.
        head = ""
        if body_start == first.lineno - 1:
            encoded = lines[body_start].encode("utf-8")
            head = encoded[: first.col_offset].decode("utf-8", errors="ignore")

        if head.strip():
            # The body shares a line with the signature (``def f(): return 1``);
            # keep the signature text and fold only what follows it.
            replacement = f"{head.rstrip()} ..."
        else:
            replacement = f"{indent}..."
        lines[body_start : body_end + 1] = [replacement]

    return "\n".join(lines)
|
agentslim/compressor.py
ADDED
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
"""
|
|
2
|
+
compressor.py — Text, HTML, JSON, and Markdown compressor for LLM prompts.
|
|
3
|
+
|
|
4
|
+
Strategy:
|
|
5
|
+
1. Strip HTML tags / attributes that carry zero semantic value.
|
|
6
|
+
2. Collapse redundant whitespace and blank lines.
|
|
7
|
+
3. Remove code comments (optional — preserves intent by default).
|
|
8
|
+
4. Compact JSON / YAML to minimal representation.
|
|
9
|
+
5. Remove filler phrases from natural language ("As an AI language model…").
|
|
10
|
+
|
|
11
|
+
All steps are individually togglable so you control the trade-off between
|
|
12
|
+
compression ratio and information fidelity.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import json
|
|
18
|
+
import re
|
|
19
|
+
from dataclasses import dataclass, field
|
|
20
|
+
from typing import Optional
|
|
21
|
+
|
|
22
|
+
# ──────────────────────────────────────────────────────────────────────────────
|
|
23
|
+
# Filler phrases common in LLM outputs / web scrapes (English + Russian)
|
|
24
|
+
# ──────────────────────────────────────────────────────────────────────────────
|
|
25
|
+
# Each entry is a regex fragment.  The ``\b`` anchors keep a phrase from
# matching inside a longer word (e.g. "certainly" inside "uncertainly");
# the trailing class swallows the punctuation / space that follows it.
_FILLER_PHRASES: list[str] = [
    r"\bas an ai language model\b[,\s]*",
    r"\bi('m| am) just an ai\b[,\s]*",
    r"\bcertainly\b[,!]?\s*",
    r"\bof course\b[,!]?\s*",
    r"\bsure\b[,!]?\s*here('s| is)\b[,\s]*",
    r"\bgreat question\b[,!]?\s*",
    r"\babsolutely\b[,!]?\s*",
    r"\bкак языковая модель\b[,\s]*",
    r"\bконечно\b[,!]?\s*",
    r"\bразумеется\b[,!]?\s*",
    r"\bотличный вопрос\b[,!]?\s*",
]
# Single case-insensitive alternation of all filler phrases.
_FILLER_RE = re.compile(
    "|".join(_FILLER_PHRASES),
    flags=re.IGNORECASE,
)

# HTML tags that are purely structural/decorative — safe to remove entirely
_DECORATIVE_TAGS = {
    "script", "style", "nav", "footer", "header",
    "aside", "advertisement", "noscript", "svg",
}
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@dataclass
class CompressorConfig:
    """Fine-grained control over which compression steps are applied.

    Attributes:
        strip_html: Remove HTML tags, keeping only inner text.
        remove_decorative_html: Drop entire <script>, <style>, <nav>, etc.
            blocks before stripping.
        collapse_whitespace: Merge multiple blank lines / spaces into one.
        remove_filler_phrases: Strip common LLM filler phrases from text.
        compact_json: Re-serialize JSON with no indentation / extra spaces.
        remove_python_comments: Strip ``# …`` comment lines from Python code
            (off by default).
        remove_js_comments: Strip ``// …`` and ``/* … */`` comments from
            JS/TS code.
        max_consecutive_newlines: Maximum number of consecutive newlines kept
            after collapsing.
    """

    # HTML handling
    strip_html: bool = True
    remove_decorative_html: bool = True

    # Plain-text clean-up
    collapse_whitespace: bool = True
    remove_filler_phrases: bool = True

    # Structured data
    compact_json: bool = True

    # Source-code comment removal; both are opt-in
    remove_python_comments: bool = False
    remove_js_comments: bool = False

    # Upper bound applied when blank lines are collapsed
    max_consecutive_newlines: int = 2
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class Compressor:
    """
    Multi-format text compressor that reduces token usage before sending
    content to an LLM.

    Quickstart::

        from agentslim import Compressor

        c = Compressor()
        slim = c.compress(raw_html_page)
        print(slim)

    You can also use format-specific helpers::

        c.compress_json(big_json_string)
        c.compress_html(html_string)
        c.compress_code(python_source, language="python")
    """

    def __init__(self, config: Optional["CompressorConfig"] = None) -> None:
        """Store ``config``, or a default ``CompressorConfig`` when omitted."""
        self.config = config or CompressorConfig()

    # ──────────────────────────────────────────────────────────────────────────
    # Public API
    # ──────────────────────────────────────────────────────────────────────────

    def compress(self, text: str) -> str:
        """
        Auto-detect format and compress accordingly.

        Detects JSON, HTML, or treats input as plain text / markdown.
        """
        stripped = text.strip()
        if self._looks_like_json(stripped):
            return self.compress_json(stripped)
        if self._looks_like_html(stripped):
            return self.compress_html(stripped)
        return self.compress_text(stripped)

    def compress_text(self, text: str) -> str:
        """Compress natural-language text (plain text / Markdown)."""
        if self.config.remove_filler_phrases:
            text = _FILLER_RE.sub("", text)
        if self.config.collapse_whitespace:
            text = self._collapse_whitespace(text)
        return text.strip()

    def compress_html(self, html: str) -> str:
        """Strip HTML tags and decorative blocks, returning clean text."""
        if self.config.remove_decorative_html:
            html = self._remove_decorative_blocks(html)
        if self.config.strip_html:
            html = self._strip_html_tags(html)
        return self.compress_text(html)

    def compress_json(self, json_str: str) -> str:
        """
        Compact JSON to minimal representation (no indentation, no spaces).

        Falls back gracefully if the input is not valid JSON.
        """
        if not self.config.compact_json:
            return json_str
        try:
            obj = json.loads(json_str)
        except json.JSONDecodeError:
            # Not valid JSON — treat as plain text
            return self.compress_text(json_str)
        return json.dumps(obj, ensure_ascii=False, separators=(",", ":"))

    def compress_code(self, source: str, language: str = "python") -> str:
        """
        Compress source code by optionally removing comments.

        Indentation and spacing inside lines are left untouched, because
        they are significant in Python and inside string literals; only
        trailing whitespace and surplus blank lines are removed.

        Args:
            source: Raw source code string.
            language: ``"python"``, ``"js"``, or ``"ts"`` (TypeScript).
        """
        lang = language.lower()
        if lang == "python" and self.config.remove_python_comments:
            source = self._strip_python_comments(source)
        elif lang in ("js", "ts", "javascript", "typescript") and self.config.remove_js_comments:
            source = self._strip_js_comments(source)
        if self.config.collapse_whitespace:
            source = self._collapse_blank_lines(source)
        # Drop leading blank lines only, so the first code line keeps its
        # indentation relative to the lines that follow it.
        return re.sub(r"\A(?:[ \t]*\n)+", "", source).rstrip()

    # ──────────────────────────────────────────────────────────────────────────
    # Private helpers
    # ──────────────────────────────────────────────────────────────────────────

    @staticmethod
    def _looks_like_json(text: str) -> bool:
        """Cheap check: the text is wrapped in ``{}`` or ``[]``."""
        return (text.startswith("{") and text.endswith("}")) or (
            text.startswith("[") and text.endswith("]")
        )

    @staticmethod
    def _looks_like_html(text: str) -> bool:
        """Return True if the text contains a common HTML opening tag.

        The tag name must directly follow ``<`` and be followed by whitespace,
        ``/`` or ``>``, so comparisons such as ``1 < a`` are not mistaken
        for markup.
        """
        return bool(
            re.search(
                r"<(?:!doctype\s+html|(?:html|body|div|p|span|a|head)(?=[\s/>]))",
                text,
                re.IGNORECASE,
            )
        )

    @staticmethod
    def _remove_decorative_blocks(html: str) -> str:
        """Delete every ``<tag …>…</tag>`` block whose tag is decorative."""
        if not _DECORATIVE_TAGS:
            return html
        names = "|".join(sorted(re.escape(tag) for tag in _DECORATIVE_TAGS))
        # ``\b`` stops "nav" from matching "<navbar>"; the back-reference
        # pairs each opening tag with a closing tag of the same name.
        pattern = re.compile(
            rf"<({names})\b[^>]*>.*?</\1\s*>",
            flags=re.IGNORECASE | re.DOTALL,
        )
        return pattern.sub("", html)

    @staticmethod
    def _strip_html_tags(html: str) -> str:
        """Remove tags and decode character references, leaving plain text."""
        # Imported under another name because the parameter shadows the module.
        from html import unescape as _unescape

        # Replace block-level tags with newlines for readability
        block_tags = re.compile(
            r"<(?:br|p|div|li|tr|h[1-6]|blockquote)\b[^>]*>",
            flags=re.IGNORECASE,
        )
        html = block_tags.sub("\n", html)
        # Strip remaining tags
        html = re.sub(r"<[^>]+>", "", html)
        # Decode entities in one pass so "&amp;lt;" becomes "&lt;", not "<".
        html = _unescape(html)
        # "&nbsp;" decodes to U+00A0; normalise it to a plain space.
        return html.replace("\u00a0", " ")

    def _collapse_whitespace(self, text: str) -> str:
        """Squeeze runs of spaces/tabs and limit consecutive blank lines."""
        # Replace tabs and non-breaking spaces with regular space
        text = re.sub(r"[ \t\u00a0]+", " ", text)
        return self._collapse_blank_lines(text)

    def _collapse_blank_lines(self, text: str) -> str:
        """Strip trailing whitespace per line and cap consecutive newlines."""
        # Trailing whitespace goes first so whitespace-only lines count as
        # blank when the newline runs are collapsed.
        text = "\n".join(line.rstrip() for line in text.splitlines())
        # Collapse more than N consecutive newlines
        limit = max(0, self.config.max_consecutive_newlines)
        return re.sub(rf"\n{{{limit + 1},}}", "\n" * limit, text)

    @staticmethod
    def _strip_python_comments(source: str) -> str:
        """Remove ``#`` comments from Python source.

        Comments are located with the tokenizer, so ``#`` inside string
        literals is left alone.  If the source cannot be tokenized, a naive
        line-based rule is used instead (it may cut at ``#`` inside strings).
        """
        import io
        import tokenize

        lines = source.splitlines()
        comment_cols: dict[int, int] = {}  # 0-based line index -> comment column
        try:
            reader = io.StringIO("\n".join(lines) + "\n").readline
            for tok in tokenize.generate_tokens(reader):
                if tok.type == tokenize.COMMENT:
                    comment_cols[tok.start[0] - 1] = tok.start[1]
            tokenized = True
        except (tokenize.TokenError, SyntaxError):
            tokenized = False

        kept = []
        for index, line in enumerate(lines):
            if tokenized:
                col = comment_cols.get(index)
                if col is None:
                    kept.append(line)
                elif line[:col].strip():
                    # Inline comment — keep code, drop comment
                    kept.append(line[:col].rstrip())
                # else: full-line comment, dropped
                continue
            if line.lstrip().startswith("#"):
                continue  # Drop full-line comments
            if " #" in line:
                line = line[: line.index(" #")]
            kept.append(line)
        return "\n".join(kept)

    @staticmethod
    def _strip_js_comments(source: str) -> str:
        """Remove ``/* … */`` and ``// …`` comments (regex-based, simplified)."""
        # Remove /* … */ block comments
        source = re.sub(r"/\*.*?\*/", "", source, flags=re.DOTALL)
        # Remove // … line comments.  "//" directly after ":" is kept so URLs
        # such as "http://host" survive; other "//" inside strings are still
        # treated as comments.
        source = re.sub(r"(?<!:)//[^\n]*", "", source)
        return source
|