emergent-translator 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,448 @@
+ """AST-based code skeletonization for Python source files.
+
+ Replaces non-focal function/method bodies with ``...`` to reduce token usage
+ when feeding source code to LLMs. Uses only stdlib ``ast`` — zero extra
+ dependencies.
+
+ Example::
+
+     >>> from emergent_translator.code_skeleton import skeletonize
+     >>> print(skeletonize('''
+     ... class Foo:
+     ...     def bar(self, x: int) -> int:
+     ...         \"\"\"Return x squared.\"\"\"
+     ...         return x * x
+     ... '''))
+     <BLANKLINE>
+     class Foo:
+         def bar(self, x: int) -> int:
+             \"\"\"Return x squared.\"\"\"
+             ...
+     <BLANKLINE>
+ """
+
+ from __future__ import annotations
+
+ import ast
+ import fnmatch
+ import os
+ from dataclasses import dataclass
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional, Set, Union
+
+
+ # ---------------------------------------------------------------------------
+ # Private helpers & dataclass
+ # ---------------------------------------------------------------------------
+
+ @dataclass(frozen=True)
+ class _BodyRange:
+     """Line-range metadata for a single function/async-function definition."""
+     qual_name: str
+     decorator_start: int          # 1-based line of first decorator (or def line)
+     def_line: int                 # 1-based line of def/async def
+     body_start: int               # 1-based first line of body
+     body_end: int                 # 1-based last line of body (inclusive)
+     docstring_end: Optional[int]  # 1-based last line of docstring, or None
+     indent: int                   # column offset of body statements
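+
+ # A minimal illustration (hypothetical function, illustrative values) of what
+ # one ``_BodyRange`` captures for a short module-level definition::
+ #
+ #     def square(x):            # line 1 -> decorator_start = def_line = 1
+ #         """Return x * x."""   # line 2 -> body_start = 2, docstring_end = 2
+ #         return x * x          # line 3 -> body_end = 3, indent = 4
+ #
+ #     _BodyRange(qual_name="square", decorator_start=1, def_line=1,
+ #                body_start=2, body_end=3, docstring_end=2, indent=4)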
+
+
+ def _is_docstring(node: ast.AST) -> bool:
+     """True if *node* is a string-literal expression (i.e. a docstring)."""
+     return (
+         isinstance(node, ast.Expr)
+         and isinstance(node.value, ast.Constant)
+         and isinstance(node.value.value, str)
+     )
+
+
+ def _get_docstring_end(body: list[ast.stmt]) -> Optional[int]:
+     """Return ``end_lineno`` of docstring if the first statement is one."""
+     if body and _is_docstring(body[0]):
+         return body[0].end_lineno
+     return None
+
+
+ def _body_indent(lines: list[str], body_start_0: int) -> int:
+     """Detect indentation width (number of leading spaces) at *body_start_0* (0-based)."""
+     if 0 <= body_start_0 < len(lines):
+         line = lines[body_start_0]
+         return len(line) - len(line.lstrip())
+     return 0
+
+
+ def _collect_ranges(tree: ast.Module, lines: list[str]) -> List[_BodyRange]:
+     """Walk the AST and return a ``_BodyRange`` for every function definition."""
+     ranges: List[_BodyRange] = []
+
+     def _walk(node: ast.AST, prefix: str = "") -> None:
+         for child in ast.iter_child_nodes(node):
+             if isinstance(child, (ast.FunctionDef, ast.AsyncFunctionDef)):
+                 qual = f"{prefix}{child.name}" if not prefix else f"{prefix}.{child.name}"
+                 dec_start = (
+                     child.decorator_list[0].lineno
+                     if child.decorator_list
+                     else child.lineno
+                 )
+                 body_start = child.body[0].lineno
+                 body_end = child.body[-1].end_lineno
+                 ds_end = _get_docstring_end(child.body)
+                 indent = _body_indent(lines, body_start - 1)
+
+                 ranges.append(_BodyRange(
+                     qual_name=qual,
+                     decorator_start=dec_start,
+                     def_line=child.lineno,
+                     body_start=body_start,
+                     body_end=body_end,
+                     docstring_end=ds_end,
+                     indent=indent,
+                 ))
+                 # Recurse into the function body for nested defs
+                 _walk(child, qual)
+             elif isinstance(child, ast.ClassDef):
+                 cls_qual = f"{prefix}{child.name}" if not prefix else f"{prefix}.{child.name}"
+                 _walk(child, cls_qual)
+             else:
+                 _walk(child, prefix)
+
+     _walk(tree)
+     return ranges
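+
+ # Sketch of the qualified names produced above (illustrative input): for
+ #
+ #     class Foo:
+ #         def bar(self): ...
+ #         def baz(self):
+ #             def inner(): ...
+ #
+ # the collected ``qual_name`` values are "Foo.bar", "Foo.baz" and
+ # "Foo.baz.inner"; classes contribute a prefix but no range of their own.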
+
+
+ def _is_focal(qual_name: str, focal: Set[str]) -> bool:
+     """Check if *qual_name* matches any entry in *focal*.
+
+     Matching rules:
+     - Exact qualified name: ``"MyClass.my_method"``
+     - Simple (unqualified) name: ``"my_method"`` matches any ``*.my_method``
+     - Glob pattern: ``"MyClass.*"`` matches ``"MyClass.foo"``
+     """
+     simple_name = qual_name.rsplit(".", 1)[-1]
+     for pattern in focal:
+         if pattern == qual_name:
+             return True
+         if pattern == simple_name:
+             return True
+         if fnmatch.fnmatch(qual_name, pattern):
+             return True
+     return False
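+
+ # A few illustrative matches (hypothetical names), following the rules above:
+ #
+ #     >>> _is_focal("OrderManager.place_order", {"OrderManager.place_order"})
+ #     True
+ #     >>> _is_focal("OrderManager.place_order", {"place_order"})
+ #     True
+ #     >>> _is_focal("OrderManager.place_order", {"OrderManager.*"})
+ #     True
+ #     >>> _is_focal("OrderManager.cancel_order", {"place_order"})
+ #     False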
+
+
+ def _build_skeleton_lines(
+     lines: list[str],
+     ranges: List[_BodyRange],
+     focal: Set[str],
+     keep_docstrings: bool,
+ ) -> list[str]:
+     """Core line-surgery engine.
+
+     Returns a new list of lines with non-focal function bodies replaced by
+     ``...`` (preserving docstrings if requested).
+     """
+     if not ranges:
+         return list(lines)
+
+     # Determine which 1-based line numbers to suppress.
+     suppress: set[int] = set()
+     ellipsis_at: dict[int, int] = {}  # line_no -> indent for placing `...`
+
+     for r in ranges:
+         if _is_focal(r.qual_name, focal):
+             continue
+
+         # Single-line function (def and body on same line) — keep as-is.
+         if r.body_start == r.def_line:
+             continue
+
+         # Determine where replacement starts (after docstring if keeping them).
+         if keep_docstrings and r.docstring_end is not None:
+             replace_start = r.docstring_end + 1
+         else:
+             replace_start = r.body_start
+
+         if replace_start > r.body_end:
+             # Body is only a docstring — nothing to replace.
+             continue
+
+         for lineno in range(replace_start, r.body_end + 1):
+             suppress.add(lineno)
+
+         # Place `...` at the replacement start line.
+         ellipsis_at[replace_start] = r.indent
+
+     # Un-suppress lines that belong to focal functions (nested focal inside
+     # non-focal parent).
+     for r in ranges:
+         if _is_focal(r.qual_name, focal):
+             for lineno in range(r.decorator_start, r.body_end + 1):
+                 suppress.discard(lineno)
+
+     # Build output.
+     out: list[str] = []
+     for i, line in enumerate(lines):
+         lineno = i + 1  # 1-based
+         if lineno in ellipsis_at and lineno not in suppress:
+             # Edge case: this line is an ellipsis insertion point but wasn't
+             # suppressed (e.g. focal un-suppressed it). Just keep the line.
+             out.append(line)
+         elif lineno in ellipsis_at:
+             indent = ellipsis_at[lineno]
+             out.append(" " * indent + "...\n")
+         elif lineno not in suppress:
+             out.append(line)
+
+     return out
+
+
+ # ---------------------------------------------------------------------------
+ # Public dataclass
+ # ---------------------------------------------------------------------------
+
+ @dataclass(frozen=True)
+ class SkeletonResult:
+     """Result of skeletonizing a single file."""
+     source: str
+     original_lines: int
+     skeleton_lines: int
+     reduction_pct: float
+     functions_total: int
+     functions_skeletonized: int
+     functions_focal: int
+     original_tokens: int
+     skeleton_tokens: int
+     token_reduction_pct: float
+
+
+ # ---------------------------------------------------------------------------
+ # Free functions
+ # ---------------------------------------------------------------------------
+
+ def skeletonize(
+     source: str,
+     focal: Optional[list[str]] = None,
+     keep_docstrings: bool = True,
+ ) -> str:
+     """Return a skeletonized version of Python *source*.
+
+     Non-focal function/method bodies are replaced with ``...``. Focal
+     functions (matched by name, qualified name, or glob) keep their full
+     implementation.
+
+     Parameters
+     ----------
+     source:
+         Python source code as a string.
+     focal:
+         List of function/method names to keep. Supports qualified names
+         (``"Class.method"``), simple names (``"method"``), and globs
+         (``"Class.*"``).
+     keep_docstrings:
+         If True (default), docstrings are preserved even in skeletonized
+         functions.
+     """
+     if not source or not source.strip():
+         return source
+
+     try:
+         tree = ast.parse(source)
+     except SyntaxError:
+         return source
+
+     lines = source.splitlines(keepends=True)
+     # Ensure last line has a newline for consistent processing.
+     if lines and not lines[-1].endswith("\n"):
+         lines[-1] += "\n"
+         trailing_newline = False
+     else:
+         trailing_newline = True
+
+     focal_set: Set[str] = set(focal) if focal else set()
+     ranges = _collect_ranges(tree, lines)
+     result_lines = _build_skeleton_lines(lines, ranges, focal_set, keep_docstrings)
+     result = "".join(result_lines)
+
+     if not trailing_newline and result.endswith("\n"):
+         result = result[:-1]
+
+     return result
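+
+ # Usage sketch (illustrative source string and focal name):
+ #
+ #     >>> src = "def keep(x):\n    return x\n\ndef drop(y):\n    return y * 2\n"
+ #     >>> print(skeletonize(src, focal=["keep"]))
+ #     def keep(x):
+ #         return x
+ #     <BLANKLINE>
+ #     def drop(y):
+ #         ...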
+
+
+ def skeletonize_file(
+     path: Union[str, Path],
+     focal: Optional[list[str]] = None,
+     keep_docstrings: bool = True,
+ ) -> SkeletonResult:
+     """Read a Python file and return a :class:`SkeletonResult` with stats."""
+     from .claude_compression import estimate_tokens
+
+     path = Path(path)
+     source = path.read_text(encoding="utf-8")
+     skeleton = skeletonize(source, focal=focal, keep_docstrings=keep_docstrings)
+
+     focal_set: Set[str] = set(focal) if focal else set()
+
+     try:
+         tree = ast.parse(source)
+     except SyntaxError:
+         tree = ast.Module(body=[], type_ignores=[])
+
+     lines = source.splitlines(keepends=True)
+     ranges = _collect_ranges(tree, lines)
+     total = len(ranges)
+     focal_count = sum(1 for r in ranges if _is_focal(r.qual_name, focal_set))
+     skeletonized = total - focal_count
+
+     orig_lines = len(source.splitlines())
+     skel_lines = len(skeleton.splitlines())
+     reduction = (1 - skel_lines / max(orig_lines, 1)) * 100
+
+     orig_tokens = estimate_tokens(source)
+     skel_tokens = estimate_tokens(skeleton)
+     token_reduction = (1 - skel_tokens / max(orig_tokens, 1)) * 100
+
+     return SkeletonResult(
+         source=skeleton,
+         original_lines=orig_lines,
+         skeleton_lines=skel_lines,
+         reduction_pct=round(reduction, 1),
+         functions_total=total,
+         functions_skeletonized=skeletonized,
+         functions_focal=focal_count,
+         original_tokens=orig_tokens,
+         skeleton_tokens=skel_tokens,
+         token_reduction_pct=round(token_reduction, 1),
+     )
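+
+ # Usage sketch (hypothetical path); the returned ``SkeletonResult`` carries
+ # both the skeletonized source and line/token statistics:
+ #
+ #     result = skeletonize_file("src/order_manager.py", focal=["place_order"])
+ #     print(result.reduction_pct, result.token_reduction_pct)
+ #     print(result.source)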
+
+
+ def skeletonize_dir(
+     root: Union[str, Path],
+     focal: Optional[list[str]] = None,
+     keep_docstrings: bool = True,
+     exclude: Optional[list[str]] = None,
+ ) -> Dict[str, SkeletonResult]:
+     """Skeletonize all ``.py`` files under *root*.
+
+     Parameters
+     ----------
+     root:
+         Directory to walk.
+     focal:
+         Focal function names (applied to all files).
+     keep_docstrings:
+         Preserve docstrings in skeletonized functions.
+     exclude:
+         Glob patterns for paths to skip (matched against the path relative
+         to *root*).
+     """
+     root = Path(root)
+     exclude = exclude or []
+     results: Dict[str, SkeletonResult] = {}
+
+     for dirpath, _dirnames, filenames in os.walk(root):
+         for fname in sorted(filenames):
+             if not fname.endswith(".py"):
+                 continue
+             full = Path(dirpath) / fname
+             rel = str(full.relative_to(root))
+
+             if any(fnmatch.fnmatch(rel, pat) for pat in exclude):
+                 continue
+
+             try:
+                 results[rel] = skeletonize_file(full, focal=focal, keep_docstrings=keep_docstrings)
+             except Exception:
+                 # Skip files that can't be read/parsed.
+                 continue
+
+     return results
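+
+ # Usage sketch (hypothetical directory layout and exclude patterns):
+ #
+ #     results = skeletonize_dir(
+ #         "src/",
+ #         focal=["OrderManager.place_order"],
+ #         exclude=["tests/*", "*_generated.py"],
+ #     )
+ #     for rel_path, res in results.items():
+ #         print(rel_path, res.reduction_pct)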
+
+
+ # ---------------------------------------------------------------------------
+ # Class wrapper
+ # ---------------------------------------------------------------------------
+
+ class CodeSkeleton:
+     """Stateful wrapper around the skeletonization free functions.
+
+     Example::
+
+         >>> skel = CodeSkeleton(focal=["place_order"])
+         >>> result = skel.skeletonize_file("order_manager.py")
+     """
+
+     def __init__(
+         self,
+         focal: Optional[list[str]] = None,
+         keep_docstrings: bool = True,
+         exclude: Optional[list[str]] = None,
+     ):
+         self._focal: Set[str] = set(focal) if focal else set()
+         self._keep_docstrings = keep_docstrings
+         self._exclude = list(exclude) if exclude else []
+
+     # -- focal management ---------------------------------------------------
+
+     @property
+     def focal(self) -> Set[str]:
+         """Current set of focal function names."""
+         return set(self._focal)
+
+     def add_focal(self, *names: str) -> None:
+         """Add names to the focal set."""
+         self._focal.update(names)
+
+     def remove_focal(self, *names: str) -> None:
+         """Remove names from the focal set."""
+         self._focal -= set(names)
+
+     # -- delegation ---------------------------------------------------------
+
+     def skeletonize(self, source: str) -> str:
+         """Skeletonize Python *source* using current settings."""
+         return skeletonize(
+             source,
+             focal=list(self._focal),
+             keep_docstrings=self._keep_docstrings,
+         )
+
+     def skeletonize_file(self, path: Union[str, Path]) -> SkeletonResult:
+         """Skeletonize a file and return a :class:`SkeletonResult`."""
+         return skeletonize_file(
+             path,
+             focal=list(self._focal),
+             keep_docstrings=self._keep_docstrings,
+         )
+
+     def skeletonize_dir(self, root: Union[str, Path]) -> Dict[str, SkeletonResult]:
+         """Skeletonize all ``.py`` files under *root*."""
+         return skeletonize_dir(
+             root,
+             focal=list(self._focal),
+             keep_docstrings=self._keep_docstrings,
+             exclude=self._exclude,
+         )
+
+     @staticmethod
+     def summary(results: Dict[str, SkeletonResult]) -> Dict[str, Any]:
+         """Aggregate stats across multiple :class:`SkeletonResult` values."""
+         total_orig_lines = sum(r.original_lines for r in results.values())
+         total_skel_lines = sum(r.skeleton_lines for r in results.values())
+         total_orig_tokens = sum(r.original_tokens for r in results.values())
+         total_skel_tokens = sum(r.skeleton_tokens for r in results.values())
+         total_funcs = sum(r.functions_total for r in results.values())
+         total_skel = sum(r.functions_skeletonized for r in results.values())
+         total_focal = sum(r.functions_focal for r in results.values())
+
+         return {
+             "files": len(results),
+             "original_lines": total_orig_lines,
+             "skeleton_lines": total_skel_lines,
+             "reduction_pct": round((1 - total_skel_lines / max(total_orig_lines, 1)) * 100, 1),
+             "functions_total": total_funcs,
+             "functions_skeletonized": total_skel,
+             "functions_focal": total_focal,
+             "original_tokens": total_orig_tokens,
+             "skeleton_tokens": total_skel_tokens,
+             "token_reduction_pct": round((1 - total_skel_tokens / max(total_orig_tokens, 1)) * 100, 1),
+         }
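+
+ # Usage sketch (hypothetical directory): aggregate stats across a whole tree.
+ #
+ #     skel = CodeSkeleton(focal=["place_order"], exclude=["tests/*"])
+ #     results = skel.skeletonize_dir("src/")
+ #     stats = CodeSkeleton.summary(results)
+ #     # stats["files"], stats["reduction_pct"], stats["token_reduction_pct"], ...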