py-adtools 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
adtools/py_code.py ADDED
@@ -0,0 +1,577 @@
1
+ """
2
+ Copyright (c) 2025 Rui Zhang <rzhang.cs@gmail.com>
3
+
4
+ NOTICE: This code is under MIT license. This code is intended for academic/research purposes only.
5
+ Commercial use of this software or its derivatives requires prior written permission.
6
+ """
7
+
8
+ import ast
9
+ import dataclasses
10
+ import textwrap
11
+ import tokenize
12
+ from typing import List, Optional, Union, Set, Any
13
+ from io import BytesIO
14
+
15
+ __all__ = ["PyCodeBlock", "PyFunction", "PyClass", "PyProgram"]
16
+
17
+
18
@dataclasses.dataclass
class PyCodeBlock:
    """Raw Python source that is neither a parsed function nor a parsed class.

    Examples are top-level script code, or miscellaneous statements found
    inside a class body.
    """

    # The source text of the block, stored verbatim.
    code: str

    def __str__(self) -> str:
        return self.code

    def __repr__(self) -> str:
        # Same text as ``__str__`` followed by a single newline.
        text = self.__str__()
        return text + "\n"

    def _to_str(self, indent_str=""):
        """Return the code re-indented by ``indent_str``.

        The work is delegated to `_indent_code_skip_multi_line_str`, which
        leaves lines inside multi-line string literals untouched.
        """
        indented = _indent_code_skip_multi_line_str(self.code, indent_str)
        return indented
34
+
35
+
36
+ # Adapted from: https://github.com/google-deepmind/funsearch/blob/main/implementation/code_manipulation.py
37
@dataclasses.dataclass
class PyFunction:
    """A parsed Python function.

    The function is stored as separate text fragments (decorator, name,
    argument list, return annotation, docstring, body) so that each part can
    be edited on its own and the function re-rendered with ``str()``.
    """

    # Decorator line(s) including the leading ``@``; None or "" if undecorated.
    decorator: Optional[str]
    # Function name.
    name: str
    # Argument list as source text (what goes between the parentheses).
    args: str
    # Function body; expected to be already indented relative to the ``def``.
    body: str
    # Return annotation as source text, without the ``->``.
    return_type: Optional[str] = None
    # Docstring content without the surrounding triple quotes.
    docstring: Optional[str] = None
    # True for ``async def`` functions.
    is_async: bool = False

    def _header(self) -> str:
        """Return the decorator (if any) and the ``def`` line, no trailing newline."""
        decorator = f"{self.decorator}\n" if self.decorator else ""
        keyword = "async def" if self.is_async else "def"
        return_type = f" -> {self.return_type}" if self.return_type else ""
        return f"{decorator}{keyword} {self.name}({self.args}){return_type}:"

    def _docstring_block(self) -> str:
        """Return the docstring as a triple-quoted literal indented by 4 spaces.

        Generation assumes the standard 4-space indentation. A multi-line
        docstring gets its closing quotes on a separate, equally indented line.
        """
        is_multi_line_doc = len(self.docstring.splitlines()) > 1
        docstring_end = "\n    " if is_multi_line_doc else ""
        return f'    """{self.docstring}{docstring_end}"""'

    def _to_str(self, indent_str=""):
        """Render the function with every line shifted right by ``indent_str``.

        Used when the function is emitted as a method inside a class. Body
        lines that belong to multi-line string literals are not shifted (see
        `_indent_code_skip_multi_line_str`).
        """
        # Indent function signature
        function_def = textwrap.indent(self._header(), indent_str) + "\n"

        if self.docstring:
            # Indent doc-string
            function_def += textwrap.indent(self._docstring_block(), indent_str)
            if self.body:
                function_def += "\n"

        # Indent function body
        function_def += _indent_code_skip_multi_line_str(self.body, indent_str)
        return function_def

    def __str__(self) -> str:
        function_def = self._header() + "\n"

        if self.docstring:
            function_def += self._docstring_block()
            if self.body:
                function_def += "\n"

        # The body is expected to be already indented (if parsed correctly),
        # so it is appended verbatim.
        function_def += self.body
        return function_def

    def __repr__(self) -> str:
        return self.__str__() + "\n\n"

    def __setattr__(self, name: str, value: Any) -> None:
        # Ensure there aren't leading & trailing new lines in `body`
        if name == "body" and isinstance(value, str):
            value = value.strip("\n")
        # Ensure there aren't leading & trailing quotes in `docstring`
        if name == "docstring" and value is not None:
            if '"""' in value:
                value = value.strip()
                value = value.replace('"""', "")
        super().__setattr__(name, value)

    @classmethod
    def extract_first_function_from_text(cls, text: str) -> "PyFunction":
        """Parse ``text`` and return the first top-level function found.

        Raises:
            ValueError: If ``text`` contains no top-level function.
            SyntaxError: If ``text`` is not valid Python (from ``ast.parse``).
        """
        functions = cls.extract_all_functions_from_text(text)
        if not functions:
            raise ValueError("No functions found in the provided text.")
        return functions[0]

    @classmethod
    def extract_all_functions_from_text(cls, text: str) -> List["PyFunction"]:
        """Parse ``text`` and return all top-level functions found.

        Raises:
            SyntaxError: If ``text`` is not valid Python (from ``ast.parse``).
        """
        # Parse first so that invalid input surfaces as SyntaxError.
        tree = ast.parse(text)
        visitor = _ProgramVisitor(text)
        visitor.visit(tree)
        return visitor.return_program().functions
127
+
128
+
129
@dataclasses.dataclass
class PyClass:
    """A parsed Python class.

    ``body`` holds the class contents in source order; ``functions`` and
    ``statements`` hold the methods and the remaining code blocks as separate
    lists. Only ``body`` is used when rendering the class with ``str()``.
    """

    # Decorator line(s) including the leading ``@``; None or "" if undecorated.
    decorator: Optional[str]
    # Class name.
    name: str
    # Comma-separated base classes as source text; None or "" if there are none.
    bases: Optional[str]

    # Holds raw code blocks (variables, assignments) found in the class body.
    statements: Optional[List["PyCodeBlock"]] = None
    # Docstring content without the surrounding triple quotes.
    docstring: Optional[str] = None
    # Methods of the class.
    functions: List["PyFunction"] = dataclasses.field(default_factory=list)
    # Holds everything in order (Methods + Statements + Gaps).
    body: Optional[List[Union["PyCodeBlock", "PyFunction"]]] = None

    def __str__(self) -> str:
        parts = [f"{self.decorator}\n" if self.decorator else ""]
        parts.append(f"class {self.name}")
        if self.bases:
            parts.append(f"({self.bases})")
        parts.append(":\n")

        if self.docstring:
            # If the docstring has multiple lines, the closing quotes go on
            # their own line, indented like the rest of the class body.
            is_multi_line_doc = len(self.docstring.splitlines()) > 1
            docstring_end = "\n    " if is_multi_line_doc else ""
            parts.append(f'    """{self.docstring}{docstring_end}"""\n\n')

        if not self.body:
            # An empty class still needs a statement to be valid Python.
            parts.append("    pass")
            return "".join(parts)

        last_index = len(self.body) - 1
        previous = None
        for i, item in enumerate(self.body):
            # Separate items with a blank line, except between two
            # consecutive PyCodeBlock instances.
            if previous is not None and not (
                isinstance(previous, PyCodeBlock) and isinstance(item, PyCodeBlock)
            ):
                parts.append("\n")

            # Use item._to_str() to indent each item by one class level (4 spaces).
            assert isinstance(item, (PyCodeBlock, PyFunction))
            parts.append(str(item._to_str(indent_str="    ")))
            if i != last_index:
                parts.append("\n")
            previous = item

        return "".join(parts)

    def __repr__(self):
        return self.__str__() + "\n\n"

    def __setattr__(self, name: str, value: Any) -> None:
        # Strip leading & trailing new lines if `body` is given as a string.
        if name == "body" and isinstance(value, str):
            value = value.strip("\n")
        # Ensure there aren't leading & trailing quotes in `docstring`
        if name == "docstring" and value is not None:
            if '"""' in value:
                value = value.strip()
                value = value.replace('"""', "")
        super().__setattr__(name, value)

    @classmethod
    def extract_first_class_from_text(cls, text: str) -> "PyClass":
        """Parse ``text`` and return the first top-level class found.

        Raises:
            ValueError: If ``text`` contains no top-level class.
            SyntaxError: If ``text`` is not valid Python (from ``ast.parse``).
        """
        classes = cls.extract_all_classes_from_text(text)
        if not classes:
            raise ValueError("No classes found in the provided text.")
        return classes[0]

    @classmethod
    def extract_all_classes_from_text(cls, text: str) -> List["PyClass"]:
        """Parse ``text`` and return all top-level classes found.

        Raises:
            SyntaxError: If ``text`` is not valid Python (from ``ast.parse``).
        """
        # Parse first so that invalid input surfaces as SyntaxError.
        tree = ast.parse(text)
        visitor = _ProgramVisitor(text)
        visitor.visit(tree)
        return visitor.return_program().classes
207
+
208
+
209
@dataclasses.dataclass
class PyProgram:
    """A parsed Python program containing scripts, functions, and classes."""

    # Top-level code not in classes/functions.
    scripts: List["PyCodeBlock"]
    # Top-level functions.
    functions: List["PyFunction"]
    # Top-level classes.
    classes: List["PyClass"]
    # Complete sequence of the file elements; this is what ``str()`` renders.
    elements: List[Union["PyFunction", "PyClass", "PyCodeBlock"]]

    def __str__(self) -> str:
        # Elements are separated by one blank line; surrounding whitespace of
        # the whole program is stripped.
        return "\n\n".join(str(item) for item in self.elements).strip()

    @classmethod
    def from_text(cls, text: str, debug=False) -> Optional["PyProgram"]:
        """Parse ``text`` into a PyProgram object.

        Args:
            text: Python source code.
            debug: If True, re-raise whatever error parsing produced instead
                of returning None.

        Returns:
            The parsed program, or None if parsing failed and ``debug`` is
            False.
        """
        try:
            tree = ast.parse(text)
            visitor = _ProgramVisitor(text)
            visitor.visit(tree)
            return visitor.return_program()
        except Exception:
            # `Exception` rather than a bare `except`, so KeyboardInterrupt
            # and SystemExit are never swallowed.
            if debug:
                raise
            return None

    @classmethod
    def remove_comments(cls, py_code: Union[str, Any]) -> str:
        """Removes all comments from the given Python code string.

        This function uses the `tokenize` module to identify and remove all
        comment tokens (# ...) while attempting to preserve the original
        code structure and formatting.

        Args:
            py_code: The source code; non-string objects are converted with
                ``str()`` first.

        Returns:
            The code without comments, or the (stringified) input unchanged
            if it cannot be tokenized.
        """
        py_code = str(py_code)
        try:
            # Use tokenize to accurately identify and remove comments
            io_obj = BytesIO(py_code.encode("utf-8"))
            tokens = tokenize.tokenize(io_obj.readline)
            filtered_tokens = [t for t in tokens if t.type != tokenize.COMMENT]
            return tokenize.untokenize(filtered_tokens).decode("utf-8")
        except (tokenize.TokenError, IndentationError):
            # Return original code if tokenization fails
            return py_code
257
+
258
+
259
def _indent_code_skip_multi_line_str(code: str, indent_str: str) -> str:
    """Indent ``code`` by ``indent_str`` without touching multi-line strings.

    Lines that continue a multi-line string literal (every line after the one
    where the literal opens, up to and including the line with the closing
    quotes) are emitted unchanged so the string content is preserved.
    Whitespace-only lines become empty lines, mimicking ``textwrap.indent``.

    Args:
        code: Source code to indent.
        indent_str: Prefix added to every indentable, non-blank line.

    Returns:
        The re-indented code, lines joined with ``"\\n"``.
    """
    lines = code.splitlines()
    string_lines = set()

    # Python 3.12+ tokenizes f-strings (and 3.14+ t-strings) as START/MIDDLE/END
    # token sequences instead of a single STRING token; look the token types up
    # defensively so older interpreters keep working.
    literal_starts = {
        getattr(tokenize, name)
        for name in ("FSTRING_START", "TSTRING_START")
        if hasattr(tokenize, name)
    }
    literal_ends = {
        getattr(tokenize, name)
        for name in ("FSTRING_END", "TSTRING_END")
        if hasattr(tokenize, name)
    }
    open_literals = []  # start lines of currently open (possibly nested) f-strings

    # Identify lines belonging to multiline strings
    try:
        for token in tokenize.tokenize(BytesIO(code.encode("utf-8")).readline):
            if token.type == tokenize.STRING:
                first_line, last_line = token.start[0], token.end[0]
            elif token.type in literal_starts:
                open_literals.append(token.start[0])
                continue
            elif token.type in literal_ends and open_literals:
                first_line, last_line = open_literals.pop(), token.end[0]
            else:
                continue
            # Protect the continuation lines (start + 1 .. end). The closing
            # line is protected too, because the closing quotes are usually
            # already positioned correctly in the source string.
            string_lines.update(range(first_line + 1, last_line + 1))
    except (tokenize.TokenError, SyntaxError):
        # Best effort: the code cannot be fully tokenized (e.g. it is
        # incomplete). Keep what was detected so far and indent the rest
        # rather than failing while rendering.
        pass

    result = []
    for lineno, line in enumerate(lines, start=1):
        if lineno in string_lines:
            # Inside a multiline string: append as-is (no indent)
            result.append(line)
        elif line.strip():
            result.append(indent_str + line)
        else:
            # Do not indent blank / whitespace-only lines
            result.append("")

    return "\n".join(result)
296
+
297
+
298
class _ProgramVisitor(ast.NodeVisitor):
    """Parses code to collect all required information to produce a `PyProgram`.

    Functions and classes that start at column 0 are recorded as elements;
    the source lines between them are recorded as script code blocks. Source
    text is sliced by whole lines from the original code, and lines that
    belong to multi-line string literals are never dedented.
    """

    # Token types that open/close an f-string (Python 3.12+) or t-string
    # (Python 3.14+). Older interpreters report these literals as one STRING
    # token and do not define the names, hence the `hasattr` guards.
    _LITERAL_STARTS = frozenset(
        getattr(tokenize, name)
        for name in ("FSTRING_START", "TSTRING_START")
        if hasattr(tokenize, name)
    )
    _LITERAL_ENDS = frozenset(
        getattr(tokenize, name)
        for name in ("FSTRING_END", "TSTRING_END")
        if hasattr(tokenize, name)
    )

    def __init__(self, sourcecode: str):
        self._codelines: List[str] = sourcecode.splitlines()
        self._scripts: List[PyCodeBlock] = []
        self._functions: List[PyFunction] = []
        self._classes: List[PyClass] = []
        self._elements: List[Union[PyFunction, PyClass, PyCodeBlock]] = []
        # 0-based index of the first source line not yet assigned to an element.
        self._last_script_end = 0
        # Pre-process to identify all lines that are part of a multiline string.
        self._multiline_string_lines: Set[int] = self._detect_multiline_strings(
            sourcecode
        )

    def _detect_multiline_strings(self, sourcecode: str) -> Set[int]:
        """Return the 1-based line numbers that continue a multi-line string.

        For a string literal spanning lines ``start..end`` the lines
        ``start + 1 .. end`` are returned. The opening line is excluded
        because it usually also holds the assignment target or key. These
        lines are neither indented nor dedented.
        """
        string_lines: Set[int] = set()
        open_literals: List[int] = []  # start lines of open (nested) f-strings

        for token in tokenize.tokenize(BytesIO(sourcecode.encode("utf-8")).readline):
            if token.type == tokenize.STRING:
                first_line, last_line = token.start[0], token.end[0]
            elif token.type in self._LITERAL_STARTS:
                open_literals.append(token.start[0])
                continue
            elif token.type in self._LITERAL_ENDS and open_literals:
                first_line, last_line = open_literals.pop(), token.end[0]
            else:
                continue
            # Empty range for single-line strings.
            string_lines.update(range(first_line + 1, last_line + 1))

        return string_lines

    def _get_code(self, start_line: int, end_line: int, remove_indent: int = 0) -> str:
        """Get code between start_line and end_line.

        Args:
            start_line: 0-based index of the first line (inclusive).
            end_line: 0-based index of the last line (exclusive).
            remove_indent: The number of spaces to strip from the beginning of each line.
                This corresponds to the column offset of the function/class definition.

        Returns:
            The selected lines joined with newlines, trailing whitespace
            removed; an empty string if the range is empty.
        """
        if start_line >= end_line:
            return ""

        lines = self._codelines[start_line:end_line]
        if remove_indent <= 0:
            # For top-level code (remove_indent=0), return raw code
            return "\n".join(lines).rstrip()

        dedented_lines = []
        # `lineno` is the 1-based line number in the original source file.
        for lineno, line in enumerate(lines, start=start_line + 1):
            inside_string = lineno in self._multiline_string_lines
            # Lines inside a multiline string are preserved exactly. Other
            # lines are dedented only if their first `remove_indent`
            # characters are all whitespace.
            if (
                not inside_string
                and len(line) >= remove_indent
                and line[:remove_indent].isspace()
            ):
                line = line[remove_indent:]
            dedented_lines.append(line)

        return "\n".join(dedented_lines).rstrip()

    def _add_script_segment(self, start_line: int, end_line: int):
        """Add a script segment (gap between functions/classes) from the code."""
        if start_line >= end_line:
            return
        script_code = self._get_code(start_line, end_line).strip()
        if script_code:
            script = PyCodeBlock(code=script_code)
            self._scripts.append(script)
            self._elements.append(script)

    @staticmethod
    def _first_line(node: ast.AST) -> int:
        """Return the 1-based line where ``node`` visually starts.

        This is the first decorator line if the node has decorators,
        otherwise the node's own line.
        """
        decorators = getattr(node, "decorator_list", None)
        if decorators:
            return min(d.lineno for d in decorators)
        return node.lineno

    @staticmethod
    def _extract_docstring(node: ast.AST):
        """Return the stripped docstring of ``node``, or None if it has none.

        Only a *string* constant as first statement counts as a docstring.
        Other constant expressions (``...``, numbers, bytes) are ordinary
        statements; treating them as docstrings would fail on ``.strip()``.
        """
        first = node.body[0]
        if (
            isinstance(first, ast.Expr)
            and isinstance(first.value, ast.Constant)
            and isinstance(first.value.value, str)
        ):
            return first.value.value.strip()
        return None

    def _extract_function_info(
        self, node: Union[ast.FunctionDef, ast.AsyncFunctionDef]
    ) -> "PyFunction":
        """Shared logic to extract information from FunctionDef or AsyncFunctionDef."""
        # For top-level functions col_offset is 0 and nothing is dedented.
        # For methods, text is dedented by the method's column offset.
        remove_indent = node.col_offset

        # Extract decorators
        if getattr(node, "decorator_list", None):
            decorator = self._get_code(
                self._first_line(node) - 1, node.lineno - 1, remove_indent=remove_indent
            )
        else:
            decorator = None

        # Extract docstring and dedent it based on the node offset
        docstring = self._extract_docstring(node)
        if docstring is not None:
            dedented_docstring_lines = []
            for line in docstring.splitlines():
                if len(line) >= remove_indent and line[:remove_indent].isspace():
                    line = line[remove_indent:]
                dedented_docstring_lines.append(line)
            docstring = "\n".join(dedented_docstring_lines)

        # Determine where the actual code body starts
        if docstring and len(node.body) > 1:
            body_start_line = node.body[1].lineno - 1
        elif docstring:
            # The docstring is the only statement: the body is empty.
            body_start_line = node.end_lineno
        else:
            # NOTE(review): whole source lines are taken, so a statement that
            # shares its line with the `def` header (one-liner functions)
            # brings the header along into the body.
            body_start_line = node.body[0].lineno - 1

        # Extract body, and apply critical indentation:
        # For top-level functions, col_offset is 0, the body is not modified
        # For class methods, col_offset is 4, the body loses exactly 4 spaces (the class indent)
        body = self._get_code(
            body_start_line, node.end_lineno, remove_indent=remove_indent
        )

        return PyFunction(
            decorator=decorator,
            name=node.name,
            args=ast.unparse(node.args),
            return_type=ast.unparse(node.returns) if node.returns else None,
            docstring=docstring,
            body=body,
            is_async=isinstance(node, ast.AsyncFunctionDef),
        )

    def _visit_top_level_function(
        self, node: Union[ast.FunctionDef, ast.AsyncFunctionDef]
    ) -> None:
        """Record a function that starts at column 0; ignore any other."""
        if node.col_offset != 0:
            return

        # Everything between the previous element and this function
        # (including its decorators) is script code.
        self._add_script_segment(self._last_script_end, self._first_line(node) - 1)
        self._last_script_end = node.end_lineno

        func = self._extract_function_info(node)
        self._functions.append(func)
        self._elements.append(func)

    def visit_FunctionDef(self, node: ast.FunctionDef) -> None:
        """Handles top-level synchronous functions."""
        self._visit_top_level_function(node)

    def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None:
        """Handles top-level asynchronous functions."""
        self._visit_top_level_function(node)

    def _collect_class_body(self, node: ast.ClassDef, docstring):
        """Split a class body into methods, statements and the ordered body.

        Returns:
            A tuple ``(methods, statements, class_body)``. ``class_body``
            holds all items in source order, including the source found in
            the gaps between AST items.
        """
        methods = []
        statements = []
        class_body = []
        last_inner_end = node.lineno
        body_nodes = node.body

        if docstring:
            # Skip the docstring statement; it is stored separately.
            last_inner_end = body_nodes[0].end_lineno
            body_nodes = body_nodes[1:]

        for item in body_nodes:
            # Visual start of the item: its first decorator, if any.
            item_start_line = self._first_line(item)

            # Capture gaps between the previous item and this one
            gap_code = self._get_code(
                last_inner_end, item_start_line - 1, remove_indent=item.col_offset
            ).strip()
            if gap_code:
                gap_block = PyCodeBlock(code=gap_code)
                statements.append(gap_block)
                class_body.append(gap_block)

            # Process the item
            if isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef)):
                method_func = self._extract_function_info(item)
                methods.append(method_func)
                class_body.append(method_func)
            else:
                code_text = self._get_code(
                    item_start_line - 1,
                    item.end_lineno,
                    remove_indent=item.col_offset,
                )
                block = PyCodeBlock(code=code_text)
                statements.append(block)
                class_body.append(block)

            last_inner_end = item.end_lineno

        return methods, statements, class_body

    def visit_ClassDef(self, node: ast.ClassDef) -> None:
        """Handles top-level classes."""
        if node.col_offset == 0:
            # Handle decorators and preceding script
            start_line = self._first_line(node) - 1
            if getattr(node, "decorator_list", None):
                decorator_code = self._get_code(start_line, node.lineno - 1)
            else:
                decorator_code = None

            self._add_script_segment(self._last_script_end, start_line)
            self._last_script_end = node.end_lineno

            docstring = self._extract_docstring(node)

            # Extract class basic info
            bases = (
                ", ".join([ast.unparse(base) for base in node.bases])
                if node.bases
                else None
            )

            methods, statements, class_body = self._collect_class_body(node, docstring)

            class_obj = PyClass(
                decorator=decorator_code,
                name=node.name,
                bases=bases,
                docstring=docstring,
                statements=statements if statements else None,
                functions=methods,
                body=class_body if class_body else None,
            )
            self._classes.append(class_obj)
            self._elements.append(class_obj)

        # The visit handlers only record nodes with col_offset == 0.
        self.generic_visit(node)

    def return_program(self) -> "PyProgram":
        """Finalizes parsing and returns the PyProgram object."""
        # Whatever follows the last function/class is trailing script code.
        self._add_script_segment(self._last_script_end, len(self._codelines))

        return PyProgram(
            scripts=self._scripts,
            functions=self._functions,
            classes=self._classes,
            elements=self._elements,
        )
570
+
571
+
572
if __name__ == "__main__":
    # Smoke test: parse this very file and print the reconstructed program.
    # The encoding is given explicitly so the read does not depend on the
    # platform default; the parser tokenizes the source as UTF-8.
    with open(__file__, encoding="utf-8") as f:
        code = f.read()

    # debug=True re-raises parsing errors instead of returning None.
    code = PyProgram.from_text(code, debug=True)
    print(code)
@@ -0,0 +1,2 @@
1
+ from adtools.sandbox.sandbox_executor import SandboxExecutor, ExecutionResults
2
+ from adtools.sandbox.sandbox_executor_ray import SandboxExecutorRay