tangle-cli 0.0.1a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tangle_cli/__init__.py +19 -0
- tangle_cli/api_cli.py +787 -0
- tangle_cli/api_schema.py +633 -0
- tangle_cli/api_transport.py +461 -0
- tangle_cli/args_container.py +244 -0
- tangle_cli/artifacts.py +293 -0
- tangle_cli/artifacts_cli.py +108 -0
- tangle_cli/cli.py +57 -0
- tangle_cli/cli_helpers.py +116 -0
- tangle_cli/cli_options.py +52 -0
- tangle_cli/client.py +677 -0
- tangle_cli/component_from_func.py +1856 -0
- tangle_cli/component_generator.py +298 -0
- tangle_cli/component_inspector.py +494 -0
- tangle_cli/component_publisher.py +921 -0
- tangle_cli/components_cli.py +269 -0
- tangle_cli/dynamic_discovery_client.py +296 -0
- tangle_cli/generated_model_extensions.py +405 -0
- tangle_cli/generated_runtime.py +43 -0
- tangle_cli/handler.py +96 -0
- tangle_cli/hydration_trust.py +222 -0
- tangle_cli/logger.py +166 -0
- tangle_cli/models.py +407 -0
- tangle_cli/module_bundler.py +662 -0
- tangle_cli/openapi/__init__.py +0 -0
- tangle_cli/openapi/codegen.py +1090 -0
- tangle_cli/openapi/parser.py +77 -0
- tangle_cli/pipeline_dehydrator.py +720 -0
- tangle_cli/pipeline_hydrator.py +1785 -0
- tangle_cli/pipeline_run_annotations.py +41 -0
- tangle_cli/pipeline_run_details.py +203 -0
- tangle_cli/pipeline_run_manager.py +1994 -0
- tangle_cli/pipeline_run_search.py +712 -0
- tangle_cli/pipeline_runner.py +620 -0
- tangle_cli/pipeline_runs_cli.py +584 -0
- tangle_cli/pipelines.py +581 -0
- tangle_cli/pipelines_cli.py +271 -0
- tangle_cli/published_components_cli.py +373 -0
- tangle_cli/py.typed +0 -0
- tangle_cli/quickstart.py +110 -0
- tangle_cli/secrets.py +156 -0
- tangle_cli/secrets_cli.py +269 -0
- tangle_cli/utils.py +942 -0
- tangle_cli/version_manager.py +470 -0
- tangle_cli-0.0.1a1.dist-info/METADATA +561 -0
- tangle_cli-0.0.1a1.dist-info/RECORD +48 -0
- tangle_cli-0.0.1a1.dist-info/WHEEL +4 -0
- tangle_cli-0.0.1a1.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,1856 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Component YAML generator from Python functions.
|
|
3
|
+
|
|
4
|
+
Converts Python functions into Tangle component YAML files. Supports two modes:
|
|
5
|
+
|
|
6
|
+
- **inline** (default): Single-file components with source code embedded directly.
|
|
7
|
+
- **bundle**: Multi-file components with local dependency modules serialized via
|
|
8
|
+
zlib-compressed source text and injected into sys.modules at runtime.
|
|
9
|
+
|
|
10
|
+
Key functions:
|
|
11
|
+
- generate_component_yaml() - Top-level entry point for YAML generation
|
|
12
|
+
- extract_interface() - Introspects a function's signature, types, and docstring
|
|
13
|
+
- extract_file_metadata() - Extracts metadata (name, version, etc.) from source via AST
|
|
14
|
+
- extract_docstring_metadata() - Parses the Metadata section from a docstring string
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import ast
|
|
18
|
+
import importlib.util
|
|
19
|
+
import inspect
|
|
20
|
+
import json
|
|
21
|
+
import os
|
|
22
|
+
import re
|
|
23
|
+
import sys
|
|
24
|
+
import textwrap
|
|
25
|
+
import types
|
|
26
|
+
import typing
|
|
27
|
+
import warnings
|
|
28
|
+
from dataclasses import dataclass, field
|
|
29
|
+
from pathlib import Path
|
|
30
|
+
from typing import Any, Callable, Literal
|
|
31
|
+
|
|
32
|
+
import docstring_parser
|
|
33
|
+
|
|
34
|
+
from tangle_cli.module_bundler import ModuleBundler
|
|
35
|
+
from tangle_cli.utils import dump_yaml, get_git_info, get_git_root
|
|
36
|
+
|
|
37
|
+
try:
|
|
38
|
+
import tomllib
|
|
39
|
+
except ModuleNotFoundError:
|
|
40
|
+
import tomli as tomllib # type: ignore[no-redef]
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
# ============================================================================
|
|
44
|
+
# InputPath / OutputPath annotation types
|
|
45
|
+
# ============================================================================
|
|
46
|
+
# These mirror the cloud_pipelines.components types so we can introspect
|
|
47
|
+
# functions that use them without requiring the cloud_pipelines SDK.
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class InputPath:
|
|
51
|
+
"""Annotation indicating a function parameter receives a file path for input data."""
|
|
52
|
+
|
|
53
|
+
def __init__(self, type: str | None = None):
|
|
54
|
+
self.type = type
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class OutputPath:
|
|
58
|
+
"""Annotation indicating a function parameter receives a file path for output data."""
|
|
59
|
+
|
|
60
|
+
def __init__(self, type: str | None = None):
|
|
61
|
+
self.type = type
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
# ============================================================================
|
|
65
|
+
# Type mapping (replicating Cloud-Pipelines SDK _data_passing.py)
|
|
66
|
+
# ============================================================================
|
|
67
|
+
|
|
68
|
+
# Builtin Python type -> Tangle type name.
_TYPE_TO_TANGLE: dict[type, str] = {
    str: "String",
    int: "Integer",
    float: "Float",
    bool: "Boolean",
    list: "JsonArray",
    dict: "JsonObject",
}

# Tangle type name -> expression used as the argparse ``type=`` deserializer.
_TYPE_TO_DESERIALIZER: dict[str, str] = {
    "String": "str",
    "Integer": "int",
    "Float": "float",
    "Boolean": "_deserialize_bool",
    "JsonArray": "json.loads",
    "JsonObject": "json.loads",
}

# Tangle type name -> extra source text the generated code needs so that the
# deserializer expression above resolves. The helper source is spelled out line
# by line so its exact text does not depend on how this file is indented.
_TYPE_DEFINITIONS: dict[str, str] = {
    "Boolean": "\n".join(
        [
            "def _deserialize_bool(s):",
            "    s = s.lower()",
            '    if s in ("true", "1", "yes"):',
            "        return True",
            '    if s in ("false", "0", "no"):',
            "        return False",
            "    raise TypeError(",
            "        f'Error parsing \"{s}\" as bool value. Supported values: \"true\", \"false\", \"1\", \"0\".'",
            "    )",
        ]
    ),
    "JsonArray": "import json",
    "JsonObject": "import json",
}

# Source of the helper named by the OutputPath deserializer: it creates the
# parent directory of the given path and hands the path back unchanged.
_MAKE_PARENT_DIRS_HELPER = "\n".join(
    [
        "def _make_parent_dirs_and_return_path(file_path: str):",
        "    import os",
        "    os.makedirs(os.path.dirname(file_path), exist_ok=True)",
        "    return file_path",
    ]
)

# Tangle type name -> expression that turns a returned value into text
# (used for return-value outputs such as NamedTuple fields).
_TYPE_TO_SERIALIZER: dict[str, str] = {
    "String": "_serialize_str",
    "Integer": "str",
    "Float": "str",
    "Boolean": "str",
    "JsonArray": "json.dumps",
    "JsonObject": "json.dumps",
}

# Source of the ``_serialize_str`` helper referenced by the table above.
_SERIALIZE_STR_HELPER = "\n".join(
    [
        "def _serialize_str(str_value) -> str:",
        "    if isinstance(str_value, str):",
        "        return str_value",
        "    else:",
        "        return str(str_value)",
    ]
)
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
# ============================================================================
|
|
129
|
+
# Data structures
|
|
130
|
+
# ============================================================================
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
@dataclass
|
|
134
|
+
class ParamInfo:
|
|
135
|
+
"""Describes a single function parameter mapped to a component input or output."""
|
|
136
|
+
|
|
137
|
+
name: str # Python parameter name
|
|
138
|
+
yaml_name: str # Name in YAML (may have _path/_file suffix stripped)
|
|
139
|
+
python_type: str | None # Original Python type annotation string
|
|
140
|
+
tangle_type: str | None # Tangle type: String, Integer, Float, etc.
|
|
141
|
+
kind: Literal["input", "output", "input_path", "return_output"]
|
|
142
|
+
description: str | None = None
|
|
143
|
+
default: Any = inspect.Parameter.empty
|
|
144
|
+
optional: bool = False
|
|
145
|
+
deserializer: str = "str" # argparse type= expression
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
@dataclass
class FunctionSpec:
    """Everything component generation needs to know about one function."""

    name: str
    component_name: str
    description: str | None
    params: list[ParamInfo] = field(default_factory=list)
    # Outputs that come from the return value rather than from parameters.
    return_params: list[ParamInfo] = field(default_factory=list)
    # True for a plain return type such as ``-> str`` (not a NamedTuple); the
    # generated code then needs the ``_outputs = [_outputs]`` wrapping.
    single_return_output: bool = False
    source_code: str = ""
    source_code_stripped: str = ""
    # Source of the whole module (for bundle mode).
    module_source_stripped: str = ""
    # name, version, updated_at taken from the docstring ``Metadata:`` section.
    docstring_metadata: dict[str, str] = field(default_factory=dict)

    @property
    def inputs(self) -> list[ParamInfo]:
        """Parameters whose kind is ``input`` or ``input_path``, in declaration order."""
        input_kinds = ("input", "input_path")
        return [param for param in self.params if param.kind in input_kinds]

    @property
    def outputs(self) -> list[ParamInfo]:
        """Parameters whose kind is ``output`` (OutputPath parameters)."""
        return list(filter(lambda param: param.kind == "output", self.params))

    @property
    def all_outputs(self) -> list[ParamInfo]:
        """OutputPath parameters followed by the return-value outputs."""
        path_outputs = self.outputs
        return path_outputs + self.return_params
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
# ============================================================================
|
|
179
|
+
# Module loading
|
|
180
|
+
# ============================================================================
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def _ensure_cloud_pipelines_shim() -> None:
    """Install the import-time shims needed to introspect authoring files.

    When no ``cloud_pipelines`` entry exists in ``sys.modules``, a stand-in
    package with a ``components`` submodule exposing the local ``InputPath`` and
    ``OutputPath`` classes is registered, so ``from cloud_pipelines import
    components`` succeeds without that package. The TD authoring shim is
    ensured afterwards in every case.
    """
    if "cloud_pipelines" not in sys.modules:
        components_shim = types.ModuleType("cloud_pipelines.components")
        for attr_name, marker_cls in (("InputPath", InputPath), ("OutputPath", OutputPath)):
            setattr(components_shim, attr_name, marker_cls)

        package_shim = types.ModuleType("cloud_pipelines")
        setattr(package_shim, "components", components_shim)

        # Register both the package and the submodule so either import form resolves.
        sys.modules.update(
            {
                "cloud_pipelines": package_shim,
                "cloud_pipelines.components": components_shim,
            }
        )

    _ensure_tangle_deploy_authoring_shim()
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def _identity_decorator(*args, **kwargs):
    """Stand-in decorator that leaves the decorated object untouched.

    Supports both decorator spellings:

    - bare, ``@deco``: Python passes the decorated object as the single
      positional argument, and that object is returned as-is;
    - called, ``@deco(...)``: the arguments are ignored and a pass-through
      decorator is returned.

    A call whose only argument is a callable cannot be told apart from bare use
    and is treated as bare use.

    Returns:
        The decorated object (bare use) or a decorator returning its argument.
    """
    # Bare use: without this check the decorated function would be replaced by
    # the inner ``decorate`` closure, hiding its real signature.
    if len(args) == 1 and not kwargs and callable(args[0]):
        return args[0]

    def decorate(func):
        return func

    return decorate
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
class _AuthoringGeneric:
    """Inert placeholder type: subscripting and instantiation both succeed and do nothing."""

    def __init__(self, *args, **kwargs):
        """Accept and discard any arguments."""

    def __class_getitem__(cls, item):
        """Return the class itself so that ``Name[...]`` evaluates without error."""
        return cls
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def _ensure_tangle_deploy_authoring_shim() -> None:
    """Register a minimal ``tangle_deploy.python_pipeline`` stand-in if none is loaded.

    Nothing happens when ``tangle_deploy.python_pipeline`` is already in
    ``sys.modules``. Otherwise a shim submodule is created whose decorators
    return their target unchanged, whose type names are inert placeholders and
    whose ``ref`` returns ``None``. It is attached to the ``tangle_deploy``
    package already in ``sys.modules``, or to a freshly created one.
    """
    submodule_name = "tangle_deploy.python_pipeline"
    if submodule_name in sys.modules:
        return

    package = sys.modules.get("tangle_deploy") or types.ModuleType("tangle_deploy")
    shim = types.ModuleType(submodule_name)

    # Collect every placeholder first, then attach them in one pass.
    placeholders: dict[str, Any] = {}
    placeholders.update(dict.fromkeys(("task", "pipeline", "subpipeline", "registered"), _identity_decorator))
    placeholders.update(dict.fromkeys(("In", "Out", "Outputs", "TaskEnv"), _AuthoringGeneric))
    placeholders["ref"] = lambda *args, **kwargs: None
    for attr_name, placeholder in placeholders.items():
        setattr(shim, attr_name, placeholder)

    setattr(package, "python_pipeline", shim)
    # setdefault keeps an already-registered package object in place.
    sys.modules.setdefault("tangle_deploy", package)
    sys.modules[submodule_name] = shim
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def load_python_module(file_path: Path, extra_sys_path: list[Path] | None = None) -> Any:
    """Dynamically import a Python module from a file path.

    The module is executed under the name ``file_path.stem``. It is not added
    to ``sys.modules`` by this function.

    Args:
        file_path: Path to the Python source file.
        extra_sys_path: Additional directories to add to ``sys.path`` during
            module loading. This is needed when the module imports sibling
            packages that live outside ``file_path.parent`` (e.g. when
            ``--resolve-root`` points at a parent ``src/`` directory). They are
            searched in the given order, ahead of the module's own directory.

    Returns:
        The executed module object.

    Raises:
        ValueError: If no import spec (or loader) can be created for the file.
    """
    _ensure_cloud_pipelines_shim()

    module_name = file_path.stem
    spec = importlib.util.spec_from_file_location(module_name, location=str(file_path))
    if not spec or not spec.loader:
        raise ValueError(f"Unable to create module spec for {file_path}")
    module = importlib.util.module_from_spec(spec)

    saved_path = sys.path.copy()
    try:
        # Every sys.path edit happens inside the try so that a failure while
        # resolving a directory cannot leave stray entries behind.
        module_dir = str(file_path.parent.resolve())
        if module_dir not in sys.path:
            sys.path.insert(0, module_dir)
        # Inserting at the front in reverse keeps the caller's order.
        for extra_dir in reversed(extra_sys_path or []):
            extra_dir_str = str(extra_dir.resolve())
            if extra_dir_str not in sys.path:
                sys.path.insert(0, extra_dir_str)
        spec.loader.exec_module(module)
    finally:
        # Restore in place: rebinding ``sys.path`` to a new list would strand
        # any code that still holds a reference to the original list object.
        sys.path[:] = saved_path
    return module
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
def get_function_from_module(module: Any, function_name: str | None = None) -> Callable:
|
|
273
|
+
"""Get a function from a loaded module.
|
|
274
|
+
|
|
275
|
+
If function_name is specified, returns that function.
|
|
276
|
+
Otherwise, returns the single public function (errors if 0 or >1).
|
|
277
|
+
"""
|
|
278
|
+
if function_name:
|
|
279
|
+
func = getattr(module, function_name, None)
|
|
280
|
+
if func is None or not callable(func):
|
|
281
|
+
raise ValueError(f"Function '{function_name}' not found in module {module.__name__}")
|
|
282
|
+
return func
|
|
283
|
+
|
|
284
|
+
functions = [
|
|
285
|
+
getattr(module, name)
|
|
286
|
+
for name in dir(module)
|
|
287
|
+
if not name.startswith("_") and callable(getattr(module, name)) and not isinstance(getattr(module, name), type)
|
|
288
|
+
]
|
|
289
|
+
|
|
290
|
+
if not functions:
|
|
291
|
+
raise ValueError(f"No public functions found in module {module.__name__}")
|
|
292
|
+
if len(functions) > 1:
|
|
293
|
+
names = [f.__name__ for f in functions]
|
|
294
|
+
raise ValueError(
|
|
295
|
+
f"Found multiple functions in module {module.__name__}: {names}. " "Please specify --function-name."
|
|
296
|
+
)
|
|
297
|
+
return functions[0]
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
# ============================================================================
|
|
301
|
+
# Type annotation resolution
|
|
302
|
+
# ============================================================================
|
|
303
|
+
|
|
304
|
+
|
|
305
|
+
def _resolve_annotation(annotation: Any) -> tuple[str | None, str, Literal["input", "output", "input_path"]]:
|
|
306
|
+
"""Resolve a parameter annotation to (tangle_type, deserializer, kind).
|
|
307
|
+
|
|
308
|
+
Returns:
|
|
309
|
+
(tangle_type, deserializer_code, kind)
|
|
310
|
+
"""
|
|
311
|
+
if annotation is inspect.Parameter.empty or annotation is None:
|
|
312
|
+
return "String", "str", "input"
|
|
313
|
+
|
|
314
|
+
# Handle InputPath / OutputPath (both our local versions and cloud_pipelines versions)
|
|
315
|
+
type_name = type(annotation).__name__
|
|
316
|
+
if type_name == "OutputPath":
|
|
317
|
+
inner_type = getattr(annotation, "type", None) or "String"
|
|
318
|
+
return inner_type, "_make_parent_dirs_and_return_path", "output"
|
|
319
|
+
if type_name == "InputPath":
|
|
320
|
+
inner_type = getattr(annotation, "type", None) or "String"
|
|
321
|
+
return inner_type, "str", "input_path"
|
|
322
|
+
|
|
323
|
+
# Handle generic types first: Optional[T], list[T], dict[K,V], Union[T, None]
|
|
324
|
+
# Must come before isinstance(type) check because list[str] passes isinstance(type) in Python 3.10
|
|
325
|
+
origin = typing.get_origin(annotation)
|
|
326
|
+
if origin in (list,):
|
|
327
|
+
return "JsonArray", "json.loads", "input"
|
|
328
|
+
if origin in (dict,):
|
|
329
|
+
return "JsonObject", "json.loads", "input"
|
|
330
|
+
if origin is typing.Union or origin is types.UnionType:
|
|
331
|
+
args = typing.get_args(annotation)
|
|
332
|
+
# Optional[T] == Union[T, None]
|
|
333
|
+
if len(args) == 2 and type(None) in args:
|
|
334
|
+
non_none = args[0] if args[1] is type(None) else args[1]
|
|
335
|
+
return _resolve_annotation(non_none)
|
|
336
|
+
return None, "str", "input"
|
|
337
|
+
|
|
338
|
+
# Handle direct Python types (after generic check)
|
|
339
|
+
if isinstance(annotation, type):
|
|
340
|
+
tangle = _TYPE_TO_TANGLE.get(annotation)
|
|
341
|
+
if tangle:
|
|
342
|
+
return tangle, _TYPE_TO_DESERIALIZER[tangle], "input"
|
|
343
|
+
return str(annotation.__name__), "str", "input"
|
|
344
|
+
|
|
345
|
+
# ForwardRef or other annotation — use string representation
|
|
346
|
+
return str(getattr(annotation, "__forward_arg__", annotation)), "str", "input"
|
|
347
|
+
|
|
348
|
+
|
|
349
|
+
def _make_return_param(name: str, annotation: type) -> ParamInfo:
    """Create a ParamInfo for a return value output.

    Args:
        name: Output name (used both as Python name and YAML name).
        annotation: Annotation of the returned value or NamedTuple field.

    Returns:
        A ``return_output`` ParamInfo whose ``deserializer`` field holds the
        serializer expression for the output's Tangle type.
    """
    # Parameterized aliases such as list[str] or dict[str, int] are not the
    # builtin classes themselves; look them up by their origin so they map to
    # JsonArray/JsonObject, the same way input annotations do. Anything not in
    # the table still falls back to "String".
    lookup_key = typing.get_origin(annotation) or annotation
    tangle_type = _TYPE_TO_TANGLE.get(lookup_key, "String")
    return ParamInfo(
        name=name,
        yaml_name=name,
        python_type=str(annotation) if annotation else None,
        tangle_type=tangle_type,
        kind="return_output",
        description=None,
        deserializer=_TYPE_TO_SERIALIZER.get(tangle_type, "_serialize_str"),
    )
|
|
361
|
+
|
|
362
|
+
|
|
363
|
+
def _resolve_namedtuple_return(return_ann: Any) -> list[ParamInfo]:
    """Build one return output per field of a NamedTuple return annotation.

    Fields are taken from ``return_ann._fields`` in order. A field with no
    recorded annotation is treated as ``str``.
    """
    # Prefer __annotations__; _field_types is the older attribute name
    # (doesn't exist in python 3.9 and later).
    field_types = getattr(return_ann, "__annotations__", None) or getattr(return_ann, "_field_types", None)

    outputs = []
    for field_name in return_ann._fields:
        field_annotation = field_types.get(field_name, str) if field_types else str
        outputs.append(_make_return_param(name=field_name, annotation=field_annotation))
    return outputs
|
|
375
|
+
|
|
376
|
+
|
|
377
|
+
def _resolve_single_return(return_ann: type) -> ParamInfo | None:
    """Build the ``Output`` parameter for a plain (non-NamedTuple) return type.

    Returns:
        A return output named ``"Output"`` when ``return_ann`` is one of the
        types in ``_TYPE_TO_TANGLE``; ``None`` otherwise.
    """
    if return_ann in _TYPE_TO_TANGLE:
        return _make_return_param(name="Output", annotation=return_ann)
    return None
|
|
385
|
+
|
|
386
|
+
|
|
387
|
+
def _resolve_return_type(func: Callable) -> tuple[list[ParamInfo], bool]:
    """Derive output parameters from the function's return annotation.

    Matches the Cloud-Pipelines SDK behavior:
    - a NamedTuple return gives one output per field (multi-output);
    - a single recognized type (str, int, etc.) gives one output named "Output";
    - a missing or ``None`` annotation, or an unrecognized type, gives no outputs.

    Returns:
        ``(return_params, single_return_output)``. The flag is True only for the
        plain-type case, where the generated code needs the
        ``_outputs = [_outputs]`` wrapping.
    """
    # inspect.signature is used, as the SDK does, rather than
    # typing.get_type_hints, which has issues with InputPath/OutputPath
    # instances that aren't valid types for Optional[].
    declared = inspect.signature(func).return_annotation

    if declared is not None and declared is not inspect.Parameter.empty:
        # Anything exposing ``_fields`` is treated as a NamedTuple.
        if hasattr(declared, "_fields"):
            return _resolve_namedtuple_return(declared), False

        single = _resolve_single_return(declared)
        if single:
            return [single], True

    return [], False
|
|
414
|
+
|
|
415
|
+
|
|
416
|
+
# ============================================================================
|
|
417
|
+
# Interface extraction
|
|
418
|
+
# ============================================================================
|
|
419
|
+
|
|
420
|
+
|
|
421
|
+
def _python_name_to_component_name(name: str) -> str:
    """Turn a Python function name into a human-readable component name.

    Underscores become single spaces and the first character is upper-cased.
    A name with nothing left after that (e.g. only underscores) is returned
    unchanged.
    """
    words = [chunk for chunk in name.replace("_", " ").split(" ") if chunk]
    readable = " ".join(words).strip()
    if readable:
        return readable[0].upper() + readable[1:]
    return name
|
|
427
|
+
|
|
428
|
+
|
|
429
|
+
def extract_docstring_metadata(docstring: str) -> dict[str, str]:
    """Pull the description and the ``Metadata:`` key/value pairs out of a docstring.

    The description is every non-blank line before the first section header,
    joined with single spaces. Inside a ``Metadata:`` section each
    ``key: value`` line becomes an entry (keys lower-cased), for example::

        Metadata:
            name: My Component Name
            version: 1.2
            updated_at: 2025-01-01T00:00:00Z

    ``version_timestamp`` is stored under ``updated_at``. Parsing stops at the
    first section header that follows the Metadata section.

    Returns:
        Dict with keys like "description", "name", "version", "updated_at"
        (only present if found).
    """
    section_names = frozenset(
        {
            "args",
            "arguments",
            "parameters",
            "returns",
            "raises",
            "yields",
            "note",
            "notes",
            "example",
            "examples",
            "metadata",
        }
    )
    pair_pattern = re.compile(r"^(\w[\w_]*)\s*:\s*(.+)")

    found: dict[str, str] = {}
    summary_parts: list[str] = []
    # One of: "description" (before any header), "metadata", "other".
    state = "description"

    for raw_line in docstring.split("\n"):
        text = raw_line.strip()

        if text and text.rstrip(":").lower() in section_names:
            if text.lower() == "metadata:":
                state = "metadata"
            elif state == "metadata":
                # A different section begins: the Metadata block is over.
                break
            else:
                state = "other"
            continue

        if state == "metadata":
            pair = pair_pattern.match(text)
            if pair is None:
                continue
            key = pair.group(1).lower()
            # version_timestamp is an alias for updated_at.
            found["updated_at" if key == "version_timestamp" else key] = pair.group(2).strip()
        elif state == "description" and text:
            summary_parts.append(text)

    if summary_parts:
        found["description"] = " ".join(summary_parts)

    return found
|
|
498
|
+
|
|
499
|
+
|
|
500
|
+
def find_function_in_source(
|
|
501
|
+
file_path: Path, function_name: str | None = None
|
|
502
|
+
) -> tuple[str | None, ast.FunctionDef | None]:
|
|
503
|
+
"""Find a function in a Python source file by AST parsing.
|
|
504
|
+
|
|
505
|
+
Args:
|
|
506
|
+
file_path: Path to the Python file
|
|
507
|
+
function_name: Name of function to find. If not found or not provided,
|
|
508
|
+
falls back to first public function in the file.
|
|
509
|
+
|
|
510
|
+
Returns:
|
|
511
|
+
Tuple of (function_name, function_node) or (None, None) if no functions found.
|
|
512
|
+
"""
|
|
513
|
+
try:
|
|
514
|
+
content = file_path.read_text()
|
|
515
|
+
tree = ast.parse(content)
|
|
516
|
+
|
|
517
|
+
all_functions = [
|
|
518
|
+
node
|
|
519
|
+
for node in ast.iter_child_nodes(tree)
|
|
520
|
+
if isinstance(node, ast.FunctionDef) and not node.name.startswith("_")
|
|
521
|
+
]
|
|
522
|
+
|
|
523
|
+
if not all_functions:
|
|
524
|
+
return None, None
|
|
525
|
+
|
|
526
|
+
if function_name:
|
|
527
|
+
for func in all_functions:
|
|
528
|
+
if func.name == function_name:
|
|
529
|
+
return func.name, func
|
|
530
|
+
# Function not found, fall back to first function
|
|
531
|
+
first_func = all_functions[0]
|
|
532
|
+
warnings.warn(
|
|
533
|
+
f"Function '{function_name}' not found in {file_path.name}, " f"using '{first_func.name}' instead"
|
|
534
|
+
)
|
|
535
|
+
return first_func.name, first_func
|
|
536
|
+
|
|
537
|
+
first_func = all_functions[0]
|
|
538
|
+
return first_func.name, first_func
|
|
539
|
+
|
|
540
|
+
except (SyntaxError, ValueError, OSError) as e:
|
|
541
|
+
warnings.warn(f"Could not parse {file_path}: {e}")
|
|
542
|
+
return None, None
|
|
543
|
+
|
|
544
|
+
|
|
545
|
+
def extract_file_metadata(file_path: Path, function_name: str | None = None) -> tuple[dict[str, str], str | None]:
    """Read a function's docstring metadata straight from a source file.

    The function is located via AST (no import of the file), its docstring is
    extracted and handed to ``extract_docstring_metadata``.

    Args:
        file_path: Path to the Python file
        function_name: Function to extract from. Defaults to the file stem with
            hyphens replaced by underscores.

    Returns:
        Tuple of (metadata_dict, actual_function_name_used). The dict is empty
        when the function has no docstring; the name is ``None`` when no
        function could be located.
    """
    lookup_name = function_name or file_path.stem.replace("-", "_")

    resolved_name, node = find_function_in_source(file_path, lookup_name)
    if not node:
        return {}, None

    doc = ast.get_docstring(node)
    metadata = extract_docstring_metadata(doc) if doc else {}
    return metadata, resolved_name
|
|
570
|
+
|
|
571
|
+
|
|
572
|
+
def extract_interface(
    func: Callable,
    docstring_metadata: dict[str, str],
) -> FunctionSpec:
    """Build the component interface description for a Python function.

    Parameters come from ``inspect.signature()``; parameter and return
    descriptions come from ``docstring_parser``.

    Args:
        func: The Python function to introspect.
        docstring_metadata: Metadata from extract_file_metadata or
            extract_docstring_metadata; its ``name`` entry, when present,
            becomes the component name.

    Returns:
        A FunctionSpec. ``module_source_stripped`` is left empty here; it is
        populated externally via generate_component_yaml, which has the file path.
    """
    signature = inspect.signature(func)
    parsed_doc = docstring_parser.parse(inspect.getdoc(func) or "")
    param_docs = {entry.arg_name: entry.description for entry in parsed_doc.params}

    params: list[ParamInfo] = []
    for parameter in signature.parameters.values():
        annotation = parameter.annotation
        tangle_type, deserializer, kind = _resolve_annotation(annotation)

        # Path parameters lose one trailing _path/_file in their YAML name.
        yaml_name = parameter.name
        if kind in ("output", "input_path"):
            for suffix in ("_path", "_file"):
                if yaml_name.endswith(suffix):
                    yaml_name = yaml_name[: -len(suffix)]
                    break

        # A default makes a plain input optional and is recorded; an input
        # path is optional only when its default is None.
        has_default = parameter.default is not inspect.Parameter.empty
        takes_default = has_default and kind == "input"
        optional = takes_default or (has_default and kind == "input_path" and parameter.default is None)
        default = parameter.default if takes_default else inspect.Parameter.empty

        params.append(
            ParamInfo(
                name=parameter.name,
                yaml_name=yaml_name,
                python_type=None if annotation is inspect.Parameter.empty else str(annotation),
                tangle_type=tangle_type,
                kind=kind,
                description=param_docs.get(parameter.name),
                default=default,
                optional=optional,
                deserializer=deserializer,
            )
        )

    component_name = docstring_metadata.get("name") or _python_name_to_component_name(func.__name__)

    description = parsed_doc.description
    if description:
        # docstring_parser doesn't understand the Metadata: section, so cut
        # the description at its header.
        doc_lines = description.split("\n")
        cut_at = next(
            (index for index, text in enumerate(doc_lines) if text.strip().lower() == "metadata:"),
            len(doc_lines),
        )
        description = "\n".join(doc_lines[:cut_at]).strip()

    source_code = ""
    source_code_stripped = ""
    module_source_stripped = ""
    try:
        source_lines = textwrap.dedent(inspect.getsource(func)).split("\n")
        # Drop everything (decorators) ahead of the ``def`` line.
        def_index = next(
            (index for index, text in enumerate(source_lines) if text.startswith("def ")),
            len(source_lines),
        )
        source_code = "\n".join(source_lines[def_index:])
        source_code_stripped = _strip_type_hints(source_code)
    except (OSError, TypeError) as e:
        warnings.warn(f"Could not get source code for {func.__name__}: {e}")

    # Outputs that come from the return annotation (NamedTuple or single value).
    return_params, single_return_output = _resolve_return_type(func)

    # docstring_parser interprets "field_name: description" under Returns as
    # type_name=field_name, so both return_name and type_name are checked.
    if return_params and parsed_doc.many_returns:
        return_docs: dict[str, str] = {
            (entry.return_name or entry.type_name): entry.description
            for entry in parsed_doc.many_returns
            if (entry.return_name or entry.type_name) and entry.description
        }
        for return_param in return_params:
            if return_param.name in return_docs:
                return_param.description = return_docs[return_param.name]

    return FunctionSpec(
        name=func.__name__,
        component_name=component_name,
        description=description,
        params=params,
        return_params=return_params,
        single_return_output=single_return_output,
        source_code=source_code,
        source_code_stripped=source_code_stripped,
        module_source_stripped=module_source_stripped,
        docstring_metadata=docstring_metadata,
    )
|
|
684
|
+
|
|
685
|
+
|
|
686
|
+
# ============================================================================
|
|
687
|
+
# __main__ guard stripping
|
|
688
|
+
# ============================================================================
|
|
689
|
+
|
|
690
|
+
|
|
691
|
+
def _strip_main_guard(source_code: str) -> str:
    """Remove top-level ``if __name__ == "__main__":`` blocks from source code.

    These guards conflict with the generated argparse wrapper because both
    execute at module level. When the guard appears *before* the wrapper it
    fires first and typically calls ``sys.exit()``, preventing the component
    from running.

    Only ``if`` statements that are direct children of the module are
    inspected. A matching statement is deleted over its full AST line range
    (``lineno`` .. ``end_lineno``), which also covers any ``elif`` / ``else``
    branch attached to it. All other lines are returned untouched, so
    comments and formatting survive.

    Args:
        source_code: Python module source text.

    Returns:
        The source with every top-level main guard removed. The input is
        returned unchanged when it does not parse or contains no guard.
    """
    import io  # local import: only needed for tokenizer-compatible line splitting

    try:
        tree = ast.parse(source_code)
    except SyntaxError:
        return source_code

    # 1-indexed line numbers covered by a main guard.
    removed: set[int] = set()
    for node in ast.iter_child_nodes(tree):
        if isinstance(node, ast.If) and _is_name_main_test(node.test):
            last = node.end_lineno or node.lineno
            removed.update(range(node.lineno, last + 1))

    if not removed:
        return source_code

    # AST line numbers count only "\n", "\r\n" and "\r" as line ends.
    # ``str.splitlines`` additionally breaks on form feeds, "\x1c"-"\x1e",
    # "\x85", "\u2028" and "\u2029", which would shift every later index and
    # delete the wrong lines. ``StringIO(newline="")`` splits on exactly the
    # three newline forms and keeps the endings untranslated.
    lines = list(io.StringIO(source_code, newline=""))
    return "".join(line for number, line in enumerate(lines, 1) if number not in removed)
|
|
725
|
+
|
|
726
|
+
|
|
727
|
+
def _is_name_main_test(node: ast.expr) -> bool:
    """Tell whether *node* is the comparison ``__name__ == "__main__"``.

    The two operands may appear in either order. Anything other than a
    single ``==`` comparison between the bare name ``__name__`` and the
    constant ``"__main__"`` yields ``False``.
    """
    is_single_eq = (
        isinstance(node, ast.Compare)
        and len(node.ops) == 1
        and isinstance(node.ops[0], ast.Eq)
        and len(node.comparators) == 1
    )
    if not is_single_eq:
        return False

    def _role(operand: ast.expr):
        # Classify one side of the comparison.
        if isinstance(operand, ast.Name) and operand.id == "__name__":
            return "name"
        if isinstance(operand, ast.Constant) and operand.value == "__main__":
            return "main"
        return None

    # One side must be the dunder name and the other the string constant.
    return {_role(node.left), _role(node.comparators[0])} == {"name", "main"}
|
|
746
|
+
|
|
747
|
+
|
|
748
|
+
# ============================================================================
|
|
749
|
+
# Authoring-construct stripping (authoring imports + @task/@pipeline/@subpipeline/@registered)
|
|
750
|
+
# ============================================================================
|
|
751
|
+
|
|
752
|
+
# Decorators that exist purely to *record* a function at authoring time. They
# must never survive into the baked operation program (see
# _strip_authoring_constructs). ``registered`` marks an op published separately
# via its own gen_config.yaml; when that same op is baked (through its
# local_from_python entry) the decorator and its authoring import must be
# stripped too, exactly like @task. Decorators are matched against this set by
# trailing name only (see _decorator_called_name).
_AUTHORING_DECORATOR_NAMES = frozenset({"task", "pipeline", "subpipeline", "registered"})

# The python-pipeline authoring module. ONLY imports of this module (and its
# submodules) are authoring-only and stripped from the baked source. We
# deliberately do NOT strip other ``tangle_deploy.*`` packages (e.g.
# ``tangle_deploy.utils``): those may be legitimate runtime helpers used inside
# a ``@task`` body, and dropping them would raise ``NameError`` in the
# operation container.
_AUTHORING_IMPORT_MODULE = "tangle_deploy.python_pipeline"

# The authoring-only ``TaskEnv`` class name. A module-level ``X = TaskEnv(...)``
# (or ``X = <alias>.TaskEnv(...)``) declaration is authoring-only by contract
# and is stripped from the baked source by ``_strip_authoring_constructs``.
# Matched by trailing NAME only (like the authoring decorators), because in
# python-pipeline authoring files ``TaskEnv`` always resolves to
# ``tangle_deploy.python_pipeline.TaskEnv``.
_AUTHORING_ENV_CLASS_NAME = "TaskEnv"
|
|
775
|
+
|
|
776
|
+
|
|
777
|
+
class AuthoringStripError(ValueError):
    """Signal that env-only authoring code cannot be stripped safely.

    ``_strip_authoring_constructs`` raises this when a ``@task(env=...)``
    environment binding is entangled with code that stays in the baked
    program, for example:

    - a mixed ``from _envs import UPI, helper`` import whose ``helper`` is
      used at runtime;
    - a collected env name that the kept task body still references;
    - an env name imported inside a nested block.

    Failing at strip time is deliberate: baking a broken
    ``from _envs import UPI`` / ``UPI = TaskEnv(...)`` would otherwise only
    show up as a ``NameError`` / ``ImportError`` when the container starts.
    The error message explains how to split the import or keep TaskEnv
    values authoring-only.
    """
|
|
789
|
+
|
|
790
|
+
|
|
791
|
+
def _decorator_called_name(node: ast.expr) -> str | None:
|
|
792
|
+
"""Return the trailing name a decorator expression resolves to.
|
|
793
|
+
|
|
794
|
+
Handles ``@name`` / ``@name(...)`` and ``@mod.name`` / ``@mod.name(...)``
|
|
795
|
+
forms, returning the trailing attribute/name (e.g. ``task`` for both
|
|
796
|
+
``@task(...)`` and ``@tangle_deploy.python_pipeline.task(...)``). Returns
|
|
797
|
+
``None`` for shapes we do not recognise so callers leave them untouched.
|
|
798
|
+
|
|
799
|
+
Limitation (v1, intentional): matching is by trailing NAME only, not by
|
|
800
|
+
import resolution. A hypothetical unrelated ``@some_other_lib.task(...)``
|
|
801
|
+
decorator would therefore also match. This is acceptable because in
|
|
802
|
+
python-pipeline authoring files the only decorators named ``task`` /
|
|
803
|
+
``pipeline`` / ``subpipeline`` are the authoring decorators; resolving the
|
|
804
|
+
import binding is deferred unless a real collision appears.
|
|
805
|
+
"""
|
|
806
|
+
if isinstance(node, ast.Call):
|
|
807
|
+
node = node.func
|
|
808
|
+
if isinstance(node, ast.Name):
|
|
809
|
+
return node.id
|
|
810
|
+
if isinstance(node, ast.Attribute):
|
|
811
|
+
return node.attr
|
|
812
|
+
return None
|
|
813
|
+
|
|
814
|
+
|
|
815
|
+
def _is_authoring_import(node: ast.stmt) -> bool:
    """Tell whether *node* imports the python-pipeline authoring module.

    Only ``_AUTHORING_IMPORT_MODULE`` (``tangle_deploy.python_pipeline``)
    and its submodules count:

    - ``from tangle_deploy.python_pipeline import ...``, including aliased
      names such as ``import ref as operation_by_ref`` and submodules such
      as ``from tangle_deploy.python_pipeline.x import y``;
    - ``import tangle_deploy.python_pipeline`` /
      ``import tangle_deploy.python_pipeline as tp``.

    Other ``tangle_deploy.*`` packages (e.g.
    ``from tangle_deploy.utils import X``) do not match: they can be real
    runtime helpers used inside a ``@task`` body and must stay in the baked
    program. Relative imports (``from . import x``) never match, and any
    statement that is not an import yields ``False``.
    """

    def _is_authoring_path(dotted: str) -> bool:
        # Exact module, or a dotted submodule below it.
        return dotted == _AUTHORING_IMPORT_MODULE or dotted.startswith(_AUTHORING_IMPORT_MODULE + ".")

    if isinstance(node, ast.Import):
        # One statement may import several modules; any authoring one counts.
        return any(_is_authoring_path(entry.name) for entry in node.names)
    if not isinstance(node, ast.ImportFrom):
        return False
    if node.level:  # relative import — not the authoring package
        return False
    return _is_authoring_path(node.module or "")
|
|
843
|
+
|
|
844
|
+
|
|
845
|
+
def _attr_root_name(node: ast.expr) -> str | None:
|
|
846
|
+
"""Return the root ``Name`` id of an attribute chain (``a.b.c`` -> ``a``).
|
|
847
|
+
|
|
848
|
+
Returns ``None`` for shapes that don't bottom out in a plain ``Name``
|
|
849
|
+
(e.g. ``foo().bar``), so callers leave them untouched.
|
|
850
|
+
"""
|
|
851
|
+
while isinstance(node, ast.Attribute):
|
|
852
|
+
node = node.value
|
|
853
|
+
return node.id if isinstance(node, ast.Name) else None
|
|
854
|
+
|
|
855
|
+
|
|
856
|
+
def _env_keyword_binding_name(call: ast.Call) -> str | None:
|
|
857
|
+
"""Return the module-level authoring name a ``@task(env=...)`` keyword needs.
|
|
858
|
+
|
|
859
|
+
Inspects the ``env=`` keyword of a (stripped) ``@task(...)`` decorator and
|
|
860
|
+
returns the name of the module-level binding that must also be stripped so
|
|
861
|
+
the baked runtime program does not crash referencing an authoring-only name:
|
|
862
|
+
|
|
863
|
+
- ``env=UPI`` -> ``"UPI"`` (a module-level env *binding* to strip, either an
|
|
864
|
+
``UPI = TaskEnv(...)`` assignment or a ``from _envs import UPI`` import);
|
|
865
|
+
- ``env=_envs.UPI`` -> ``"_envs"`` (the module-alias root, so the
|
|
866
|
+
``import _envs`` line can be stripped);
|
|
867
|
+
- ``env=TaskEnv(...)`` / ``env=tp.TaskEnv(...)`` (inline) -> ``None``: the
|
|
868
|
+
whole decorator line range is already deleted, so there is no residual
|
|
869
|
+
module-level binding to strip;
|
|
870
|
+
- anything else -> ``None`` (leave it untouched).
|
|
871
|
+
"""
|
|
872
|
+
for keyword in call.keywords:
|
|
873
|
+
if keyword.arg != "env":
|
|
874
|
+
continue
|
|
875
|
+
value = keyword.value
|
|
876
|
+
if isinstance(value, ast.Name):
|
|
877
|
+
return value.id
|
|
878
|
+
if isinstance(value, ast.Attribute):
|
|
879
|
+
return _attr_root_name(value)
|
|
880
|
+
# env=TaskEnv(...) / env=tp.TaskEnv(...) inline, or any other shape:
|
|
881
|
+
# the decorator range already covers it, no residual binding.
|
|
882
|
+
return None
|
|
883
|
+
return None
|
|
884
|
+
|
|
885
|
+
|
|
886
|
+
def _is_task_env_construction(value: ast.expr | None) -> bool:
|
|
887
|
+
"""True if *value* is a direct ``TaskEnv(...)`` / ``<alias>.TaskEnv(...)`` call.
|
|
888
|
+
|
|
889
|
+
Matched by trailing call name (mirroring ``_decorator_called_name``), so
|
|
890
|
+
both ``TaskEnv(image=...)`` and ``tp.TaskEnv(image=...)`` qualify. Used to
|
|
891
|
+
detect module-level env declarations like ``UPI = TaskEnv(...)`` regardless
|
|
892
|
+
of whether a ``@task(env=UPI)`` references them.
|
|
893
|
+
"""
|
|
894
|
+
return isinstance(value, ast.Call) and _decorator_called_name(value) == _AUTHORING_ENV_CLASS_NAME
|
|
895
|
+
|
|
896
|
+
|
|
897
|
+
def _import_bound_names(node: ast.Import | ast.ImportFrom) -> dict[str, ast.alias]:
|
|
898
|
+
"""Map each name a top-level import binds into the namespace to its alias.
|
|
899
|
+
|
|
900
|
+
- ``from m import UPI`` -> ``{"UPI": alias}``
|
|
901
|
+
- ``from m import UPI as U`` -> ``{"U": alias}``
|
|
902
|
+
- ``import _envs`` -> ``{"_envs": alias}`` (root of a dotted module path)
|
|
903
|
+
- ``import a.b.c`` -> ``{"a": alias}``
|
|
904
|
+
- ``import envs as task_envs`` -> ``{"task_envs": alias}``
|
|
905
|
+
"""
|
|
906
|
+
bound: dict[str, ast.alias] = {}
|
|
907
|
+
for alias in node.names:
|
|
908
|
+
if alias.asname:
|
|
909
|
+
bound[alias.asname] = alias
|
|
910
|
+
elif isinstance(node, ast.Import):
|
|
911
|
+
# ``import a.b.c`` binds only the top-level package ``a``.
|
|
912
|
+
bound[alias.name.split(".", 1)[0]] = alias
|
|
913
|
+
else:
|
|
914
|
+
bound[alias.name] = alias
|
|
915
|
+
return bound
|
|
916
|
+
|
|
917
|
+
|
|
918
|
+
def _annotation_name_node_ids(tree: ast.AST) -> set[int]:
    """Collect the ``id()`` of every ``ast.Name`` found inside an annotation slot.

    ``_strip_authoring_constructs`` uses the result to leave
    annotation-only names out of its fail-fast reference scan. By design
    annotations are removed from the baked output by ``_strip_type_hints``,
    so an env name that appears only as a type (``def f(x: UPI) -> UPI:``)
    is not a live runtime reference (FIX N1, §3.5).

    Annotation slots visited:

    - parameter annotations of ``FunctionDef`` / ``AsyncFunctionDef``:
      positional-only, regular and keyword-only parameters plus ``*args``
      and ``**kwargs``;
    - return annotations (``-> T``);
    - ``AnnAssign`` annotations (``x: T`` / ``x: T = ...``).

    The ids are only meaningful while *tree* is alive, because ``id()``
    values are unique only among live objects.
    """

    def _annotation_slots(node: ast.AST):
        # Yield the annotation expressions attached directly to *node*.
        if isinstance(node, ast.AnnAssign):
            yield node.annotation
            return
        if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            return
        signature = node.args
        params = [*signature.posonlyargs, *signature.args, *signature.kwonlyargs]
        params.extend(p for p in (signature.vararg, signature.kwarg) if p is not None)
        for param in params:
            if param.annotation is not None:
                yield param.annotation
        if node.returns is not None:
            yield node.returns

    return {
        id(inner)
        for outer in ast.walk(tree)
        for slot in _annotation_slots(outer)
        for inner in ast.walk(slot)
        if isinstance(inner, ast.Name)
    }
|
|
962
|
+
|
|
963
|
+
|
|
964
|
+
def _strip_authoring_constructs(source_code: str) -> str:
    """Strip python-pipeline authoring imports and decorators from baked source.

    The generated operation container re-executes ``module_source_stripped`` at
    startup and then calls the target function directly. Authoring constructs
    must NOT survive into that runtime program:

    - re-running an ``@task`` / ``@pipeline`` / ``@subpipeline`` decorator
      replaces the function with a ``CallableRef`` recorder, which raises at
      call time because there is no active ``@pipeline`` trace context;
    - on a thin image the ``from tangle_deploy.python_pipeline import ...``
      import itself can fail with ``ImportError``.

    This removes them via surgical AST line-range deletion (mirroring
    ``_strip_main_guard``), so comments/formatting in the rest of the source
    survive — we deliberately avoid a full ``ast.unparse`` round-trip.

    Contract this relies on: authoring-surface names (``task``, ``pipeline``,
    ``subpipeline``, ``In``, ``Out``, ``Outputs``, ``ref``, ...) appear ONLY in
    decorators and type annotations — both stripped before the source is baked —
    never in a runtime function body. Dropping the whole authoring import line is
    therefore safe.

    Scope of the strip (intentional v1 boundaries):

    - imports: only ``tangle_deploy.python_pipeline`` (and submodules) are
      dropped — see ``_is_authoring_import``. Other ``tangle_deploy.*`` runtime
      helpers are preserved.
    - decorators: matched by trailing NAME against
      ``_AUTHORING_DECORATOR_NAMES`` (``task`` / ``pipeline`` /
      ``subpipeline`` / ``registered``), not by import resolution — see
      ``_decorator_called_name`` for the limitation. Unrelated decorators
      (``@functools.cache``, ``@property``, ...) are preserved.

    TaskEnv authoring-strip hardening (``@task(env=...)``): an env
    declaration that exists ONLY to feed a stripped ``@task(env=...)`` decorator
    would otherwise crash the baked program (``NameError: TaskEnv`` for a
    co-located ``UPI = TaskEnv(...)`` whose import was stripped, or
    ``ImportError`` for a ``from _envs import UPI`` whose module is not in the
    runtime image). On top of the import/decorator strip this also removes, by
    line range:

    - every module-level ``X = TaskEnv(...)`` / ``X: TaskEnv = TaskEnv(...)``
      declaration (direct ``TaskEnv(...)`` construction), and
    - module-level bindings (assignment OR import) of any name a stripped
      ``@task(env=...)`` referenced — ``env=UPI`` collects ``UPI``
      (``UPI = TaskEnv(...)`` / ``UPI = make_task_env(...)`` /
      ``from _envs import UPI``); ``env=_envs.UPI`` collects the module alias
      ``_envs`` (``import _envs``).

    It is deliberately narrow: only names PROVEN to participate in a stripped
    ``@task(env=...)`` decorator or a direct module-level ``TaskEnv(...)`` call
    are removed. It is NOT a general unused-import cleaner.

    This intentionally operates on ``module_source_stripped`` ONLY. It must never
    touch the verbatim ``python_original_code`` annotation, which is read
    directly from the source file elsewhere and kept byte-verbatim.

    Args:
        source_code: Python module source text.

    Returns:
        The source with authoring constructs removed. The input is returned
        unchanged when it does not parse or nothing needs stripping.

    Raises:
        AuthoringStripError: when an env binding is entangled with runtime
            code — an env name imported inside a nested block, a mixed
            ``from _envs import UPI, helper`` whose ``helper`` is used at
            runtime, or a collected env name still referenced by kept code.
    """
    import io  # local import: only needed for tokenizer-compatible line splitting

    try:
        tree = ast.parse(source_code)
    except SyntaxError:
        return source_code

    removed: set[int] = set()  # 1-indexed line numbers to drop
    # Names introduced ONLY to feed a stripped ``@task(env=...)`` decorator.
    # Collected from ``env=`` keywords; used below to strip the matching
    # module-level assignment/import binding.
    collected_env_names: set[str] = set()

    for node in ast.walk(tree):
        # Authoring imports — delete the whole (possibly multi-line) statement.
        if isinstance(node, (ast.Import, ast.ImportFrom)) and _is_authoring_import(node):
            start = node.lineno
            end = node.end_lineno or node.lineno
            removed.update(range(start, end + 1))
            continue

        # Authoring decorators on functions/classes.
        # The "@" shares the decorator expression's first line, so removing the
        # node's full line range removes the "@" too. Real-world decorators span
        # multiple lines, hence lineno..end_lineno rather than a prefix match.
        decorator_list = getattr(node, "decorator_list", None)
        if not decorator_list:
            continue
        for decorator in decorator_list:
            if _decorator_called_name(decorator) in _AUTHORING_DECORATOR_NAMES:
                start = decorator.lineno
                end = decorator.end_lineno or decorator.lineno
                removed.update(range(start, end + 1))
                # Record the env-only authoring name this @task(env=...) needs
                # stripped from module scope (None for inline TaskEnv(...)).
                if isinstance(decorator, ast.Call):
                    env_name = _env_keyword_binding_name(decorator)
                    if env_name is not None:
                        collected_env_names.add(env_name)

    # --- Fail-fast: nested/conditional env imports cannot be stripped (N1/N2) -
    #
    # Module-level removal below only touches ``tree.body``. An env import
    # nested inside an ``if`` / ``try`` / function body (i.e. NOT a direct child
    # of ``tree.body``) is therefore NOT stripped and would LEAK into the baked
    # program -> ``ImportError`` on a thin runtime image (or re-binding an
    # authoring-only name) at container start. We also must NOT line-delete a
    # nested import: removing the only statement in a block leaves an empty
    # suite -> ``IndentationError``. Converting the silent leak into a loud,
    # actionable error is the correct, safe behavior (FIX N2, §3.5).
    if collected_env_names:
        top_level_stmt_ids = {id(stmt) for stmt in tree.body}
        for node in ast.walk(tree):
            if not isinstance(node, (ast.Import, ast.ImportFrom)):
                continue
            if id(node) in top_level_stmt_ids:
                continue  # module-level imports are handled by the strip below
            nested_env = sorted(collected_env_names & _import_bound_names(node).keys())
            if nested_env:
                names_repr = ", ".join(repr(n) for n in nested_env)
                raise AuthoringStripError(
                    f"env name {names_repr} is imported inside a nested block "
                    "(if/try/function); TaskEnv env imports must be module-level "
                    "/ authoring-only. A nested env import is not stripped and "
                    "would leak into the baked runtime program (ImportError at "
                    "container start). Move it to a top-level import so it can be "
                    "stripped, and keep TaskEnv values authoring-only."
                )

    # --- TaskEnv env-only declarations / imports (§3.5) ---------------------
    #
    # Restricted to module-level statements (``tree.body``) so nested code is
    # never touched. Two kinds of statement are stripped:
    #   1. assignments that construct a TaskEnv directly (``X = TaskEnv(...)``)
    #      or whose target is a collected env name (``UPI = make_task_env(...)``
    #      when ``@task(env=UPI)`` was seen), and
    #   2. imports that bind a collected env name/module (``from _envs import
    #      UPI`` / ``import _envs``) when that name is env-only.
    #
    # We record each candidate's bound name(s) + line range, then verify (after
    # a reference scan) that removing it cannot break kept runtime code.
    env_assign_bindings: list[tuple[set[str], int, int]] = []  # (names, start, end)
    env_import_candidates: list[tuple[ast.Import | ast.ImportFrom, int, int]] = []
    for stmt in tree.body:
        if isinstance(stmt, ast.Assign):
            simple_targets = {t.id for t in stmt.targets if isinstance(t, ast.Name)}
            if _is_task_env_construction(stmt.value) or (simple_targets & collected_env_names):
                env_assign_bindings.append((simple_targets, stmt.lineno, stmt.end_lineno or stmt.lineno))
        elif isinstance(stmt, ast.AnnAssign) and isinstance(stmt.target, ast.Name):
            tname = stmt.target.id
            if _is_task_env_construction(stmt.value) or tname in collected_env_names:
                env_assign_bindings.append(({tname}, stmt.lineno, stmt.end_lineno or stmt.lineno))
        elif isinstance(stmt, (ast.Import, ast.ImportFrom)):
            if _is_authoring_import(stmt):
                continue  # already removed above
            bound = _import_bound_names(stmt)
            if collected_env_names & bound.keys():
                env_import_candidates.append((stmt, stmt.lineno, stmt.end_lineno or stmt.lineno))

    # Provisionally drop every env declaration/import candidate. Their own line
    # ranges hold no runtime ``Load`` of the bound name (assignment targets are
    # ``Store``; import bindings are aliases), so including them now does not
    # mask a real runtime reference detected below.
    for _names, start, end in env_assign_bindings:
        removed.update(range(start, end + 1))
    for _stmt, start, end in env_import_candidates:
        removed.update(range(start, end + 1))

    # Reference scan: every ``Name`` used in a ``Load`` context, mapped to the
    # 1-indexed lines it appears on. Attribute roots (``_envs`` in
    # ``_envs.UPI``) are plain ``Name`` Load nodes too, so this covers them.
    #
    # FIX N1 (§3.5): exclude ``Name`` nodes that live in a type-annotation slot
    # (param/return/AnnAssign). Annotations are stripped from the baked output by
    # ``_strip_type_hints`` (which runs later), so an env name used ONLY as a
    # type annotation (``def f(x: UPI) -> UPI:``) is NOT a live runtime
    # reference and must not trip the body-ref fail-fast. A real body reference
    # (outside annotations) still records a Load and still fails fast.
    if env_assign_bindings or env_import_candidates:
        annotation_name_ids = _annotation_name_node_ids(tree)
        load_lines: dict[str, set[int]] = {}
        for node in ast.walk(tree):
            if isinstance(node, ast.Name) and isinstance(node.ctx, ast.Load) and id(node) not in annotation_name_ids:
                load_lines.setdefault(node.id, set()).add(node.lineno)

        def _referenced_in_kept(name: str) -> bool:
            # ``name`` is used by runtime code iff it has a ``Load`` on a line
            # that survives the strip (i.e. not in ``removed``).
            return any(line not in removed for line in load_lines.get(name, ()))

        # Fail fast: a stripped env declaration whose target the kept body still
        # references would leave a dangling ``NameError`` — env names are
        # authoring-only by contract. Names are visited in sorted order so the
        # reported name is deterministic for multi-target assignments.
        for names, _start, _end in env_assign_bindings:
            for name in sorted(names):
                if _referenced_in_kept(name):
                    raise AuthoringStripError(
                        f"TaskEnv authoring name {name!r} is referenced by the "
                        "baked runtime code, but its declaration is stripped "
                        "because it is a @task(env=...) environment. TaskEnv "
                        "values are authoring-only: do not reference them from "
                        "a task body or other runtime code. Move the runtime "
                        "use out, or keep the value as a plain runtime object "
                        "that is not used as @task(env=...)."
                    )

        for stmt, _start, _end in env_import_candidates:
            bound = _import_bound_names(stmt)
            env_bound = collected_env_names & bound.keys()
            other_bound = bound.keys() - env_bound
            # (a) Mixed import: an env-only name shares the statement with a
            # runtime name that is actually used. We cannot line-delete just
            # part of the statement, so fail fast with split guidance.
            used_others = sorted(n for n in other_bound if _referenced_in_kept(n))
            if used_others:
                raise AuthoringStripError(
                    "Import " + ", ".join(sorted(env_bound)) + " is a @task(env=...) environment but shares an import "
                    "statement with runtime name(s) "
                    + ", ".join(used_others)
                    + ". Split the import so TaskEnv env names are imported on "
                    "their own line (e.g. `from _envs import UPI` separate from "
                    "`from _envs import helper`); env imports are authoring-only "
                    "and stripped from the baked runtime program."
                )
            # (b) The env name itself is still referenced by kept runtime code.
            for name in sorted(env_bound):
                if _referenced_in_kept(name):
                    raise AuthoringStripError(
                        f"TaskEnv authoring name {name!r} is imported and "
                        "referenced by the baked runtime code, but its import is "
                        "stripped because it is a @task(env=...) environment. "
                        "TaskEnv values are authoring-only: do not reference "
                        "them from a task body or other runtime code."
                    )

    if not removed:
        return source_code

    # AST line numbers count only "\n", "\r\n" and "\r" as line ends.
    # ``str.splitlines`` additionally breaks on form feeds, "\x1c"-"\x1e",
    # "\x85", "\u2028" and "\u2029", which would shift every later index and
    # delete the wrong lines. ``StringIO(newline="")`` splits on exactly the
    # three newline forms and keeps the endings untranslated.
    lines = list(io.StringIO(source_code, newline=""))
    kept = [line for i, line in enumerate(lines, 1) if i not in removed]
    return "".join(kept)
|
|
1204
|
+
|
|
1205
|
+
|
|
1206
|
+
# ============================================================================
|
|
1207
|
+
# Type hint stripping (replicating SDK strip_type_hints)
|
|
1208
|
+
# ============================================================================
|
|
1209
|
+
|
|
1210
|
+
|
|
1211
|
+
def _strip_type_hints(source_code: str) -> str:
    """Strip function type annotations, falling back to the input on failure.

    Delegates to ``_strip_type_hints_ast``. This is best-effort: if that
    helper raises any ``Exception``, a warning is issued and *source_code*
    is returned unchanged.
    """
    try:
        stripped = _strip_type_hints_ast(source_code)
    except Exception as exc:
        warnings.warn(f"Failed to strip type hints (using source as-is): {exc}")
        return source_code
    return stripped
|
|
1218
|
+
|
|
1219
|
+
|
|
1220
|
+
def _byte_col_to_char_col(line: str, byte_col: int) -> int:
    """Translate a UTF-8 byte offset within *line* into a character index.

    AST ``col_offset`` / ``end_col_offset`` values are UTF-8 byte offsets,
    not string indices. The two agree on ASCII-only lines but diverge as
    soon as a multi-byte character (e.g. the "é" in "café") precedes the
    offset. An offset that falls inside a multi-byte character decodes the
    partial sequence with ``errors="replace"``.
    """
    prefix = line.encode("utf-8")[:byte_col]
    return len(prefix.decode("utf-8", errors="replace"))
|
|
1228
|
+
|
|
1229
|
+
|
|
1230
|
+
def _strip_type_hints_ast(source_code: str) -> str:
|
|
1231
|
+
"""Strip type annotations from function definitions using the ast module.
|
|
1232
|
+
|
|
1233
|
+
Removes parameter annotations (`: type`) and return annotations (`-> type`)
|
|
1234
|
+
from all function definitions. Uses AST to locate annotations, then performs
|
|
1235
|
+
surgical string removal to preserve original formatting.
|
|
1236
|
+
"""
|
|
1237
|
+
tree = ast.parse(source_code)
|
|
1238
|
+
lines = source_code.splitlines(keepends=True)
|
|
1239
|
+
|
|
1240
|
+
# Collect (line, col_start, col_end) ranges to remove, in source order.
|
|
1241
|
+
# We'll process them in reverse order so removals don't shift earlier offsets.
|
|
1242
|
+
# All columns here are character indices (converted from AST byte offsets).
|
|
1243
|
+
removals: list[tuple[int, int, int, int]] = [] # (start_line, start_col, end_line, end_col)
|
|
1244
|
+
|
|
1245
|
+
for node in ast.walk(tree):
|
|
1246
|
+
if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
|
|
1247
|
+
continue
|
|
1248
|
+
|
|
1249
|
+
# --- Return annotation: remove " -> <type>" before the colon ---
|
|
1250
|
+
if node.returns is not None:
|
|
1251
|
+
ret = node.returns
|
|
1252
|
+
ret_start_line = ret.lineno # 1-indexed
|
|
1253
|
+
ret_line_text = lines[ret_start_line - 1]
|
|
1254
|
+
ret_start_col = _byte_col_to_char_col(ret_line_text, ret.col_offset)
|
|
1255
|
+
ret_end_line = ret.end_lineno or ret_start_line
|
|
1256
|
+
ret_end_line_text = lines[ret_end_line - 1]
|
|
1257
|
+
ret_end_col = _byte_col_to_char_col(ret_end_line_text, ret.end_col_offset or (ret.col_offset + 1))
|
|
1258
|
+
|
|
1259
|
+
# Find the "->" token by scanning backwards from the annotation start.
|
|
1260
|
+
# The arrow may be on the same line as the type, or on a preceding line
|
|
1261
|
+
# (e.g. `def f()\n -> str:`), so we search backwards through lines.
|
|
1262
|
+
# Bound the search to the def line to avoid matching a previous function.
|
|
1263
|
+
min_line_idx = node.lineno - 1 # 0-indexed; the "def" line
|
|
1264
|
+
arrow_line_idx = ret_start_line - 1 # 0-indexed
|
|
1265
|
+
arrow_pos = -1
|
|
1266
|
+
while arrow_line_idx >= min_line_idx:
|
|
1267
|
+
search_region = lines[arrow_line_idx]
|
|
1268
|
+
if arrow_line_idx == ret_start_line - 1:
|
|
1269
|
+
search_region = search_region[:ret_start_col]
|
|
1270
|
+
arrow_pos = search_region.rfind("->")
|
|
1271
|
+
if arrow_pos != -1:
|
|
1272
|
+
break
|
|
1273
|
+
arrow_line_idx -= 1
|
|
1274
|
+
|
|
1275
|
+
if arrow_pos != -1:
|
|
1276
|
+
# Strip any whitespace before the arrow too
|
|
1277
|
+
strip_start = arrow_pos
|
|
1278
|
+
line_text = lines[arrow_line_idx]
|
|
1279
|
+
while strip_start > 0 and line_text[strip_start - 1] == " ":
|
|
1280
|
+
strip_start -= 1
|
|
1281
|
+
removals.append((arrow_line_idx + 1, strip_start, ret_end_line, ret_end_col))
|
|
1282
|
+
|
|
1283
|
+
# --- Parameter annotations: remove ": <type>" from each arg ---
|
|
1284
|
+
for arg in node.args.args + node.args.posonlyargs + node.args.kwonlyargs:
|
|
1285
|
+
if arg.annotation is None:
|
|
1286
|
+
continue
|
|
1287
|
+
ann = arg.annotation
|
|
1288
|
+
# The annotation text starts after "param_name" with ": "
|
|
1289
|
+
# arg node: name at (arg.lineno, arg.col_offset), length = len(arg.arg)
|
|
1290
|
+
arg_line_text = lines[arg.lineno - 1]
|
|
1291
|
+
name_end_col = _byte_col_to_char_col(arg_line_text, arg.col_offset) + len(arg.arg)
|
|
1292
|
+
ann_end_line = ann.end_lineno or ann.lineno
|
|
1293
|
+
ann_end_line_text = lines[ann_end_line - 1]
|
|
1294
|
+
ann_end_col = _byte_col_to_char_col(ann_end_line_text, ann.end_col_offset or (ann.col_offset + 1))
|
|
1295
|
+
removals.append((arg.lineno, name_end_col, ann_end_line, ann_end_col))
|
|
1296
|
+
|
|
1297
|
+
# vararg (*args) and kwarg (**kwargs)
|
|
1298
|
+
for maybe_arg in (node.args.vararg, node.args.kwarg):
|
|
1299
|
+
if maybe_arg is not None and maybe_arg.annotation is not None:
|
|
1300
|
+
ann = maybe_arg.annotation
|
|
1301
|
+
arg_line_text = lines[maybe_arg.lineno - 1]
|
|
1302
|
+
name_end_col = _byte_col_to_char_col(arg_line_text, maybe_arg.col_offset) + len(maybe_arg.arg)
|
|
1303
|
+
ann_end_line = ann.end_lineno or ann.lineno
|
|
1304
|
+
ann_end_line_text = lines[ann_end_line - 1]
|
|
1305
|
+
ann_end_col = _byte_col_to_char_col(ann_end_line_text, ann.end_col_offset or (ann.col_offset + 1))
|
|
1306
|
+
removals.append((maybe_arg.lineno, name_end_col, ann_end_line, ann_end_col))
|
|
1307
|
+
|
|
1308
|
+
if not removals:
|
|
1309
|
+
return source_code
|
|
1310
|
+
|
|
1311
|
+
# Sort removals in reverse order so later removals don't affect earlier offsets
|
|
1312
|
+
removals.sort(key=lambda r: (r[0], r[1]), reverse=True)
|
|
1313
|
+
|
|
1314
|
+
for start_line, start_col, end_line, end_col in removals:
|
|
1315
|
+
if start_line == end_line:
|
|
1316
|
+
# Single-line removal
|
|
1317
|
+
line_idx = start_line - 1
|
|
1318
|
+
line = lines[line_idx]
|
|
1319
|
+
lines[line_idx] = line[:start_col] + line[end_col:]
|
|
1320
|
+
else:
|
|
1321
|
+
# Multi-line removal (rare but possible for complex annotations)
|
|
1322
|
+
first_idx = start_line - 1
|
|
1323
|
+
last_idx = end_line - 1
|
|
1324
|
+
lines[first_idx] = lines[first_idx][:start_col] + lines[last_idx][end_col:]
|
|
1325
|
+
del lines[first_idx + 1 : last_idx + 1]
|
|
1326
|
+
|
|
1327
|
+
return "".join(lines)
|
|
1328
|
+
|
|
1329
|
+
|
|
1330
|
+
# ============================================================================
|
|
1331
|
+
# Dependencies reading
|
|
1332
|
+
# ============================================================================
|
|
1333
|
+
|
|
1334
|
+
|
|
1335
|
+
def read_dependencies(toml_path: Path) -> list[str]:
|
|
1336
|
+
"""Read pip dependencies from a pyproject.toml or component TOML file."""
|
|
1337
|
+
with open(toml_path, "rb") as f:
|
|
1338
|
+
data = tomllib.load(f)
|
|
1339
|
+
# Standard pyproject.toml format
|
|
1340
|
+
deps = data.get("project", {}).get("dependencies", [])
|
|
1341
|
+
if deps:
|
|
1342
|
+
return list(deps)
|
|
1343
|
+
return []
|
|
1344
|
+
|
|
1345
|
+
|
|
1346
|
+
# ============================================================================
|
|
1347
|
+
# Code generation
|
|
1348
|
+
# ============================================================================
|
|
1349
|
+
|
|
1350
|
+
|
|
1351
|
+
def _build_argparse_code(spec: FunctionSpec) -> str:
|
|
1352
|
+
"""Generate argparse wrapper code for the component function.
|
|
1353
|
+
|
|
1354
|
+
Type-specific definitions (e.g. _deserialize_bool, import json) are placed
|
|
1355
|
+
right before 'import argparse', matching the Cloud-Pipelines SDK layout.
|
|
1356
|
+
"""
|
|
1357
|
+
# Collect definitions needed by parameter types (deduplicated by content)
|
|
1358
|
+
definitions: dict[str, str] = {}
|
|
1359
|
+
for param in spec.inputs + spec.outputs:
|
|
1360
|
+
if param.tangle_type and param.tangle_type in _TYPE_DEFINITIONS:
|
|
1361
|
+
defn = _TYPE_DEFINITIONS[param.tangle_type]
|
|
1362
|
+
definitions[defn] = defn # dedup by content
|
|
1363
|
+
|
|
1364
|
+
# If there are return outputs, we need serializer helpers and json import
|
|
1365
|
+
has_return_outputs = len(spec.return_params) > 0
|
|
1366
|
+
if has_return_outputs:
|
|
1367
|
+
# Check if any return output needs json.dumps
|
|
1368
|
+
needs_json = any(
|
|
1369
|
+
_TYPE_TO_SERIALIZER.get(p.tangle_type or "String", "") == "json.dumps" for p in spec.return_params
|
|
1370
|
+
)
|
|
1371
|
+
if needs_json:
|
|
1372
|
+
definitions["import json"] = "import json"
|
|
1373
|
+
|
|
1374
|
+
lines = sorted(definitions.values()) + [
|
|
1375
|
+
"import argparse",
|
|
1376
|
+
f"_parser = argparse.ArgumentParser(prog={repr(spec.component_name)}, "
|
|
1377
|
+
f"description={repr(spec.description or '')})",
|
|
1378
|
+
]
|
|
1379
|
+
|
|
1380
|
+
# Add arguments for all inputs and file-based outputs (OutputPath params)
|
|
1381
|
+
all_params = spec.inputs + spec.outputs
|
|
1382
|
+
for param in all_params:
|
|
1383
|
+
flag = "--" + param.yaml_name.replace("_", "-")
|
|
1384
|
+
is_required = param.kind == "output" or not param.optional
|
|
1385
|
+
line = (
|
|
1386
|
+
f'_parser.add_argument("{flag}", dest="{param.name}", '
|
|
1387
|
+
f"type={param.deserializer}, required={is_required}, "
|
|
1388
|
+
f"default=argparse.SUPPRESS)"
|
|
1389
|
+
)
|
|
1390
|
+
lines.append(line)
|
|
1391
|
+
|
|
1392
|
+
# Add ----output-paths argument for NamedTuple return outputs
|
|
1393
|
+
if has_return_outputs:
|
|
1394
|
+
n = len(spec.return_params)
|
|
1395
|
+
lines.append(f'_parser.add_argument("----output-paths", dest="_output_paths", ' f"type=str, nargs={n})")
|
|
1396
|
+
|
|
1397
|
+
lines.append("_parsed_args = vars(_parser.parse_args())")
|
|
1398
|
+
|
|
1399
|
+
if has_return_outputs:
|
|
1400
|
+
lines.append('_output_files = _parsed_args.pop("_output_paths", [])')
|
|
1401
|
+
|
|
1402
|
+
lines.append("")
|
|
1403
|
+
lines.append(f"_outputs = {spec.name}(**_parsed_args)")
|
|
1404
|
+
|
|
1405
|
+
# Single return value (not NamedTuple) must be wrapped in a list
|
|
1406
|
+
# to be zipped with the serializers and output paths
|
|
1407
|
+
if has_return_outputs and spec.single_return_output:
|
|
1408
|
+
lines.append("_outputs = [_outputs]")
|
|
1409
|
+
|
|
1410
|
+
# Add output serialization for return outputs
|
|
1411
|
+
if has_return_outputs:
|
|
1412
|
+
lines.append("")
|
|
1413
|
+
serializers = []
|
|
1414
|
+
for rp in spec.return_params:
|
|
1415
|
+
serializer = _TYPE_TO_SERIALIZER.get(rp.tangle_type or "String", "_serialize_str")
|
|
1416
|
+
serializers.append(f" {serializer},")
|
|
1417
|
+
lines.append("_output_serializers = [")
|
|
1418
|
+
lines.extend(serializers)
|
|
1419
|
+
lines.append("]")
|
|
1420
|
+
lines.append("")
|
|
1421
|
+
lines.append("import os")
|
|
1422
|
+
lines.append("for idx, output_file in enumerate(_output_files):")
|
|
1423
|
+
lines.append(" try:")
|
|
1424
|
+
lines.append(" os.makedirs(os.path.dirname(output_file))")
|
|
1425
|
+
lines.append(" except OSError:")
|
|
1426
|
+
lines.append(" pass")
|
|
1427
|
+
lines.append(" with open(output_file, 'w') as f:")
|
|
1428
|
+
lines.append(" f.write(_output_serializers[idx](_outputs[idx]))")
|
|
1429
|
+
|
|
1430
|
+
return "\n".join(lines)
|
|
1431
|
+
|
|
1432
|
+
|
|
1433
|
+
def _build_args_section(spec: FunctionSpec) -> list[Any]:
|
|
1434
|
+
"""Build the YAML args section with input/output placeholders."""
|
|
1435
|
+
args: list[Any] = []
|
|
1436
|
+
|
|
1437
|
+
all_params = spec.inputs + spec.outputs
|
|
1438
|
+
for param in all_params:
|
|
1439
|
+
flag = "--" + param.yaml_name.replace("_", "-")
|
|
1440
|
+
|
|
1441
|
+
# Determine the placeholder type
|
|
1442
|
+
if param.kind == "output":
|
|
1443
|
+
placeholder = {"outputPath": param.yaml_name}
|
|
1444
|
+
elif param.kind == "input_path":
|
|
1445
|
+
placeholder = {"inputPath": param.yaml_name}
|
|
1446
|
+
else:
|
|
1447
|
+
placeholder = {"inputValue": param.yaml_name}
|
|
1448
|
+
|
|
1449
|
+
if param.optional:
|
|
1450
|
+
# Wrap in if/cond/isPresent/then for optional params
|
|
1451
|
+
args.append(
|
|
1452
|
+
{
|
|
1453
|
+
"if": {
|
|
1454
|
+
"cond": {"isPresent": param.yaml_name},
|
|
1455
|
+
"then": [flag, placeholder],
|
|
1456
|
+
}
|
|
1457
|
+
}
|
|
1458
|
+
)
|
|
1459
|
+
else:
|
|
1460
|
+
args.append(flag)
|
|
1461
|
+
args.append(placeholder)
|
|
1462
|
+
|
|
1463
|
+
# Add ----output-paths entries for NamedTuple return outputs
|
|
1464
|
+
if spec.return_params:
|
|
1465
|
+
args.append("----output-paths")
|
|
1466
|
+
for rp in spec.return_params:
|
|
1467
|
+
args.append({"outputPath": rp.yaml_name})
|
|
1468
|
+
|
|
1469
|
+
return args
|
|
1470
|
+
|
|
1471
|
+
|
|
1472
|
+
def _build_pip_install_command(deps: list[str]) -> list[str]:
|
|
1473
|
+
"""Build the pip install command prefix for the container."""
|
|
1474
|
+
if not deps:
|
|
1475
|
+
return []
|
|
1476
|
+
quoted = " ".join(repr(str(d)) for d in deps)
|
|
1477
|
+
install_cmd = (
|
|
1478
|
+
f"PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install " f"--quiet --no-warn-script-location {quoted}"
|
|
1479
|
+
)
|
|
1480
|
+
return [
|
|
1481
|
+
"sh",
|
|
1482
|
+
"-c",
|
|
1483
|
+
f'({install_cmd} || {install_cmd} --user) && "$0" "$@"',
|
|
1484
|
+
]
|
|
1485
|
+
|
|
1486
|
+
|
|
1487
|
+
def _build_python_source(
    spec: FunctionSpec,
    mode: Literal["inline", "bundle"],
    bundled_modules_b64: str | None = None,
) -> str:
    """Build the full Python source code to embed in the YAML.

    For inline mode: helper functions + stripped source + argparse wrapper.
    For bundle mode: helper functions + sys.modules injection + stripped source + argparse wrapper.

    Args:
        spec: Function specification supplying the stripped sources and the
            parameter lists.
        mode: Generation mode.
        bundled_modules_b64: Encoded local modules; only used when ``mode`` is
            ``"bundle"`` and the value is non-empty.

    Returns:
        The assembled program, with runs of blank lines collapsed to a single
        blank line and exactly one trailing newline.
    """
    sections: list[str] = []

    # _make_parent_dirs_and_return_path helper, only when an OutputPath exists
    if any(p.kind == "output" for p in spec.params):
        sections.append(_MAKE_PARENT_DIRS_HELPER)

    # _serialize_str helper, only when some return output falls back to it
    if spec.return_params and any(
        _TYPE_TO_SERIALIZER.get(p.tangle_type or "String", "_serialize_str") == "_serialize_str"
        for p in spec.return_params
    ):
        sections.append(_SERIALIZE_STR_HELPER)

    # Bundle mode: sys.modules injection from compressed embedded source text
    if mode == "bundle" and bundled_modules_b64:
        sections.append(ModuleBundler.build_injection(bundled_modules_b64))

    # The (type-hint-stripped) source. Prefer the full module source when
    # available: it preserves helper functions defined outside the target
    # function, module-level imports, and constants.
    sections.append(spec.module_source_stripped or spec.source_code_stripped)

    # argparse wrapper goes last
    sections.append(_build_argparse_code(spec))

    joined = "\n\n".join(sections)
    # Clean up consecutive blank lines
    collapsed = re.sub(r"\n\n\n+", "\n\n", joined)
    return collapsed.strip("\n") + "\n"
|
|
1532
|
+
|
|
1533
|
+
|
|
1534
|
+
def _serialize_default(value: Any, tangle_type: str | None) -> str | None:
|
|
1535
|
+
"""Serialize a default value to a string for YAML."""
|
|
1536
|
+
if value is inspect.Parameter.empty or value is None:
|
|
1537
|
+
return None
|
|
1538
|
+
if isinstance(value, str):
|
|
1539
|
+
return value
|
|
1540
|
+
if isinstance(value, bool):
|
|
1541
|
+
return str(value)
|
|
1542
|
+
if isinstance(value, (int, float)):
|
|
1543
|
+
return str(value)
|
|
1544
|
+
if isinstance(value, (list, dict)):
|
|
1545
|
+
return json.dumps(value, sort_keys=True)
|
|
1546
|
+
return str(value)
|
|
1547
|
+
|
|
1548
|
+
|
|
1549
|
+
# ============================================================================
|
|
1550
|
+
# Component YAML building
|
|
1551
|
+
# ============================================================================
|
|
1552
|
+
|
|
1553
|
+
|
|
1554
|
+
def build_component_dict(
    spec: FunctionSpec,
    container_image: str,
    dependencies: list[str],
    annotations: dict[str, str],
    mode: Literal["inline", "bundle"] = "inline",
    bundled_modules_b64: str | None = None,
) -> dict[str, Any]:
    """Build the complete component YAML dict.

    Args:
        spec: Extracted function specification
        container_image: Docker image for the container
        dependencies: List of pip dependencies
        annotations: Metadata annotations dict
        mode: Generation mode
        bundled_modules_b64: Encoded local modules (bundle mode only)

    Returns:
        Dict representing the full component YAML structure, with keys in the
        order name, description, metadata, inputs, outputs, implementation
        (metadata / inputs / outputs are omitted when empty).
    """
    # Inputs: name and type always; description, default, optional on demand.
    inputs = []
    for param in spec.inputs:
        entry: dict[str, Any] = {"name": param.yaml_name, "type": param.tangle_type}
        if param.description:
            entry["description"] = param.description
        if param.default is not inspect.Parameter.empty and param.default is not None:
            default_text = _serialize_default(param.default, param.tangle_type)
            if default_text is not None:
                entry["default"] = default_text
        if param.optional:
            entry["optional"] = True
        inputs.append(entry)

    # Outputs (OutputPath params + return fields)
    outputs = []
    for param in spec.all_outputs:
        entry = {"name": param.yaml_name, "type": param.tangle_type}
        if param.description:
            entry["description"] = param.description
        outputs.append(entry)

    # Implementation pieces
    pip_install = _build_pip_install_command(list(dependencies))
    python_source = _build_python_source(spec, mode, bundled_modules_b64)
    args = _build_args_section(spec)

    # The program text arrives as "$0"; it is written to a temp file and run
    # with the remaining arguments.
    shell_bootstrap = textwrap.dedent("""\
        program_path=$(mktemp)
        printf "%s" "$0" > "$program_path"
        python3 -u "$program_path" "$@"
    """)

    command = [*pip_install, "sh", "-ec", shell_bootstrap, python_source]

    # Tangle's schema rejects ``description: null``, so fall back to a generic
    # placeholder when the function has no docstring. Users can override by
    # adding a docstring to the function (its first paragraph becomes the
    # description — see ``extract_function_spec``).
    component: dict[str, Any] = {
        "name": spec.component_name,
        "description": spec.description or f"{spec.component_name} component",
    }

    if annotations:
        component["metadata"] = {"annotations": annotations}
    if inputs:
        component["inputs"] = inputs
    if outputs:
        component["outputs"] = outputs

    container = {"image": container_image, "command": command, "args": args}
    component["implementation"] = {"container": container}

    return component
|
|
1646
|
+
|
|
1647
|
+
|
|
1648
|
+
# ============================================================================
|
|
1649
|
+
# Top-level generation function
|
|
1650
|
+
# ============================================================================
|
|
1651
|
+
|
|
1652
|
+
|
|
1653
|
+
def generate_component_yaml(
    file_path: Path,
    output_path: Path,
    container_image: str,
    function_name: str | None = None,
    dependencies_from: Path | None = None,
    mode: Literal["inline", "bundle"] = "inline",
    custom_name: str | None = None,
    custom_annotations: dict[str, str] | None = None,
    strip_code: bool = False,
    strip_source_path: bool = False,
    resolve_root: Path | None = None,
    emit_generation_annotations: bool = True,
    path_annotation_mode: Literal["oss", "td_legacy"] = "oss",
) -> bool:
    """Generate a component YAML file from a Python function.

    Args:
        file_path: Path to the Python source file
        output_path: Where to write the generated YAML
        container_image: Docker image reference
        function_name: Function to extract (auto-detected if None)
        dependencies_from: Path to pyproject.toml with pip dependencies
        mode: "inline" for single-file, "bundle" for multi-file
        custom_name: Override the component name
        custom_annotations: Additional annotations to merge
        strip_code: Omit python_original_code annotation
        strip_source_path: Omit python_original_code_path annotation
        resolve_root: Root directory for resolving local module imports in bundle
            mode. Defaults to ``file_path.parent``. Set this when local modules
            live in sibling directories (e.g. ``src/utils`` alongside ``src/components``).
        emit_generation_annotations: Persist tangle-cli regeneration context
            annotations. Disable for downstream legacy snapshot compatibility.
        path_annotation_mode: ``"oss"`` always records source/YAML paths relative
            to their common ancestor. ``"td_legacy"`` only uses that relative
            common-root behavior inside a git checkout; outside git it records
            ``file_path.name`` / ``output_path.name`` like legacy tangle-deploy.

    Returns:
        True on success, False on failure (the failure is reported through
        ``warnings.warn``).

    Raises:
        AuthoringStripError: Re-raised unchanged; it is the only failure that
            is not converted into a warning and a ``False`` return.
    """
    try:
        # Raised inside the try, so an invalid mode also ends as warn + False.
        if path_annotation_mode not in {"oss", "td_legacy"}:
            raise ValueError("path_annotation_mode must be 'oss' or 'td_legacy'")

        # 1. Extract metadata from source (AST-based, before module loading)
        file_metadata, resolved_func_name = extract_file_metadata(file_path, function_name)
        if not resolved_func_name:
            raise ValueError(f"No public functions found in {file_path}")

        # 2. Load module and get function
        # Only add resolve_root to sys.path in bundle mode — in inline mode the
        # sibling modules won't be embedded, so letting the import succeed would
        # produce YAML that fails at runtime in the container.
        extra_paths = [resolve_root] if resolve_root and mode == "bundle" else None
        module = load_python_module(file_path, extra_sys_path=extra_paths)
        func = get_function_from_module(module, resolved_func_name)

        # 3. Extract interface, passing pre-computed metadata
        spec = extract_interface(func, docstring_metadata=file_metadata)
        if custom_name:
            spec.component_name = custom_name

        # Populate full module source (preserves helper functions, imports, constants)
        # Remove cloud_pipelines import since it's only used for type annotations
        # NOTE(review): read_text() uses the platform default encoding here —
        # confirm that is intended for source files.
        module_source = file_path.read_text()
        lines = module_source.split("\n")
        # Line-based filter: only lines that themselves start with the import
        # statement are dropped.
        lines = [
            line for line in lines if not (line.strip().startswith(("from cloud_pipelines", "import cloud_pipelines")))
        ]
        filtered_source = "\n".join(lines)
        filtered_source = _strip_main_guard(filtered_source)
        # Strip python-pipeline authoring imports + @task/@pipeline/@subpipeline
        # decorators so the baked runtime program does not re-run the authoring
        # decorator (which would turn the function into a CallableRef and crash).
        # Operates on module_source_stripped only; python_original_code stays
        # byte-verbatim (it is read separately from module_code below).
        filtered_source = _strip_authoring_constructs(filtered_source)
        spec.module_source_stripped = _strip_type_hints(filtered_source)

        # 4. Read dependencies
        deps: list[str] = []
        if dependencies_from:
            deps = read_dependencies(dependencies_from)

        # 5. Build annotations
        directory = file_path.parent.resolve()
        module_code = file_path.read_text()

        annotations: dict[str, str] = {
            "cloud_pipelines.net": "true",
            "components new regenerate python-function-component": "true",
        }
        # Provisional basename; overwritten further down by _path_annotation().
        # Setting it here fixes the key's position in the dict.
        if not strip_source_path:
            annotations["python_original_code_path"] = file_path.name
        if not strip_code:
            annotations["python_original_code"] = module_code

        # Add all docstring metadata to annotations (version, updated_at, custom keys)
        # Skip "name" and "description" since they're used for top-level fields, not annotations
        for key, value in spec.docstring_metadata.items():
            if key not in ("name", "description"):
                annotations[key] = value

        if deps:
            annotations["python_dependencies"] = json.dumps(deps)

        if emit_generation_annotations:
            annotations["tangle_cli_generation_function_name"] = resolved_func_name
            annotations["tangle_cli_generation_mode"] = mode

        # Use the common ancestor of source and output so both paths are clean
        # forward references (no ".."). This lets later local maintenance
        # commands find the source even when YAML is generated into a separate
        # output directory. TD legacy compatibility keeps basename-only paths
        # outside a git checkout to preserve historical snapshots.
        resolved_source = file_path.resolve()
        resolved_output = output_path.resolve()
        common_dir = Path(os.path.commonpath([resolved_source, resolved_output]))
        git_root = get_git_root(directory)
        use_common_paths = path_annotation_mode == "oss" or git_root is not None

        def _path_annotation(path: Path) -> str:
            """Render ``path`` for an annotation value.

            Relative to ``common_dir`` when common paths are active (falling
            back to ``str(path)`` if it is not located under it), otherwise
            just the file name.
            """
            if use_common_paths:
                try:
                    return str(path.resolve().relative_to(common_dir))
                except ValueError:
                    return str(path)
            return path.name

        if not strip_source_path:
            annotations["python_original_code_path"] = _path_annotation(file_path)
        annotations["component_yaml_path"] = _path_annotation(output_path)
        if emit_generation_annotations:
            if dependencies_from:
                annotations["tangle_cli_generation_dependencies_from"] = _path_annotation(dependencies_from)
            if resolve_root:
                annotations["tangle_cli_generation_resolve_root"] = _path_annotation(resolve_root)

        # Git info — use the same common ancestor as git_relative_dir when common paths are active.
        if git_root:
            git_info = get_git_info(common_dir)
            git_info.pop("_git_root", None)
            # Override git_relative_dir to be the common ancestor
            try:
                git_info["git_relative_dir"] = str(common_dir.relative_to(git_root))
            except ValueError:
                # common_dir is not under git_root: keep whatever
                # get_git_info() reported.
                pass
            annotations.update(git_info)
        else:
            git_info = get_git_info(directory)
            git_info.pop("_git_root", None)
            annotations.update(git_info)

        # Custom annotations (applied last, so they win over generated keys)
        if custom_annotations:
            annotations.update(custom_annotations)

        # Keep only string values (annotation values must be strings); this
        # drops None as well as any other non-str value.
        annotations = {k: v for k, v in annotations.items() if isinstance(v, str)}

        # 6. Handle bundle mode — embed source text of local modules
        # (not bytecode, which is Python-version-specific)
        bundled_modules_b64: str | None = None
        if mode == "bundle":
            module_sources = ModuleBundler.collect_sources(
                file_path,
                resolve_root=resolve_root,
                pip_deps=deps,
                source=spec.module_source_stripped,
            )
            if module_sources:
                bundled_modules_b64 = ModuleBundler.encode(module_sources)
                if bundled_modules_b64:
                    # Shallower module names first, then alphabetical.
                    sorted_names = sorted(module_sources.keys(), key=lambda k: (k.count("."), k))
                    annotations["bundled_modules"] = json.dumps(sorted_names)

        # 7. Build and write YAML
        component = build_component_dict(
            spec=spec,
            container_image=container_image,
            dependencies=deps,
            annotations=annotations,
            mode=mode,
            bundled_modules_b64=bundled_modules_b64,
        )

        output_path.parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, "w") as f:
            f.write(dump_yaml(component, width=120))

        return True

    except AuthoringStripError:
        # TaskEnv authoring-violation (§3.5): fail LOUD with the actionable
        # guidance instead of swallowing it into a warning + False. A silent
        # False would only resurface later as a confusing missing/broken
        # component at hydrate or backend run time, defeating the
        # "fail fast with a clear generator error" intent. Every OTHER failure
        # keeps the conservative warn + return False behaviour below.
        raise
    except Exception as e:
        warnings.warn(f"Error generating component YAML: {e}")
        return False
|