tangle-cli 0.0.1a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. tangle_cli/__init__.py +19 -0
  2. tangle_cli/api_cli.py +787 -0
  3. tangle_cli/api_schema.py +633 -0
  4. tangle_cli/api_transport.py +461 -0
  5. tangle_cli/args_container.py +244 -0
  6. tangle_cli/artifacts.py +293 -0
  7. tangle_cli/artifacts_cli.py +108 -0
  8. tangle_cli/cli.py +57 -0
  9. tangle_cli/cli_helpers.py +116 -0
  10. tangle_cli/cli_options.py +52 -0
  11. tangle_cli/client.py +677 -0
  12. tangle_cli/component_from_func.py +1856 -0
  13. tangle_cli/component_generator.py +298 -0
  14. tangle_cli/component_inspector.py +494 -0
  15. tangle_cli/component_publisher.py +921 -0
  16. tangle_cli/components_cli.py +269 -0
  17. tangle_cli/dynamic_discovery_client.py +296 -0
  18. tangle_cli/generated_model_extensions.py +405 -0
  19. tangle_cli/generated_runtime.py +43 -0
  20. tangle_cli/handler.py +96 -0
  21. tangle_cli/hydration_trust.py +222 -0
  22. tangle_cli/logger.py +166 -0
  23. tangle_cli/models.py +407 -0
  24. tangle_cli/module_bundler.py +662 -0
  25. tangle_cli/openapi/__init__.py +0 -0
  26. tangle_cli/openapi/codegen.py +1090 -0
  27. tangle_cli/openapi/parser.py +77 -0
  28. tangle_cli/pipeline_dehydrator.py +720 -0
  29. tangle_cli/pipeline_hydrator.py +1785 -0
  30. tangle_cli/pipeline_run_annotations.py +41 -0
  31. tangle_cli/pipeline_run_details.py +203 -0
  32. tangle_cli/pipeline_run_manager.py +1994 -0
  33. tangle_cli/pipeline_run_search.py +712 -0
  34. tangle_cli/pipeline_runner.py +620 -0
  35. tangle_cli/pipeline_runs_cli.py +584 -0
  36. tangle_cli/pipelines.py +581 -0
  37. tangle_cli/pipelines_cli.py +271 -0
  38. tangle_cli/published_components_cli.py +373 -0
  39. tangle_cli/py.typed +0 -0
  40. tangle_cli/quickstart.py +110 -0
  41. tangle_cli/secrets.py +156 -0
  42. tangle_cli/secrets_cli.py +269 -0
  43. tangle_cli/utils.py +942 -0
  44. tangle_cli/version_manager.py +470 -0
  45. tangle_cli-0.0.1a1.dist-info/METADATA +561 -0
  46. tangle_cli-0.0.1a1.dist-info/RECORD +48 -0
  47. tangle_cli-0.0.1a1.dist-info/WHEEL +4 -0
  48. tangle_cli-0.0.1a1.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,1856 @@
1
+ """
2
+ Component YAML generator from Python functions.
3
+
4
+ Converts Python functions into Tangle component YAML files. Supports two modes:
5
+
6
+ - **inline** (default): Single-file components with source code embedded directly.
7
+ - **bundle**: Multi-file components with local dependency modules serialized via
8
+ zlib-compressed source text and injected into sys.modules at runtime.
9
+
10
+ Key functions:
11
+ - generate_component_yaml() - Top-level entry point for YAML generation
12
+ - extract_interface() - Introspects a function's signature, types, and docstring
13
+ - extract_file_metadata() - Extracts metadata (name, version, etc.) from source via AST
14
+ - extract_docstring_metadata() - Parses the Metadata section from a docstring string
15
+ """
16
+
17
+ import ast
18
+ import importlib.util
19
+ import inspect
20
+ import json
21
+ import os
22
+ import re
23
+ import sys
24
+ import textwrap
25
+ import types
26
+ import typing
27
+ import warnings
28
+ from dataclasses import dataclass, field
29
+ from pathlib import Path
30
+ from typing import Any, Callable, Literal
31
+
32
+ import docstring_parser
33
+
34
+ from tangle_cli.module_bundler import ModuleBundler
35
+ from tangle_cli.utils import dump_yaml, get_git_info, get_git_root
36
+
37
+ try:
38
+ import tomllib
39
+ except ModuleNotFoundError:
40
+ import tomli as tomllib # type: ignore[no-redef]
41
+
42
+
43
+ # ============================================================================
44
+ # InputPath / OutputPath annotation types
45
+ # ============================================================================
46
+ # These mirror the cloud_pipelines.components types so we can introspect
47
+ # functions that use them without requiring the cloud_pipelines SDK.
48
+
49
+
50
+ class InputPath:
51
+ """Annotation indicating a function parameter receives a file path for input data."""
52
+
53
+ def __init__(self, type: str | None = None):
54
+ self.type = type
55
+
56
+
57
+ class OutputPath:
58
+ """Annotation indicating a function parameter receives a file path for output data."""
59
+
60
+ def __init__(self, type: str | None = None):
61
+ self.type = type
62
+
63
+
64
+ # ============================================================================
65
+ # Type mapping (replicating Cloud-Pipelines SDK _data_passing.py)
66
+ # ============================================================================
67
+
68
# Python type → Tangle type name.
# Lookups are by exact type object; parameterised generics such as ``list[str]``
# are not keys of this table.
_TYPE_TO_TANGLE: dict[type, str] = {
    str: "String",
    int: "Integer",
    float: "Float",
    bool: "Boolean",
    list: "JsonArray",
    dict: "JsonObject",
}

# Tangle type name → argparse deserializer expression.
# The values are source-code expressions (strings), not callables: they are
# meant to be spliced into generated code.
_TYPE_TO_DESERIALIZER: dict[str, str] = {
    "String": "str",
    "Integer": "int",
    "Float": "float",
    "Boolean": "_deserialize_bool",
    "JsonArray": "json.loads",
    "JsonObject": "json.loads",
}

# Tangle type names that need extra definitions in the generated code.
# Each value is a source snippet that makes the matching deserializer
# expression above resolvable (a helper function or an import).
# NOTE(review): the _deserialize_bool error message lists "true"/"false"/"1"/"0"
# but the helper also accepts "yes"/"no"; the message also echoes the
# lower-cased input rather than the original text.
_TYPE_DEFINITIONS: dict[str, str] = {
    "Boolean": textwrap.dedent("""\
        def _deserialize_bool(s):
            s = s.lower()
            if s in ("true", "1", "yes"):
                return True
            if s in ("false", "0", "no"):
                return False
            raise TypeError(
                f'Error parsing "{s}" as bool value. Supported values: "true", "false", "1", "0".'
            )"""),
    "JsonArray": "import json",
    "JsonObject": "import json",
}

# Source snippet for the helper named by the OutputPath deserializer expression:
# it creates the parent directory of an output path and returns the path.
# NOTE(review): for a bare file name, os.path.dirname() returns "" and
# os.makedirs("") raises FileNotFoundError — confirm outputs always carry a
# directory component.
_MAKE_PARENT_DIRS_HELPER = textwrap.dedent("""\
    def _make_parent_dirs_and_return_path(file_path: str):
        import os
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        return file_path""")

# Tangle type name → output serializer expression (for NamedTuple return fields)
_TYPE_TO_SERIALIZER: dict[str, str] = {
    "String": "_serialize_str",
    "Integer": "str",
    "Float": "str",
    "Boolean": "str",
    "JsonArray": "json.dumps",
    "JsonObject": "json.dumps",
}

# Source snippet defining the ``_serialize_str`` helper referenced above:
# strings pass through unchanged, anything else is converted with ``str()``.
_SERIALIZE_STR_HELPER = textwrap.dedent("""\
    def _serialize_str(str_value) -> str:
        if isinstance(str_value, str):
            return str_value
        else:
            return str(str_value)""")
126
+
127
+
128
+ # ============================================================================
129
+ # Data structures
130
+ # ============================================================================
131
+
132
+
133
@dataclass
class ParamInfo:
    """Describes a single function parameter mapped to a component input or output.

    The same record type is reused for return-value outputs
    (``kind="return_output"``), which have no corresponding Python parameter.
    """

    name: str  # Python parameter name (or return field name for return outputs)
    yaml_name: str  # Name in YAML (may have _path/_file suffix stripped)
    python_type: str | None  # Original Python type annotation string
    tangle_type: str | None  # Tangle type: String, Integer, Float, etc.
    kind: Literal["input", "output", "input_path", "return_output"]
    description: str | None = None
    # ``inspect.Parameter.empty`` is the "no default" sentinel, so that ``None``
    # remains usable as a real default value.
    default: Any = inspect.Parameter.empty
    optional: bool = False
    # argparse type= expression for parameters; for "return_output" entries
    # _make_return_param stores the output *serializer* expression here.
    deserializer: str = "str"
146
+
147
+
148
@dataclass
class FunctionSpec:
    """Everything component generation needs to know about one function."""

    name: str
    component_name: str
    description: str | None
    params: list[ParamInfo] = field(default_factory=list)
    # Outputs derived from the return annotation.
    return_params: list[ParamInfo] = field(default_factory=list)
    # True for a plain ``-> str``-style return (not NamedTuple); the generated
    # code then needs the ``_outputs = [_outputs]`` wrapping.
    single_return_output: bool = False
    source_code: str = ""
    source_code_stripped: str = ""
    # Full module source (for bundle mode).
    module_source_stripped: str = ""
    # name, version, updated_at from the docstring's ``Metadata:`` section.
    docstring_metadata: dict[str, str] = field(default_factory=dict)

    @property
    def inputs(self) -> list[ParamInfo]:
        """Parameters consumed as inputs (plain values and InputPath files)."""
        selected = []
        for candidate in self.params:
            if candidate.kind == "input" or candidate.kind == "input_path":
                selected.append(candidate)
        return selected

    @property
    def outputs(self) -> list[ParamInfo]:
        """Parameters declared with OutputPath."""
        return list(filter(lambda candidate: candidate.kind == "output", self.params))

    @property
    def all_outputs(self) -> list[ParamInfo]:
        """OutputPath parameters followed by the return-value outputs."""
        combined = self.outputs
        return combined + self.return_params
176
+
177
+
178
+ # ============================================================================
179
+ # Module loading
180
+ # ============================================================================
181
+
182
+
183
def _ensure_cloud_pipelines_shim() -> None:
    """Install stand-in modules needed to import authoring files for introspection.

    When ``cloud_pipelines`` is not already in ``sys.modules``, a minimal
    ``cloud_pipelines`` / ``cloud_pipelines.components`` pair exposing the local
    ``InputPath`` and ``OutputPath`` classes is registered, so that
    ``from cloud_pipelines import components`` works without the real package.
    The TD authoring shim is ensured afterwards in every case; those authoring
    constructs are stripped from the generated runtime code later.
    """
    if "cloud_pipelines" not in sys.modules:
        shim_components = types.ModuleType("cloud_pipelines.components")
        for marker_cls in (InputPath, OutputPath):
            setattr(shim_components, marker_cls.__name__, marker_cls)

        shim_root = types.ModuleType("cloud_pipelines")
        setattr(shim_root, "components", shim_components)

        sys.modules.update(
            {
                "cloud_pipelines": shim_root,
                "cloud_pipelines.components": shim_components,
            }
        )

    _ensure_tangle_deploy_authoring_shim()
202
+
203
+
204
+ def _identity_decorator(*args, **kwargs):
205
+ def decorate(func):
206
+ return func
207
+
208
+ return decorate
209
+
210
+
211
+ class _AuthoringGeneric:
212
+ def __class_getitem__(cls, item):
213
+ return cls
214
+
215
+ def __init__(self, *args, **kwargs):
216
+ pass
217
+
218
+
219
def _ensure_tangle_deploy_authoring_shim() -> None:
    """Register a minimal ``tangle_deploy.python_pipeline`` module when none is loaded.

    The stand-in exposes identity decorators (``task``, ``pipeline``,
    ``subpipeline``, ``registered``), inert generic placeholders (``In``,
    ``Out``, ``Outputs``, ``TaskEnv``) and a ``ref`` callable that returns
    ``None``. An already-registered ``tangle_deploy`` parent module is reused
    and never replaced.
    """
    shim_name = "tangle_deploy.python_pipeline"
    if shim_name in sys.modules:
        return

    parent = sys.modules.get("tangle_deploy") or types.ModuleType("tangle_deploy")
    shim = types.ModuleType(shim_name)

    stand_ins: dict[str, Any] = {}
    stand_ins.update(dict.fromkeys(("task", "pipeline", "subpipeline", "registered"), _identity_decorator))
    stand_ins.update(dict.fromkeys(("In", "Out", "Outputs", "TaskEnv"), _AuthoringGeneric))
    stand_ins["ref"] = lambda *args, **kwargs: None
    for attr_name, stand_in in stand_ins.items():
        setattr(shim, attr_name, stand_in)

    setattr(parent, "python_pipeline", shim)
    # setdefault: keep whatever parent package object is already registered.
    sys.modules.setdefault("tangle_deploy", parent)
    sys.modules[shim_name] = shim
235
+
236
+
237
def load_python_module(file_path: Path, extra_sys_path: list[Path] | None = None) -> Any:
    """Import a Python source file as a module object and return it.

    The module is executed with its own directory (and any ``extra_sys_path``
    entries) temporarily placed at the front of ``sys.path``; the previous
    ``sys.path`` contents are put back afterwards, even if execution fails.

    Args:
        file_path: Path to the Python source file.
        extra_sys_path: Additional directories to expose on ``sys.path`` while
            the module executes. Needed when the module imports sibling
            packages living outside ``file_path.parent`` (e.g. when
            ``--resolve-root`` points at a parent ``src/`` directory). Earlier
            entries end up earlier on ``sys.path``.

    Raises:
        ValueError: If no import spec/loader can be created for ``file_path``.
    """
    _ensure_cloud_pipelines_shim()

    spec = importlib.util.spec_from_file_location(file_path.stem, location=str(file_path))
    if not (spec and spec.loader):
        raise ValueError(f"Unable to create module spec for {file_path}")
    module = importlib.util.module_from_spec(spec)

    saved_sys_path = sys.path.copy()

    # The module's own directory goes in first; extras are inserted in reverse
    # so that, after repeated front-insertion, they keep their given order.
    to_prepend = [file_path.parent]
    if extra_sys_path:
        to_prepend.extend(reversed(extra_sys_path))
    for directory in to_prepend:
        resolved = str(directory.resolve())
        if resolved not in sys.path:
            sys.path.insert(0, resolved)

    try:
        spec.loader.exec_module(module)
    finally:
        sys.path = saved_sys_path
    return module
270
+
271
+
272
+ def get_function_from_module(module: Any, function_name: str | None = None) -> Callable:
273
+ """Get a function from a loaded module.
274
+
275
+ If function_name is specified, returns that function.
276
+ Otherwise, returns the single public function (errors if 0 or >1).
277
+ """
278
+ if function_name:
279
+ func = getattr(module, function_name, None)
280
+ if func is None or not callable(func):
281
+ raise ValueError(f"Function '{function_name}' not found in module {module.__name__}")
282
+ return func
283
+
284
+ functions = [
285
+ getattr(module, name)
286
+ for name in dir(module)
287
+ if not name.startswith("_") and callable(getattr(module, name)) and not isinstance(getattr(module, name), type)
288
+ ]
289
+
290
+ if not functions:
291
+ raise ValueError(f"No public functions found in module {module.__name__}")
292
+ if len(functions) > 1:
293
+ names = [f.__name__ for f in functions]
294
+ raise ValueError(
295
+ f"Found multiple functions in module {module.__name__}: {names}. " "Please specify --function-name."
296
+ )
297
+ return functions[0]
298
+
299
+
300
+ # ============================================================================
301
+ # Type annotation resolution
302
+ # ============================================================================
303
+
304
+
305
def _resolve_annotation(annotation: Any) -> tuple[str | None, str, Literal["input", "output", "input_path"]]:
    """Map a parameter annotation to ``(tangle_type, deserializer, kind)``.

    Resolution order: missing/None annotation, InputPath/OutputPath marker
    instances, parameterised generics and unions, plain classes, and finally
    anything else (e.g. forward references), which is described by its string
    form.

    Returns:
        (tangle_type, deserializer_code, kind)
    """
    if annotation is None or annotation is inspect.Parameter.empty:
        return "String", "str", "input"

    # InputPath / OutputPath are matched by class *name* so both the local
    # classes and the cloud_pipelines ones are recognised.
    marker_name = type(annotation).__name__
    if marker_name == "OutputPath":
        declared = getattr(annotation, "type", None) or "String"
        return declared, "_make_parent_dirs_and_return_path", "output"
    if marker_name == "InputPath":
        declared = getattr(annotation, "type", None) or "String"
        return declared, "str", "input_path"

    # Generics must be examined before the isinstance(type) test below,
    # because list[str] passes isinstance(type) in Python 3.10.
    origin = typing.get_origin(annotation)
    if origin in (list,):
        return "JsonArray", "json.loads", "input"
    if origin in (dict,):
        return "JsonObject", "json.loads", "input"
    if origin is typing.Union or origin is types.UnionType:
        members = typing.get_args(annotation)
        # Optional[T] is Union[T, None]: unwrap and resolve T itself.
        if len(members) == 2 and type(None) in members:
            first, second = members
            payload = first if second is type(None) else second
            return _resolve_annotation(payload)
        # Any other union has no single Tangle type.
        return None, "str", "input"

    if isinstance(annotation, type):
        mapped = _TYPE_TO_TANGLE.get(annotation)
        if not mapped:
            # Unknown class: use its name as the type and pass the raw string through.
            return str(annotation.__name__), "str", "input"
        return mapped, _TYPE_TO_DESERIALIZER[mapped], "input"

    # ForwardRef or any other annotation object: fall back to its text.
    textual = getattr(annotation, "__forward_arg__", annotation)
    return str(textual), "str", "input"
347
+
348
+
349
def _make_return_param(name: str, annotation: type) -> ParamInfo:
    """Build the ``return_output`` ParamInfo for one returned value.

    Annotations missing from the type table are treated as ``String``. The
    serializer expression is stored in the record's ``deserializer`` field.
    """
    mapped_type = _TYPE_TO_TANGLE.get(annotation, "String")
    annotation_text = str(annotation) if annotation else None
    serializer_expr = _TYPE_TO_SERIALIZER.get(mapped_type, "_serialize_str")
    return ParamInfo(
        name=name,
        yaml_name=name,
        python_type=annotation_text,
        tangle_type=mapped_type,
        kind="return_output",
        description=None,
        deserializer=serializer_expr,
    )
361
+
362
+
363
def _resolve_namedtuple_return(return_ann: Any) -> list[ParamInfo]:
    """Turn each field of a NamedTuple return annotation into an output parameter.

    Fields without a recorded annotation are treated as ``str``.
    """
    # Field types live in __annotations__ (absent in python 3.5 and earlier)
    # or in _field_types (absent in python 3.9 and later).
    field_types = getattr(return_ann, "__annotations__", None) or getattr(return_ann, "_field_types", None)

    outputs: list[ParamInfo] = []
    for field_name in return_ann._fields:
        field_type = field_types.get(field_name, str) if field_types else str
        outputs.append(_make_return_param(name=field_name, annotation=field_type))
    return outputs
375
+
376
+
377
def _resolve_single_return(return_ann: type) -> ParamInfo | None:
    """Describe a plain (non-NamedTuple) return type as an output named ``Output``.

    Only types present in the Python→Tangle type table are accepted; for any
    other annotation the result is ``None``.
    """
    if return_ann in _TYPE_TO_TANGLE:
        return _make_return_param(name="Output", annotation=return_ann)
    return None
385
+
386
+
387
def _resolve_return_type(func: Callable) -> tuple[list[ParamInfo], bool]:
    """Derive output parameters from a function's return annotation.

    Follows the Cloud-Pipelines SDK conventions:

    - NamedTuple return -> one output per field (multi-output)
    - Single recognised type (str, int, etc.) -> one output named "Output"
    - No (or unrecognised) return annotation -> no outputs

    Returns:
        ``(return_params, single_return_output)``. The flag is True only for
        the single-type case, where the generated code needs the
        ``_outputs = [_outputs]`` wrapping.
    """
    # inspect.signature is used (as the SDK does) rather than
    # typing.get_type_hints, which has trouble with InputPath/OutputPath
    # instances that are not valid types for Optional[].
    declared = inspect.signature(func).return_annotation
    if declared is inspect.Parameter.empty or declared is None:
        return [], False

    # NamedTuple classes are recognised by their _fields attribute.
    if hasattr(declared, "_fields"):
        return _resolve_namedtuple_return(declared), False

    single = _resolve_single_return(declared)
    return ([single], True) if single else ([], False)
414
+
415
+
416
+ # ============================================================================
417
+ # Interface extraction
418
+ # ============================================================================
419
+
420
+
421
+ def _python_name_to_component_name(name: str) -> str:
422
+ """Convert a Python function name to a human-readable component name."""
423
+ name_with_spaces = re.sub(" +", " ", name.replace("_", " ")).strip()
424
+ if not name_with_spaces:
425
+ return name
426
+ return name_with_spaces[0].upper() + name_with_spaces[1:]
427
+
428
+
429
def extract_docstring_metadata(docstring: str) -> dict[str, str]:
    """Pull the description and ``Metadata:`` key/value pairs out of a docstring.

    The description is every non-blank line before the first section header,
    joined with single spaces. Inside the ``Metadata:`` section each
    ``key: value`` line is recorded with a lower-cased key, and
    ``version_timestamp`` is stored as ``updated_at``. Example input:

        Processes and validates input data.

        Metadata:
            name: My Component Name
            version: 1.2
            updated_at: 2025-01-01T00:00:00Z

        Args:
            ...

    Returns:
        Dict with keys like "description", "name", "version", "updated_at"
        (only those actually found).
    """
    section_names = frozenset(
        {
            "args",
            "arguments",
            "parameters",
            "returns",
            "raises",
            "yields",
            "note",
            "notes",
            "example",
            "examples",
            "metadata",
        }
    )

    collected: dict[str, str] = {}
    summary_parts: list[str] = []
    # One of "description" (before any header), "metadata", or "other".
    state = "description"

    for raw_line in docstring.split("\n"):
        text = raw_line.strip()

        # A line that is just a known section name (colon optional) is a header.
        if text and text.rstrip(":").lower() in section_names:
            if text.lower() == "metadata:":
                state = "metadata"
            elif state == "metadata":
                # The section following Metadata has begun: nothing more to read.
                break
            else:
                state = "other"
            continue

        if state == "metadata":
            pair = re.match(r"^(\w[\w_]*)\s*:\s*(.+)", text)
            if pair is None:
                continue
            key = pair.group(1).lower()
            value = pair.group(2).strip()
            collected["updated_at" if key == "version_timestamp" else key] = value
        elif state == "description" and text:
            summary_parts.append(text)

    if summary_parts:
        collected["description"] = " ".join(summary_parts)

    return collected
498
+
499
+
500
+ def find_function_in_source(
501
+ file_path: Path, function_name: str | None = None
502
+ ) -> tuple[str | None, ast.FunctionDef | None]:
503
+ """Find a function in a Python source file by AST parsing.
504
+
505
+ Args:
506
+ file_path: Path to the Python file
507
+ function_name: Name of function to find. If not found or not provided,
508
+ falls back to first public function in the file.
509
+
510
+ Returns:
511
+ Tuple of (function_name, function_node) or (None, None) if no functions found.
512
+ """
513
+ try:
514
+ content = file_path.read_text()
515
+ tree = ast.parse(content)
516
+
517
+ all_functions = [
518
+ node
519
+ for node in ast.iter_child_nodes(tree)
520
+ if isinstance(node, ast.FunctionDef) and not node.name.startswith("_")
521
+ ]
522
+
523
+ if not all_functions:
524
+ return None, None
525
+
526
+ if function_name:
527
+ for func in all_functions:
528
+ if func.name == function_name:
529
+ return func.name, func
530
+ # Function not found, fall back to first function
531
+ first_func = all_functions[0]
532
+ warnings.warn(
533
+ f"Function '{function_name}' not found in {file_path.name}, " f"using '{first_func.name}' instead"
534
+ )
535
+ return first_func.name, first_func
536
+
537
+ first_func = all_functions[0]
538
+ return first_func.name, first_func
539
+
540
+ except (SyntaxError, ValueError, OSError) as e:
541
+ warnings.warn(f"Could not parse {file_path}: {e}")
542
+ return None, None
543
+
544
+
545
def extract_file_metadata(file_path: Path, function_name: str | None = None) -> tuple[dict[str, str], str | None]:
    """Read docstring metadata for a function straight from a source file.

    The function is located through the AST (the file is not imported), and its
    docstring is parsed for the description and the ``Metadata:`` keys such as
    name, version and updated_at.

    Args:
        file_path: Path to the Python file
        function_name: Function to look for. Defaults to the file stem with
            dashes replaced by underscores.

    Returns:
        Tuple of (metadata_dict, actual_function_name_used). The name is None
        when no function could be located; the dict is empty when there is no
        function or no docstring.
    """
    wanted = function_name or file_path.stem.replace("-", "_")

    resolved_name, node = find_function_in_source(file_path, wanted)
    if not node:
        return {}, None

    doc = ast.get_docstring(node)
    parsed = extract_docstring_metadata(doc) if doc else {}
    return parsed, resolved_name
570
+
571
+
572
def extract_interface(
    func: Callable,
    docstring_metadata: dict[str, str],
) -> FunctionSpec:
    """Extract component interface from a Python function.

    Uses inspect.signature() for parameter info and docstring_parser for descriptions.

    Args:
        func: The Python function to introspect.
        docstring_metadata: Metadata from extract_file_metadata or extract_docstring_metadata.
            Its ``name`` entry, when present and non-empty, becomes the component name.

    Returns:
        A FunctionSpec holding the parameters, return outputs, description and
        source code. ``module_source_stripped`` is always left empty here.
    """
    signature = inspect.signature(func)
    parsed_docstring = docstring_parser.parse(inspect.getdoc(func) or "")
    # Parameter name -> description taken from the docstring's parameter entries.
    doc_dict = {p.arg_name: p.description for p in parsed_docstring.params}

    params: list[ParamInfo] = []

    for param in signature.parameters.values():
        annotation = param.annotation
        tangle_type, deserializer, kind = _resolve_annotation(annotation)

        # Determine the YAML name (strip _path/_file suffixes for InputPath/OutputPath)
        yaml_name = param.name
        if kind in ("output", "input_path"):
            if yaml_name.endswith("_path"):
                yaml_name = yaml_name[: -len("_path")]
            elif yaml_name.endswith("_file"):
                yaml_name = yaml_name[: -len("_file")]

        # Determine optionality and default
        optional = False
        default = inspect.Parameter.empty
        if param.default is not inspect.Parameter.empty:
            if kind == "input":
                # Plain inputs with any default are optional and keep the default value.
                optional = True
                default = param.default
            elif kind == "input_path" and param.default is None:
                # InputPath parameters are optional only when defaulted to None;
                # no default value is recorded for them.
                optional = True

        params.append(
            ParamInfo(
                name=param.name,
                yaml_name=yaml_name,
                python_type=str(annotation) if annotation is not inspect.Parameter.empty else None,
                tangle_type=tangle_type,
                kind=kind,
                description=doc_dict.get(param.name),
                default=default,
                optional=optional,
                deserializer=deserializer,
            )
        )

    # Explicit metadata name wins; otherwise derive a readable name from the function name.
    component_name = docstring_metadata.get("name") or _python_name_to_component_name(func.__name__)
    description = parsed_docstring.description
    if description:
        # Strip Metadata: section that docstring_parser doesn't understand
        desc_lines = []
        for line in description.split("\n"):
            if line.strip().lower() == "metadata:":
                break
            desc_lines.append(line)
        description = "\n".join(desc_lines).strip()

    # Get source code
    source_code = ""
    source_code_stripped = ""
    module_source_stripped = ""
    try:
        raw_source = inspect.getsource(func)
        source_code = textwrap.dedent(raw_source)
        # Remove decorators: drop leading lines until the one starting with "def ".
        # NOTE(review): a source whose first definition line does not start with
        # "def " (e.g. "async def") would have every line removed here.
        lines = source_code.split("\n")
        while lines and not lines[0].startswith("def "):
            del lines[0]
        source_code = "\n".join(lines)
        source_code_stripped = _strip_type_hints(source_code)

        # module_source_stripped is populated externally via generate_component_yaml
        # (since we have the file path there but not here)
    except (OSError, TypeError) as e:
        # Best effort: the spec is still returned, with empty source fields.
        warnings.warn(f"Could not get source code for {func.__name__}: {e}")

    # Extract return type outputs (NamedTuple or single value)
    return_params, single_return_output = _resolve_return_type(func)

    # Enrich return_params with descriptions from docstring Returns section.
    # docstring_parser interprets "field_name: description" under Returns as
    # type_name=field_name, so we check both return_name and type_name.
    if return_params and parsed_docstring.many_returns:
        returns_dict: dict[str, str] = {}
        for r in parsed_docstring.many_returns:
            name = r.return_name or r.type_name
            if name and r.description:
                returns_dict[name] = r.description
        for rp in return_params:
            if rp.name in returns_dict:
                rp.description = returns_dict[rp.name]

    return FunctionSpec(
        name=func.__name__,
        component_name=component_name,
        description=description,
        params=params,
        return_params=return_params,
        single_return_output=single_return_output,
        source_code=source_code,
        source_code_stripped=source_code_stripped,
        module_source_stripped=module_source_stripped,
        docstring_metadata=docstring_metadata,
    )
684
+
685
+
686
+ # ============================================================================
687
+ # __main__ guard stripping
688
+ # ============================================================================
689
+
690
+
691
def _strip_main_guard(source_code: str) -> str:
    """Drop top-level ``if __name__ == "__main__":`` blocks from source code.

    Such guards clash with the generated argparse wrapper because both run at
    module level: a guard placed *before* the wrapper fires first and typically
    calls ``sys.exit()``, so the component never runs. Source that does not
    parse, or that has no guard, is returned unchanged.
    """
    try:
        module_ast = ast.parse(source_code)
    except SyntaxError:
        return source_code

    # 1-indexed numbers of every line belonging to a guard statement.
    doomed_lines: set[int] = set()
    for statement in ast.iter_child_nodes(module_ast):
        if isinstance(statement, ast.If) and _is_name_main_test(statement.test):
            last_line = statement.end_lineno or statement.lineno
            doomed_lines.update(range(statement.lineno, last_line + 1))

    if not doomed_lines:
        return source_code

    numbered = enumerate(source_code.splitlines(keepends=True), 1)
    return "".join(text for number, text in numbered if number not in doomed_lines)
725
+
726
+
727
+ def _is_name_main_test(node: ast.expr) -> bool:
728
+ """Return True if *node* is ``__name__ == "__main__"`` (in either order)."""
729
+ if not isinstance(node, ast.Compare):
730
+ return False
731
+ if len(node.ops) != 1 or not isinstance(node.ops[0], ast.Eq):
732
+ return False
733
+ if len(node.comparators) != 1:
734
+ return False
735
+
736
+ left = node.left
737
+ right = node.comparators[0]
738
+
739
+ def _is_dunder_name(n: ast.expr) -> bool:
740
+ return isinstance(n, ast.Name) and n.id == "__name__"
741
+
742
+ def _is_main_str(n: ast.expr) -> bool:
743
+ return isinstance(n, ast.Constant) and n.value == "__main__"
744
+
745
+ return (_is_dunder_name(left) and _is_main_str(right)) or (_is_main_str(left) and _is_dunder_name(right))
746
+
747
+
748
+ # ============================================================================
749
+ # Authoring-construct stripping (authoring imports + @task/@pipeline/@subpipeline/@registered)
750
+ # ============================================================================
751
+
752
# Decorators that exist purely to *record* a function at authoring time. They
# must never survive into the baked operation program (see
# _strip_authoring_constructs). ``registered`` marks an op published separately
# via its own gen_config.yaml; when that same op is baked (through its
# local_from_python entry) the decorator + its authoring import must be stripped
# too, exactly like @task.
_AUTHORING_DECORATOR_NAMES = frozenset({"task", "pipeline", "subpipeline", "registered"})

# The python-pipeline authoring module. ONLY imports of this module (and its
# submodules) are authoring-only and stripped from the baked source. We
# deliberately do NOT strip other ``tangle_deploy.*`` packages (e.g.
# ``tangle_deploy.utils``): those may be legitimate runtime helpers used inside a
# ``@task`` body, and dropping them would raise ``NameError`` in the operation
# container.
_AUTHORING_IMPORT_MODULE = "tangle_deploy.python_pipeline"

# The authoring-only ``TaskEnv`` class name. A module-level ``X = TaskEnv(...)``
# (or ``X = <alias>.TaskEnv(...)``) declaration is authoring-only by contract and
# is stripped from the baked source by ``_strip_authoring_constructs``.
# Matched by trailing NAME only (like the authoring decorators), because in
# python-pipeline authoring files ``TaskEnv`` always resolves to
# ``tangle_deploy.python_pipeline.TaskEnv``.
_AUTHORING_ENV_CLASS_NAME = "TaskEnv"
775
+
776
+
777
class AuthoringStripError(ValueError):
    """Raised when env-only authoring code cannot be safely stripped.

    The TaskEnv runtime-strip hardening (``_strip_authoring_constructs``)
    raises this when a ``@task(env=...)`` env binding is entangled with
    runtime code — e.g. a mixed ``from _envs import UPI, helper`` import whose
    ``helper`` is used at runtime, or a collected env name referenced by the
    kept task body.

    Failing fast here is intentional: silently baking a broken
    ``from _envs import UPI`` / ``UPI = TaskEnv(...)`` would only surface as a
    ``NameError`` / ``ImportError`` at container start. The message tells the
    author how to split the import or keep TaskEnv values authoring-only.

    Subclasses ``ValueError``, so existing ``except ValueError`` handlers also
    catch it.
    """
789
+
790
+
791
+ def _decorator_called_name(node: ast.expr) -> str | None:
792
+ """Return the trailing name a decorator expression resolves to.
793
+
794
+ Handles ``@name`` / ``@name(...)`` and ``@mod.name`` / ``@mod.name(...)``
795
+ forms, returning the trailing attribute/name (e.g. ``task`` for both
796
+ ``@task(...)`` and ``@tangle_deploy.python_pipeline.task(...)``). Returns
797
+ ``None`` for shapes we do not recognise so callers leave them untouched.
798
+
799
+ Limitation (v1, intentional): matching is by trailing NAME only, not by
800
+ import resolution. A hypothetical unrelated ``@some_other_lib.task(...)``
801
+ decorator would therefore also match. This is acceptable because in
802
+ python-pipeline authoring files the only decorators named ``task`` /
803
+ ``pipeline`` / ``subpipeline`` are the authoring decorators; resolving the
804
+ import binding is deferred unless a real collision appears.
805
+ """
806
+ if isinstance(node, ast.Call):
807
+ node = node.func
808
+ if isinstance(node, ast.Name):
809
+ return node.id
810
+ if isinstance(node, ast.Attribute):
811
+ return node.attr
812
+ return None
813
+
814
+
815
def _is_authoring_import(node: ast.stmt) -> bool:
    """Tell whether *node* imports the python-pipeline authoring module.

    Only ``_AUTHORING_IMPORT_MODULE`` itself and its dotted submodules count:

    - ``from tangle_deploy.python_pipeline import ...`` (aliased names such as
      ``import ref as operation_by_ref`` included) and
      ``from tangle_deploy.python_pipeline.x import y``;
    - ``import tangle_deploy.python_pipeline`` with or without ``as tp``. A
      multi-name ``import a, b`` matches as soon as one name qualifies.

    Sibling packages (``from tangle_deploy.utils import X``) do not match:
    they may be runtime helpers used inside a ``@task`` body. Relative imports
    (``from . import x``) never match, and any non-import statement is
    ``False``.
    """

    def _is_authoring_module(dotted: str) -> bool:
        # Exact module, or a submodule below it ("<module>." prefix).
        return dotted == _AUTHORING_IMPORT_MODULE or dotted.startswith(_AUTHORING_IMPORT_MODULE + ".")

    if isinstance(node, ast.Import):
        return any(_is_authoring_module(alias.name) for alias in node.names)
    if isinstance(node, ast.ImportFrom):
        # A non-zero level means a relative import, which is never the
        # authoring package.
        return not node.level and _is_authoring_module(node.module or "")
    return False
843
+
844
+
845
+ def _attr_root_name(node: ast.expr) -> str | None:
846
+ """Return the root ``Name`` id of an attribute chain (``a.b.c`` -> ``a``).
847
+
848
+ Returns ``None`` for shapes that don't bottom out in a plain ``Name``
849
+ (e.g. ``foo().bar``), so callers leave them untouched.
850
+ """
851
+ while isinstance(node, ast.Attribute):
852
+ node = node.value
853
+ return node.id if isinstance(node, ast.Name) else None
854
+
855
+
856
+ def _env_keyword_binding_name(call: ast.Call) -> str | None:
857
+ """Return the module-level authoring name a ``@task(env=...)`` keyword needs.
858
+
859
+ Inspects the ``env=`` keyword of a (stripped) ``@task(...)`` decorator and
860
+ returns the name of the module-level binding that must also be stripped so
861
+ the baked runtime program does not crash referencing an authoring-only name:
862
+
863
+ - ``env=UPI`` -> ``"UPI"`` (a module-level env *binding* to strip, either an
864
+ ``UPI = TaskEnv(...)`` assignment or a ``from _envs import UPI`` import);
865
+ - ``env=_envs.UPI`` -> ``"_envs"`` (the module-alias root, so the
866
+ ``import _envs`` line can be stripped);
867
+ - ``env=TaskEnv(...)`` / ``env=tp.TaskEnv(...)`` (inline) -> ``None``: the
868
+ whole decorator line range is already deleted, so there is no residual
869
+ module-level binding to strip;
870
+ - anything else -> ``None`` (leave it untouched).
871
+ """
872
+ for keyword in call.keywords:
873
+ if keyword.arg != "env":
874
+ continue
875
+ value = keyword.value
876
+ if isinstance(value, ast.Name):
877
+ return value.id
878
+ if isinstance(value, ast.Attribute):
879
+ return _attr_root_name(value)
880
+ # env=TaskEnv(...) / env=tp.TaskEnv(...) inline, or any other shape:
881
+ # the decorator range already covers it, no residual binding.
882
+ return None
883
+ return None
884
+
885
+
886
def _is_task_env_construction(value: ast.expr | None) -> bool:
    """Tell whether *value* is a direct ``TaskEnv(...)`` construction call.

    The callee is matched by its trailing name via ``_decorator_called_name``,
    so ``TaskEnv(image=...)`` and ``tp.TaskEnv(image=...)`` both qualify.
    ``None`` and non-call expressions are ``False``. This lets module-level
    declarations such as ``UPI = TaskEnv(...)`` be detected whether or not any
    ``@task(env=UPI)`` refers to them.
    """
    if not isinstance(value, ast.Call):
        return False
    return _decorator_called_name(value) == _AUTHORING_ENV_CLASS_NAME
895
+
896
+
897
+ def _import_bound_names(node: ast.Import | ast.ImportFrom) -> dict[str, ast.alias]:
898
+ """Map each name a top-level import binds into the namespace to its alias.
899
+
900
+ - ``from m import UPI`` -> ``{"UPI": alias}``
901
+ - ``from m import UPI as U`` -> ``{"U": alias}``
902
+ - ``import _envs`` -> ``{"_envs": alias}`` (root of a dotted module path)
903
+ - ``import a.b.c`` -> ``{"a": alias}``
904
+ - ``import envs as task_envs`` -> ``{"task_envs": alias}``
905
+ """
906
+ bound: dict[str, ast.alias] = {}
907
+ for alias in node.names:
908
+ if alias.asname:
909
+ bound[alias.asname] = alias
910
+ elif isinstance(node, ast.Import):
911
+ # ``import a.b.c`` binds only the top-level package ``a``.
912
+ bound[alias.name.split(".", 1)[0]] = alias
913
+ else:
914
+ bound[alias.name] = alias
915
+ return bound
916
+
917
+
918
def _annotation_name_node_ids(tree: ast.AST) -> set[int]:
    """Collect ``id()`` of each ``ast.Name`` located inside an annotation slot.

    The fail-fast reference scan in ``_strip_authoring_constructs`` uses this
    set to ignore names that occur only in type annotations, so an env name
    used purely as ``def f(x: UPI) -> UPI:`` is not mistaken for a runtime
    reference (FIX N1, §3.5).

    Annotation slots inspected:

    - parameter annotations of ``FunctionDef`` / ``AsyncFunctionDef``
      (positional-only, regular, keyword-only, ``*args``, ``**kwargs``);
    - their return annotation (``-> T``);
    - the annotation of every ``AnnAssign`` (``x: T`` / ``x: T = ...``).

    NOTE(review): the ``_strip_type_hints_ast`` visible in this module removes
    function parameter/return annotations only; confirm that ``AnnAssign``
    annotations are really stripped (or never evaluated) before the baked
    program runs, otherwise excluding them here can hide a live reference.

    The ids are only meaningful while *tree* is alive: the caller keeps the
    tree referenced, which keeps each node's ``id()`` stable and unique.
    """

    def _annotation_slots():
        for node in ast.walk(tree):
            if isinstance(node, ast.AnnAssign):
                yield node.annotation
            elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                signature = node.args
                params = [*signature.posonlyargs, *signature.args, *signature.kwonlyargs]
                # ``*args`` / ``**kwargs`` are optional single entries.
                params.extend(p for p in (signature.vararg, signature.kwarg) if p is not None)
                for param in params:
                    if param.annotation is not None:
                        yield param.annotation
                if node.returns is not None:
                    yield node.returns

    return {
        id(sub)
        for slot in _annotation_slots()
        for sub in ast.walk(slot)
        if isinstance(sub, ast.Name)
    }
962
+
963
+
964
def _strip_authoring_constructs(source_code: str) -> str:
    """Strip python-pipeline authoring imports and decorators from baked source.

    The generated operation container re-executes ``module_source_stripped`` at
    startup and then calls the target function directly. Authoring constructs
    must NOT survive into that runtime program:

    - re-running an ``@task`` / ``@pipeline`` / ``@subpipeline`` decorator
      replaces the function with a ``CallableRef`` recorder, which raises at
      call time because there is no active ``@pipeline`` trace context;
    - on a thin image the ``from tangle_deploy.python_pipeline import ...``
      import itself can fail with ``ImportError``.

    This removes them via surgical AST line-range deletion (mirroring
    ``_strip_main_guard``), so comments/formatting in the rest of the source
    survive — we deliberately avoid a full ``ast.unparse`` round-trip.

    Contract this relies on: authoring-surface names (``task``, ``pipeline``,
    ``subpipeline``, ``In``, ``Out``, ``Outputs``, ``ref``, ...) appear ONLY in
    decorators and type annotations — both stripped before the source is baked —
    never in a runtime function body. Dropping the whole authoring import line is
    therefore safe.

    Scope of the strip (intentional v1 boundaries):

    - imports: only ``tangle_deploy.python_pipeline`` (and submodules) are
      dropped — see ``_is_authoring_import``. Other ``tangle_deploy.*`` runtime
      helpers are preserved.
    - decorators: matched by trailing NAME against
      ``_AUTHORING_DECORATOR_NAMES`` (``task`` / ``pipeline`` /
      ``subpipeline`` / ``registered``), not by import resolution — see
      ``_decorator_called_name`` for the limitation. Unrelated decorators
      (``@functools.cache``, ``@property``, ...) are preserved.

    TaskEnv authoring-strip hardening (``@task(env=...)``): an env
    declaration that exists ONLY to feed a stripped ``@task(env=...)`` decorator
    would otherwise crash the baked program (``NameError: TaskEnv`` for a
    co-located ``UPI = TaskEnv(...)`` whose import was stripped, or
    ``ImportError`` for a ``from _envs import UPI`` whose module is not in the
    runtime image). On top of the import/decorator strip this also removes, by
    line range:

    - every module-level ``X = TaskEnv(...)`` / ``X: TaskEnv = TaskEnv(...)``
      declaration (direct ``TaskEnv(...)`` construction), and
    - module-level bindings (assignment OR import) of any name a stripped
      ``@task(env=...)`` referenced — ``env=UPI`` collects ``UPI``
      (``UPI = TaskEnv(...)`` / ``UPI = make_task_env(...)`` / ``from _envs import
      UPI``); ``env=_envs.UPI`` collects the module alias ``_envs``
      (``import _envs``).

    It is deliberately narrow: only names PROVEN to participate in a stripped
    ``@task(env=...)`` decorator or a direct module-level ``TaskEnv(...)`` call
    are removed. It is NOT a general unused-import cleaner.

    This intentionally operates on ``module_source_stripped`` ONLY. It must never
    touch the verbatim ``python_original_code`` annotation, which is read
    directly from the source file elsewhere and kept byte-verbatim.

    Args:
        source_code: Python module source to clean.

    Returns:
        The source with the authoring line ranges deleted. *source_code* is
        returned unchanged when it does not parse (``SyntaxError``) or when
        nothing needs to be removed.

    Raises:
        AuthoringStripError: fail-fast instead of baking a broken program when
            an env binding is entangled with runtime code — a collected env
            name imported inside a nested block, a mixed
            ``from _envs import UPI, helper`` whose ``helper`` is used at
            runtime, or a collected env name still referenced by kept code.
    """
    import io

    try:
        tree = ast.parse(source_code)
    except SyntaxError:
        return source_code

    # Split on the line terminators the Python parser recognises (\n, \r\n, \r)
    # ONLY. ``str.splitlines`` additionally splits on form feed, \x1c-\x1e,
    # \x85, \u2028 and \u2029, which would shift every later index away from
    # the AST ``lineno`` values and make the strip delete the wrong lines.
    lines = io.StringIO(source_code, newline="").readlines()
    removed: set[int] = set()  # 1-indexed line numbers to drop
    # Names introduced ONLY to feed a stripped ``@task(env=...)`` decorator.
    # Collected from ``env=`` keywords; used below to strip the matching
    # module-level assignment/import binding.
    collected_env_names: set[str] = set()

    for node in ast.walk(tree):
        # Authoring imports — delete the whole (possibly multi-line) statement.
        # NOTE(review): this walks the WHOLE tree, so an authoring import that
        # is the only statement of a nested suite (e.g. under
        # ``if TYPE_CHECKING:``) is deleted too and would leave an empty suite
        # behind — confirm authoring imports are always module-level.
        if isinstance(node, (ast.Import, ast.ImportFrom)) and _is_authoring_import(node):
            start = node.lineno
            end = node.end_lineno or node.lineno
            removed.update(range(start, end + 1))
            continue

        # Authoring decorators (``_AUTHORING_DECORATOR_NAMES``) on
        # functions/classes. The "@" shares the decorator expression's first
        # line, so removing the node's full line range removes the "@" too.
        # Real-world decorators span multiple lines, hence lineno..end_lineno
        # rather than a prefix match.
        decorator_list = getattr(node, "decorator_list", None)
        if not decorator_list:
            continue
        for decorator in decorator_list:
            if _decorator_called_name(decorator) in _AUTHORING_DECORATOR_NAMES:
                start = decorator.lineno
                end = decorator.end_lineno or decorator.lineno
                removed.update(range(start, end + 1))
                # Record the env-only authoring name this @task(env=...) needs
                # stripped from module scope (None for inline TaskEnv(...)).
                if isinstance(decorator, ast.Call):
                    env_name = _env_keyword_binding_name(decorator)
                    if env_name is not None:
                        collected_env_names.add(env_name)

    # --- Fail-fast: nested/conditional env imports cannot be stripped (N1/N2) -
    #
    # Module-level removal below only touches ``tree.body``. An env import
    # nested inside an ``if`` / ``try`` / function body (i.e. NOT a direct child
    # of ``tree.body``) is therefore NOT stripped and would LEAK into the baked
    # program -> ``ImportError`` on a thin runtime image (or re-binding an
    # authoring-only name) at container start. We also must NOT line-delete a
    # nested import: removing the only statement in a block leaves an empty
    # suite -> ``IndentationError``. Converting the silent leak into a loud,
    # actionable error is the correct, safe behavior (FIX N2, §3.5).
    if collected_env_names:
        top_level_stmt_ids = {id(stmt) for stmt in tree.body}
        for node in ast.walk(tree):
            if not isinstance(node, (ast.Import, ast.ImportFrom)):
                continue
            if id(node) in top_level_stmt_ids:
                continue  # module-level imports are handled by the strip below
            nested_env = sorted(collected_env_names & _import_bound_names(node).keys())
            if nested_env:
                names_repr = ", ".join(repr(n) for n in nested_env)
                raise AuthoringStripError(
                    f"env name {names_repr} is imported inside a nested block "
                    "(if/try/function); TaskEnv env imports must be module-level "
                    "/ authoring-only. A nested env import is not stripped and "
                    "would leak into the baked runtime program (ImportError at "
                    "container start). Move it to a top-level import so it can be "
                    "stripped, and keep TaskEnv values authoring-only."
                )

    # --- TaskEnv env-only declarations / imports (§3.5) ---------------------
    #
    # Restricted to module-level statements (``tree.body``) so nested code is
    # never touched. Two kinds of statement are stripped:
    #   1. assignments that construct a TaskEnv directly (``X = TaskEnv(...)``)
    #      or whose target is a collected env name (``UPI = make_task_env(...)``
    #      when ``@task(env=UPI)`` was seen), and
    #   2. imports that bind a collected env name/module (``from _envs import
    #      UPI`` / ``import _envs``) when that name is env-only.
    #
    # We record each candidate's bound name(s) + line range, then verify (after
    # a reference scan) that removing it cannot break kept runtime code.
    env_assign_bindings: list[tuple[set[str], int, int]] = []  # (names, start, end)
    env_import_candidates: list[tuple[ast.Import | ast.ImportFrom, int, int]] = []
    for stmt in tree.body:
        if isinstance(stmt, ast.Assign):
            simple_targets = {t.id for t in stmt.targets if isinstance(t, ast.Name)}
            if _is_task_env_construction(stmt.value) or (simple_targets & collected_env_names):
                env_assign_bindings.append((simple_targets, stmt.lineno, stmt.end_lineno or stmt.lineno))
        elif isinstance(stmt, ast.AnnAssign) and isinstance(stmt.target, ast.Name):
            tname = stmt.target.id
            if _is_task_env_construction(stmt.value) or tname in collected_env_names:
                env_assign_bindings.append(({tname}, stmt.lineno, stmt.end_lineno or stmt.lineno))
        elif isinstance(stmt, (ast.Import, ast.ImportFrom)):
            if _is_authoring_import(stmt):
                continue  # already removed above
            bound = _import_bound_names(stmt)
            if collected_env_names & bound.keys():
                env_import_candidates.append((stmt, stmt.lineno, stmt.end_lineno or stmt.lineno))

    # Provisionally drop every env declaration/import candidate. Their own line
    # ranges hold no runtime ``Load`` of the bound name (assignment targets are
    # ``Store``; import bindings are aliases), so including them now does not
    # mask a real runtime reference detected below.
    for _names, start, end in env_assign_bindings:
        removed.update(range(start, end + 1))
    for _stmt, start, end in env_import_candidates:
        removed.update(range(start, end + 1))

    # Reference scan: every ``Name`` used in a ``Load`` context, mapped to the
    # 1-indexed lines it appears on. Attribute roots (``_envs`` in
    # ``_envs.UPI``) are plain ``Name`` Load nodes too, so this covers them.
    #
    # FIX N1 (§3.5): exclude ``Name`` nodes that live in a type-annotation slot
    # (param/return/AnnAssign). Annotations are stripped from the baked output by
    # ``_strip_type_hints`` (which runs later), so an env name used ONLY as a
    # type annotation (``def f(x: UPI) -> UPI:``) is NOT a live runtime
    # reference and must not trip the body-ref fail-fast. A real body reference
    # (outside annotations) still records a Load and still fails fast.
    if env_assign_bindings or env_import_candidates:
        annotation_name_ids = _annotation_name_node_ids(tree)
        load_lines: dict[str, set[int]] = {}
        for node in ast.walk(tree):
            if isinstance(node, ast.Name) and isinstance(node.ctx, ast.Load) and id(node) not in annotation_name_ids:
                load_lines.setdefault(node.id, set()).add(node.lineno)

        def _referenced_in_kept(name: str) -> bool:
            # ``name`` is used by runtime code iff it has a ``Load`` on a line
            # that survives the strip (i.e. not in ``removed``).
            return any(line not in removed for line in load_lines.get(name, ()))

        # Fail fast: a stripped env declaration whose target the kept body still
        # references would leave a dangling ``NameError`` — env names are
        # authoring-only by contract.
        for names, _start, _end in env_assign_bindings:
            for name in names:
                if _referenced_in_kept(name):
                    raise AuthoringStripError(
                        f"TaskEnv authoring name {name!r} is referenced by the "
                        "baked runtime code, but its declaration is stripped "
                        "because it is a @task(env=...) environment. TaskEnv "
                        "values are authoring-only: do not reference them from "
                        "a task body or other runtime code. Move the runtime "
                        "use out, or keep the value as a plain runtime object "
                        "that is not used as @task(env=...)."
                    )

        for stmt, _start, _end in env_import_candidates:
            bound = _import_bound_names(stmt)
            env_bound = collected_env_names & bound.keys()
            other_bound = bound.keys() - env_bound
            # (a) Mixed import: an env-only name shares the statement with a
            #     runtime name that is actually used. We cannot line-delete just
            #     part of the statement, so fail fast with split guidance.
            used_others = sorted(n for n in other_bound if _referenced_in_kept(n))
            if used_others:
                raise AuthoringStripError(
                    "Import " + ", ".join(sorted(env_bound)) + " is a @task(env=...) environment but shares an import "
                    "statement with runtime name(s) "
                    + ", ".join(used_others)
                    + ". Split the import so TaskEnv env names are imported on "
                    "their own line (e.g. `from _envs import UPI` separate from "
                    "`from _envs import helper`); env imports are authoring-only "
                    "and stripped from the baked runtime program."
                )
            # (b) The env name itself is still referenced by kept runtime code.
            for name in sorted(env_bound):
                if _referenced_in_kept(name):
                    raise AuthoringStripError(
                        f"TaskEnv authoring name {name!r} is imported and "
                        "referenced by the baked runtime code, but its import is "
                        "stripped because it is a @task(env=...) environment. "
                        "TaskEnv values are authoring-only: do not reference "
                        "them from a task body or other runtime code."
                    )

    if not removed:
        return source_code

    kept = [line for i, line in enumerate(lines, 1) if i not in removed]
    return "".join(kept)
1204
+
1205
+
1206
+ # ============================================================================
1207
+ # Type hint stripping (replicating SDK strip_type_hints)
1208
+ # ============================================================================
1209
+
1210
+
1211
def _strip_type_hints(source_code: str) -> str:
    """Strip function type annotations from *source_code*, best-effort.

    Delegates to ``_strip_type_hints_ast``. Any failure there (unparseable
    input, or a surgical edit that would yield invalid Python) is downgraded
    to a warning and the source is returned unchanged.
    """
    try:
        return _strip_type_hints_ast(source_code)
    except Exception as e:
        warnings.warn(f"Failed to strip type hints (using source as-is): {e}")
        return source_code


def _byte_col_to_char_col(line: str, byte_col: int) -> int:
    """Convert a UTF-8 byte offset to a Python string character index.

    AST col_offset/end_col_offset are UTF-8 byte offsets, not character indices.
    For ASCII-only lines they're identical, but non-ASCII characters (e.g. "café")
    cause the two to diverge.
    """
    return len(line.encode("utf-8")[:byte_col].decode("utf-8", errors="replace"))


def _strip_type_hints_ast(source_code: str) -> str:
    """Strip type annotations from function definitions using the ast module.

    Removes parameter annotations (``: type``) and return annotations
    (``-> type``) from every ``def`` / ``async def`` (nested ones included).
    The AST is used only to locate the annotations; the text is then edited
    surgically so the original formatting and comments are preserved.

    Args:
        source_code: Python source to process.

    Returns:
        The source without function annotations, or *source_code* itself when
        there is nothing to remove.

    Raises:
        SyntaxError: if *source_code* does not parse, or if the edited text
            would no longer parse (e.g. a parenthesised annotation such as
            ``x: (int)``, whose closing paren lies outside the AST node).
            ``_strip_type_hints`` turns this into a warning plus fallback.
    """
    import io

    tree = ast.parse(source_code)
    # Split on \n, \r\n and \r only, exactly like the parser does.
    # ``str.splitlines`` also splits on form feed / \x1c-\x1e / \x85 / \u2028 /
    # \u2029, which would misalign ``lines`` with the AST ``lineno`` values.
    lines = io.StringIO(source_code, newline="").readlines()

    # Collect (start_line, start_col, end_line, end_col) ranges to remove.
    # All columns here are character indices (converted from AST byte offsets).
    removals: list[tuple[int, int, int, int]] = []

    for node in ast.walk(tree):
        if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            continue

        # --- Return annotation: remove " -> <type>" before the colon ---
        if node.returns is not None:
            ret = node.returns
            ret_start_line = ret.lineno  # 1-indexed
            ret_start_col = _byte_col_to_char_col(lines[ret_start_line - 1], ret.col_offset)
            ret_end_line = ret.end_lineno or ret_start_line
            ret_end_col = _byte_col_to_char_col(
                lines[ret_end_line - 1], ret.end_col_offset or (ret.col_offset + 1)
            )

            # Find the "->" token by scanning backwards from the annotation start.
            # The arrow may be on the same line as the type, or on a preceding
            # line, so we search backwards through lines. The search is bounded
            # by the "def" line to avoid matching a previous function.
            min_line_idx = node.lineno - 1  # 0-indexed; the "def" line
            arrow_line_idx = ret_start_line - 1  # 0-indexed
            arrow_pos = -1
            while arrow_line_idx >= min_line_idx:
                search_region = lines[arrow_line_idx]
                if arrow_line_idx == ret_start_line - 1:
                    # Only text before the annotation can hold its arrow.
                    search_region = search_region[:ret_start_col]
                arrow_pos = search_region.rfind("->")
                if arrow_pos != -1:
                    break
                arrow_line_idx -= 1

            if arrow_pos != -1:
                # Strip any whitespace before the arrow too
                strip_start = arrow_pos
                line_text = lines[arrow_line_idx]
                while strip_start > 0 and line_text[strip_start - 1] == " ":
                    strip_start -= 1
                removals.append((arrow_line_idx + 1, strip_start, ret_end_line, ret_end_col))

        # --- Parameter annotations: remove ": <type>" from each arg ---
        # Regular, positional-only and keyword-only parameters plus the
        # optional ``*args`` / ``**kwargs`` entries are all handled alike.
        signature = node.args
        candidates = (
            *signature.args,
            *signature.posonlyargs,
            *signature.kwonlyargs,
            signature.vararg,
            signature.kwarg,
        )
        for arg in candidates:
            if arg is None or arg.annotation is None:
                continue
            ann = arg.annotation
            # The removed span starts right after the parameter name and runs
            # to the end of the annotation expression.
            name_end_col = _byte_col_to_char_col(lines[arg.lineno - 1], arg.col_offset) + len(arg.arg)
            ann_end_line = ann.end_lineno or ann.lineno
            ann_end_col = _byte_col_to_char_col(
                lines[ann_end_line - 1], ann.end_col_offset or (ann.col_offset + 1)
            )
            removals.append((arg.lineno, name_end_col, ann_end_line, ann_end_col))

    if not removals:
        return source_code

    # Sort removals in reverse order so later removals don't affect earlier offsets
    removals.sort(key=lambda r: (r[0], r[1]), reverse=True)

    for start_line, start_col, end_line, end_col in removals:
        first_idx = start_line - 1
        last_idx = end_line - 1
        if first_idx == last_idx:
            # Single-line removal
            line = lines[first_idx]
            lines[first_idx] = line[:start_col] + line[end_col:]
        else:
            # Multi-line removal (rare but possible for complex annotations)
            lines[first_idx] = lines[first_idx][:start_col] + lines[last_idx][end_col:]
            del lines[first_idx + 1 : last_idx + 1]

    stripped = "".join(lines)
    # Guard against a text edit that corrupted the program: re-parse the
    # result and let SyntaxError propagate rather than return broken code.
    ast.parse(stripped)
    return stripped
1328
+
1329
+
1330
+ # ============================================================================
1331
+ # Dependencies reading
1332
+ # ============================================================================
1333
+
1334
+
1335
+ def read_dependencies(toml_path: Path) -> list[str]:
1336
+ """Read pip dependencies from a pyproject.toml or component TOML file."""
1337
+ with open(toml_path, "rb") as f:
1338
+ data = tomllib.load(f)
1339
+ # Standard pyproject.toml format
1340
+ deps = data.get("project", {}).get("dependencies", [])
1341
+ if deps:
1342
+ return list(deps)
1343
+ return []
1344
+
1345
+
1346
+ # ============================================================================
1347
+ # Code generation
1348
+ # ============================================================================
1349
+
1350
+
1351
def _build_argparse_code(spec: FunctionSpec) -> str:
    """Render the argparse wrapper script that invokes the component function.

    The emitted text, in order:

    1. type-specific definitions required by the parameters (looked up in
       ``_TYPE_DEFINITIONS``; ``import json`` when a return value serializes
       with ``json.dumps``), sorted and placed right before
       ``import argparse`` to match the Cloud-Pipelines SDK layout;
    2. parser construction and one ``add_argument`` per input / file output;
    3. for return outputs, the ``----output-paths`` argument;
    4. the call ``_outputs = <function>(**_parsed_args)``;
    5. for return outputs, serialization of each value to its output file.

    Returns the generated lines joined with newlines.
    """
    declared_params = spec.inputs + spec.outputs
    has_return_outputs = len(spec.return_params) > 0

    # A dict keyed by its own content serves as an insertion-ordered set, so
    # identical definitions are emitted once.
    prelude: dict[str, str] = {}
    for param in declared_params:
        if param.tangle_type and param.tangle_type in _TYPE_DEFINITIONS:
            snippet = _TYPE_DEFINITIONS[param.tangle_type]
            prelude[snippet] = snippet

    # Return outputs serialized through json.dumps need the json module.
    if has_return_outputs and any(
        _TYPE_TO_SERIALIZER.get(p.tangle_type or "String", "") == "json.dumps" for p in spec.return_params
    ):
        prelude["import json"] = "import json"

    code = sorted(prelude.values())
    code.append("import argparse")
    code.append(
        f"_parser = argparse.ArgumentParser(prog={repr(spec.component_name)}, "
        f"description={repr(spec.description or '')})"
    )

    # One CLI flag per input and per file-based output (OutputPath params).
    for param in declared_params:
        flag = "--" + param.yaml_name.replace("_", "-")
        # Outputs are always required; inputs unless marked optional.
        is_required = param.kind == "output" or not param.optional
        code.append(
            f'_parser.add_argument("{flag}", dest="{param.name}", '
            f"type={param.deserializer}, required={is_required}, "
            f"default=argparse.SUPPRESS)"
        )

    # ----output-paths receives one path per returned value.
    if has_return_outputs:
        n = len(spec.return_params)
        code.append(f'_parser.add_argument("----output-paths", dest="_output_paths", ' f"type=str, nargs={n})")

    code.append("_parsed_args = vars(_parser.parse_args())")
    if has_return_outputs:
        code.append('_output_files = _parsed_args.pop("_output_paths", [])')

    code.extend(["", f"_outputs = {spec.name}(**_parsed_args)"])

    if has_return_outputs:
        # A single (non-NamedTuple) return value is wrapped in a list so it
        # can be indexed alongside the serializers and output paths.
        if spec.single_return_output:
            code.append("_outputs = [_outputs]")

        code.append("")
        code.append("_output_serializers = [")
        for rp in spec.return_params:
            serializer = _TYPE_TO_SERIALIZER.get(rp.tangle_type or "String", "_serialize_str")
            code.append(f"    {serializer},")
        code.extend(
            [
                "]",
                "",
                "import os",
                "for idx, output_file in enumerate(_output_files):",
                "    try:",
                "        os.makedirs(os.path.dirname(output_file))",
                "    except OSError:",
                "        pass",
                "    with open(output_file, 'w') as f:",
                "        f.write(_output_serializers[idx](_outputs[idx]))",
            ]
        )

    return "\n".join(code)
1431
+
1432
+
1433
def _build_args_section(spec: FunctionSpec) -> list[Any]:
    """Build the YAML ``args`` list with input/output placeholders.

    For every input and file output, in declaration order, the ``--flag``
    (underscores in the YAML name become dashes) is followed by a placeholder
    dict: ``outputPath`` for kind ``"output"``, ``inputPath`` for kind
    ``"input_path"``, ``inputValue`` otherwise. Optional parameters are
    wrapped in an ``if`` / ``isPresent`` / ``then`` entry instead. When the
    function has return outputs, ``----output-paths`` plus one ``outputPath``
    placeholder per returned value is appended at the end.
    """
    placeholder_key_by_kind = {"output": "outputPath", "input_path": "inputPath"}
    section: list[Any] = []

    for param in spec.inputs + spec.outputs:
        flag = "--" + param.yaml_name.replace("_", "-")
        placeholder = {placeholder_key_by_kind.get(param.kind, "inputValue"): param.yaml_name}

        if not param.optional:
            section.extend((flag, placeholder))
            continue

        # Optional params are emitted only when a value is present.
        section.append(
            {
                "if": {
                    "cond": {"isPresent": param.yaml_name},
                    "then": [flag, placeholder],
                }
            }
        )

    # Return outputs are written through ----output-paths, one path each.
    if spec.return_params:
        section.append("----output-paths")
        section.extend({"outputPath": rp.yaml_name} for rp in spec.return_params)

    return section
1470
+
1471
+
1472
def _build_pip_install_command(deps: list[str]) -> list[str]:
    """Build the ``sh -c`` command prefix that pip-installs *deps* in the container.

    The returned prefix runs ``pip install`` (retrying with ``--user`` if the
    first attempt fails) and then executes the remaining command words via
    ``"$0" "$@"``.

    Args:
        deps: Pip requirement strings. Each one is converted with ``str()`` and
            quoted for the shell.

    Returns:
        ``["sh", "-c", <script>]``, or an empty list when there is nothing to
        install.
    """
    if not deps:
        return []

    def _sh_quote(text: str) -> str:
        # POSIX single-quoting: everything between '...' is literal, and an
        # embedded single quote is written as '"'"'. Python repr() is NOT shell
        # quoting: it backslash-escapes and may switch to double quotes, which
        # corrupts (or lets the shell expand) requirements that contain quotes,
        # backslashes or `$`. For ordinary requirements the output is identical
        # to what repr() produced.
        return "'" + text.replace("'", "'\"'\"'") + "'"

    quoted = " ".join(_sh_quote(str(d)) for d in deps)
    install_cmd = (
        "PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install "
        f"--quiet --no-warn-script-location {quoted}"
    )
    return [
        "sh",
        "-c",
        f'({install_cmd} || {install_cmd} --user) && "$0" "$@"',
    ]
1485
+
1486
+
1487
def _build_python_source(
    spec: FunctionSpec,
    mode: Literal["inline", "bundle"],
    bundled_modules_b64: str | None = None,
) -> str:
    """Compose the Python program text that gets embedded in the YAML.

    Sections are emitted in this order, separated by blank lines:

    1. the parent-directory helper, when any parameter is an ``output``;
    2. the ``_serialize_str`` helper, when a return output serializes with it;
    3. the bundled-module injection snippet (bundle mode with a payload only);
    4. the type-hint-stripped source (whole module when available, otherwise
       just the function);
    5. the generated argparse wrapper.

    Args:
        spec: Function specification to render.
        mode: ``"inline"`` or ``"bundle"``.
        bundled_modules_b64: Encoded payload of bundled local modules, used
            only in bundle mode.

    Returns:
        The program text, with runs of blank lines collapsed to a single blank
        line and exactly one trailing newline.
    """
    sections: list[str] = []

    if any(p.kind == "output" for p in spec.params):
        sections.append(_MAKE_PARENT_DIRS_HELPER)

    # Serializer names selected for the return outputs; unknown types fall
    # back to the string serializer.
    serializer_names = {
        _TYPE_TO_SERIALIZER.get(ret.tangle_type or "String", "_serialize_str")
        for ret in (spec.return_params or ())
    }
    if "_serialize_str" in serializer_names:
        sections.append(_SERIALIZE_STR_HELPER)

    if mode == "bundle" and bundled_modules_b64:
        sections.append(ModuleBundler.build_injection(bundled_modules_b64))

    # Prefer the full module source: it keeps helper functions, module-level
    # imports and constants that the target function may rely on.
    sections.append(spec.module_source_stripped or spec.source_code_stripped)

    sections.append(_build_argparse_code(spec))

    joined = "\n\n".join(sections)
    collapsed = re.sub(r"\n\n\n+", "\n\n", joined)
    return collapsed.strip("\n") + "\n"
1532
+
1533
+
1534
+ def _serialize_default(value: Any, tangle_type: str | None) -> str | None:
1535
+ """Serialize a default value to a string for YAML."""
1536
+ if value is inspect.Parameter.empty or value is None:
1537
+ return None
1538
+ if isinstance(value, str):
1539
+ return value
1540
+ if isinstance(value, bool):
1541
+ return str(value)
1542
+ if isinstance(value, (int, float)):
1543
+ return str(value)
1544
+ if isinstance(value, (list, dict)):
1545
+ return json.dumps(value, sort_keys=True)
1546
+ return str(value)
1547
+
1548
+
1549
+ # ============================================================================
1550
+ # Component YAML building
1551
+ # ============================================================================
1552
+
1553
+
1554
def build_component_dict(
    spec: FunctionSpec,
    container_image: str,
    dependencies: list[str],
    annotations: dict[str, str],
    mode: Literal["inline", "bundle"] = "inline",
    bundled_modules_b64: str | None = None,
) -> dict[str, Any]:
    """Assemble the full component structure that is later dumped as YAML.

    Args:
        spec: Extracted function specification.
        container_image: Docker image for the container.
        dependencies: Pip dependencies to install before running the program.
        annotations: Metadata annotations; attached under
            ``metadata.annotations`` when non-empty.
        mode: Generation mode, forwarded to the Python source builder.
        bundled_modules_b64: Encoded payload of bundled local modules (bundle
            mode only), forwarded to the Python source builder.

    Returns:
        Dict with ``name``, ``description``, optional ``metadata`` / ``inputs``
        / ``outputs`` sections, and ``implementation.container``.
    """
    # Inputs: name/type always; description, default and optional only when set.
    input_entries: list[dict[str, Any]] = []
    for inp in spec.inputs:
        entry: dict[str, Any] = {"name": inp.yaml_name, "type": inp.tangle_type}
        if inp.description:
            entry["description"] = inp.description
        has_default = not (inp.default is inspect.Parameter.empty or inp.default is None)
        if has_default:
            default_text = _serialize_default(inp.default, inp.tangle_type)
            if default_text is not None:
                entry["default"] = default_text
        if inp.optional:
            entry["optional"] = True
        input_entries.append(entry)

    # Outputs: OutputPath parameters plus return fields.
    output_entries: list[dict[str, Any]] = []
    for out in spec.all_outputs:
        out_entry: dict[str, Any] = {"name": out.yaml_name, "type": out.tangle_type}
        if out.description:
            out_entry["description"] = out.description
        output_entries.append(out_entry)

    # Implementation pieces (a copy of the dependency list is handed on).
    install_prefix = _build_pip_install_command(list(dependencies))
    program_text = _build_python_source(spec, mode, bundled_modules_b64)
    container_args = _build_args_section(spec)

    # The program text is passed as "$0"; the bootstrap writes it to a temp
    # file and runs it with the remaining arguments.
    bootstrap = textwrap.dedent("""\
        program_path=$(mktemp)
        printf "%s" "$0" > "$program_path"
        python3 -u "$program_path" "$@"
    """)

    container_command = install_prefix + ["sh", "-ec", bootstrap, program_text]

    # Tangle's schema rejects ``description: null``, so a generic placeholder
    # is used when the spec carries no description.
    component: dict[str, Any] = {
        "name": spec.component_name,
        "description": spec.description or f"{spec.component_name} component",
    }

    if annotations:
        component["metadata"] = {"annotations": annotations}

    # Empty sections are left out entirely.
    for section, entries in (("inputs", input_entries), ("outputs", output_entries)):
        if entries:
            component[section] = entries

    component["implementation"] = {
        "container": {
            "image": container_image,
            "command": container_command,
            "args": container_args,
        }
    }

    return component
1646
+
1647
+
1648
+ # ============================================================================
1649
+ # Top-level generation function
1650
+ # ============================================================================
1651
+
1652
+
1653
def generate_component_yaml(
    file_path: Path,
    output_path: Path,
    container_image: str,
    function_name: str | None = None,
    dependencies_from: Path | None = None,
    mode: Literal["inline", "bundle"] = "inline",
    custom_name: str | None = None,
    custom_annotations: dict[str, str] | None = None,
    strip_code: bool = False,
    strip_source_path: bool = False,
    resolve_root: Path | None = None,
    emit_generation_annotations: bool = True,
    path_annotation_mode: Literal["oss", "td_legacy"] = "oss",
) -> bool:
    """Generate a component YAML file from a Python function.

    The source file is read, and the YAML written, as UTF-8 regardless of the
    platform's locale encoding.

    Args:
        file_path: Path to the Python source file
        output_path: Where to write the generated YAML
        container_image: Docker image reference
        function_name: Function to extract (auto-detected if None)
        dependencies_from: Path to pyproject.toml with pip dependencies
        mode: "inline" for single-file, "bundle" for multi-file
        custom_name: Override the component name
        custom_annotations: Additional annotations to merge
        strip_code: Omit python_original_code annotation
        strip_source_path: Omit python_original_code_path annotation
        resolve_root: Root directory for resolving local module imports in bundle
            mode. Defaults to ``file_path.parent``. Set this when local modules
            live in sibling directories (e.g. ``src/utils`` alongside ``src/components``).
        emit_generation_annotations: Persist tangle-cli regeneration context
            annotations. Disable for downstream legacy snapshot compatibility.
        path_annotation_mode: ``"oss"`` always records source/YAML paths relative
            to their common ancestor. ``"td_legacy"`` only uses that relative
            common-root behavior inside a git checkout; outside git it records
            ``file_path.name`` / ``output_path.name`` like legacy tangle-deploy.

    Returns:
        True on success, False on failure (the error is reported through
        ``warnings.warn``).

    Raises:
        AuthoringStripError: Re-raised unchanged instead of being downgraded to
            a warning.
    """
    try:
        if path_annotation_mode not in {"oss", "td_legacy"}:
            raise ValueError("path_annotation_mode must be 'oss' or 'td_legacy'")

        # 1. Extract metadata from source (AST-based, before module loading)
        file_metadata, resolved_func_name = extract_file_metadata(file_path, function_name)
        if not resolved_func_name:
            raise ValueError(f"No public functions found in {file_path}")

        # 2. Load module and get function
        # Only add resolve_root to sys.path in bundle mode — in inline mode the
        # sibling modules won't be embedded, so letting the import succeed would
        # produce YAML that fails at runtime in the container.
        extra_paths = [resolve_root] if resolve_root and mode == "bundle" else None
        module = load_python_module(file_path, extra_sys_path=extra_paths)
        func = get_function_from_module(module, resolved_func_name)

        # 3. Extract interface, passing pre-computed metadata
        spec = extract_interface(func, docstring_metadata=file_metadata)
        if custom_name:
            spec.component_name = custom_name

        # Read the source once, explicitly as UTF-8 (Python's default source
        # encoding), and reuse it for both the stripped runtime program and the
        # verbatim python_original_code annotation.
        module_code = file_path.read_text(encoding="utf-8")

        # Populate full module source (preserves helper functions, imports, constants)
        # Remove cloud_pipelines import since it's only used for type annotations
        source_lines = [
            line
            for line in module_code.split("\n")
            if not line.strip().startswith(("from cloud_pipelines", "import cloud_pipelines"))
        ]
        filtered_source = "\n".join(source_lines)
        filtered_source = _strip_main_guard(filtered_source)
        # Strip python-pipeline authoring imports + @task/@pipeline/@subpipeline
        # decorators so the baked runtime program does not re-run the authoring
        # decorator (which would turn the function into a CallableRef and crash).
        # Operates on module_source_stripped only; python_original_code stays
        # byte-verbatim (it uses the untouched module_code).
        filtered_source = _strip_authoring_constructs(filtered_source)
        spec.module_source_stripped = _strip_type_hints(filtered_source)

        # 4. Read dependencies
        deps: list[str] = []
        if dependencies_from:
            deps = read_dependencies(dependencies_from)

        # 5. Build annotations
        directory = file_path.parent.resolve()

        annotations: dict[str, str] = {
            "cloud_pipelines.net": "true",
            "components new regenerate python-function-component": "true",
        }
        if not strip_source_path:
            # Provisional value; it is overwritten below once the path style is
            # known. Kept so the key's insertion position in the dict is unchanged.
            annotations["python_original_code_path"] = file_path.name
        if not strip_code:
            annotations["python_original_code"] = module_code

        # Add all docstring metadata to annotations (version, updated_at, custom keys)
        # Skip "name" and "description" since they're used for top-level fields, not annotations
        for key, value in spec.docstring_metadata.items():
            if key not in ("name", "description"):
                annotations[key] = value

        if deps:
            annotations["python_dependencies"] = json.dumps(deps)

        if emit_generation_annotations:
            annotations["tangle_cli_generation_function_name"] = resolved_func_name
            annotations["tangle_cli_generation_mode"] = mode

        # Use the common ancestor of source and output so both paths are clean
        # forward references (no ".."). This lets later local maintenance
        # commands find the source even when YAML is generated into a separate
        # output directory. TD legacy compatibility keeps basename-only paths
        # outside a git checkout to preserve historical snapshots.
        resolved_source = file_path.resolve()
        resolved_output = output_path.resolve()
        common_dir = Path(os.path.commonpath([resolved_source, resolved_output]))
        git_root = get_git_root(directory)
        use_common_paths = path_annotation_mode == "oss" or git_root is not None

        def _path_annotation(path: Path) -> str:
            """Return *path* relative to ``common_dir``, or its basename in legacy mode."""
            if use_common_paths:
                try:
                    return str(path.resolve().relative_to(common_dir))
                except ValueError:
                    # Not under the common ancestor: record the path as given.
                    return str(path)
            return path.name

        if not strip_source_path:
            annotations["python_original_code_path"] = _path_annotation(file_path)
        annotations["component_yaml_path"] = _path_annotation(output_path)
        if emit_generation_annotations:
            if dependencies_from:
                annotations["tangle_cli_generation_dependencies_from"] = _path_annotation(dependencies_from)
            if resolve_root:
                annotations["tangle_cli_generation_resolve_root"] = _path_annotation(resolve_root)

        # Git info — use the same common ancestor as git_relative_dir when common paths are active.
        if git_root:
            git_info = get_git_info(common_dir)
            git_info.pop("_git_root", None)
            # Override git_relative_dir to be the common ancestor
            try:
                git_info["git_relative_dir"] = str(common_dir.relative_to(git_root))
            except ValueError:
                pass
            annotations.update(git_info)
        else:
            git_info = get_git_info(directory)
            git_info.pop("_git_root", None)
            annotations.update(git_info)

        # Custom annotations
        if custom_annotations:
            annotations.update(custom_annotations)

        # Filter None values (annotation values must be strings)
        annotations = {k: v for k, v in annotations.items() if isinstance(v, str)}

        # 6. Handle bundle mode — embed source text of local modules
        # (not bytecode, which is Python-version-specific)
        bundled_modules_b64: str | None = None
        if mode == "bundle":
            module_sources = ModuleBundler.collect_sources(
                file_path,
                resolve_root=resolve_root,
                pip_deps=deps,
                source=spec.module_source_stripped,
            )
            if module_sources:
                bundled_modules_b64 = ModuleBundler.encode(module_sources)
                if bundled_modules_b64:
                    sorted_names = sorted(module_sources.keys(), key=lambda k: (k.count("."), k))
                    annotations["bundled_modules"] = json.dumps(sorted_names)

        # 7. Build and write YAML
        component = build_component_dict(
            spec=spec,
            container_image=container_image,
            dependencies=deps,
            annotations=annotations,
            mode=mode,
            bundled_modules_b64=bundled_modules_b64,
        )

        # Render before touching the filesystem so a serialization failure does
        # not leave a truncated output file behind.
        yaml_text = dump_yaml(component, width=120)

        output_path.parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(yaml_text)

        return True

    except AuthoringStripError:
        # TaskEnv authoring-violation (§3.5): fail LOUD with the actionable
        # guidance instead of swallowing it into a warning + False. A silent
        # False would only resurface later as a confusing missing/broken
        # component at hydrate or backend run time, defeating the
        # "fail fast with a clear generator error" intent. Every OTHER failure
        # keeps the conservative warn + return False behaviour below.
        raise
    except Exception as e:
        warnings.warn(f"Error generating component YAML: {e}")
        return False