dvcgen 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dvcgen/__init__.py ADDED
@@ -0,0 +1,21 @@
1
+ """Generate DVC pipeline files from Python declarations."""
2
+
3
# Keep in sync with the distribution metadata: this wheel is dvcgen 0.2.0,
# and the CLI's --version flag reports this value.
__version__ = "0.2.0"
4
+
5
+
6
def dep(path):
    """Mark ``path`` as a pipeline dependency.

    At runtime this is an identity function: the argument is handed back
    unchanged so the script can keep using it as an ordinary value.
    """
    declared_dependency = path
    return declared_dependency
9
+
10
+
11
def out(path):
    """Mark ``path`` as a pipeline output.

    At runtime this is an identity function: the argument is handed back
    unchanged so the script can keep using it as an ordinary value.
    """
    declared_output = path
    return declared_output
14
+
15
+
16
def param(name, default):
    """Mark ``name`` as a pipeline parameter with the given default.

    ``name`` is not used at runtime; the call simply evaluates to
    ``default`` so the script can run with that value.
    """
    runtime_value = default
    return runtime_value
19
+
20
+
21
+ __all__ = ["__version__", "dep", "out", "param"]
dvcgen/cli.py ADDED
@@ -0,0 +1,119 @@
1
+ """Command-line interface for dvcgen."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ from collections.abc import Sequence
7
+ from pathlib import Path
8
+ import sys
9
+ from typing import Optional, TextIO
10
+
11
+ from dvcgen import __version__
12
+ from dvcgen.generate import write_files
13
+ from dvcgen.inspect import inspect_files
14
+
15
+
16
def build_parser() -> argparse.ArgumentParser:
    """Create the argument parser for the ``dvcgen`` command.

    The parser accepts zero or more script paths plus the ``--output-dir``,
    ``--force`` and ``--version`` options.
    """
    cli_parser = argparse.ArgumentParser(
        prog="dvcgen",
        description="Generate dvc.yaml and params.yaml from Python pipeline scripts.",
    )
    add_option = cli_parser.add_argument

    # Positional script paths; the parser itself accepts an empty list.
    add_option("scripts", nargs="*", help="Python pipeline scripts to inspect.")
    add_option(
        "-o",
        "--output-dir",
        default=".",
        help="Directory where dvc.yaml and params.yaml are written. Defaults to the current directory.",
    )
    add_option(
        "-f",
        "--force",
        action="store_true",
        help="Overwrite existing dvc.yaml and params.yaml files.",
    )
    add_option("--version", action="version", version=f"%(prog)s {__version__}")

    return cli_parser
44
+
45
+
46
def main(
    argv: Optional[Sequence[str]] = None,
    stdout: Optional[TextIO] = None,
    stderr: Optional[TextIO] = None,
) -> int:
    """Run the dvcgen command-line interface.

    Args:
        argv: Command-line arguments without the program name. ``None`` is
            passed through to ``parse_args`` unchanged.
        stdout: Stream for the success message; defaults to ``sys.stdout``.
        stderr: Stream for error messages; defaults to ``sys.stderr``.

    Returns:
        ``0`` when both files were written, ``2`` when an error was reported.
    """
    stdout = sys.stdout if stdout is None else stdout
    stderr = sys.stderr if stderr is None else stderr

    args = build_parser().parse_args(argv)
    if not args.scripts:
        print("dvcgen: error: provide at least one Python pipeline script", file=stderr)
        print("Try 'dvcgen --help' for usage.", file=stderr)
        return 2

    script_paths = tuple(Path(script) for script in args.scripts)
    output_dir = Path(args.output_dir)
    dvc_path = output_dir / "dvc.yaml"
    params_path = output_dir / "params.yaml"

    try:
        validation_message = _validation_error(
            script_paths,
            output_dir,
            (dvc_path, params_path),
            args.force,
        )
    except OSError as os_error:
        # Validation touches the filesystem (it creates the output directory),
        # so report filesystem failures instead of letting them escape as a
        # traceback.
        print(f"dvcgen: error: {os_error}", file=stderr)
        return 2
    if validation_message is not None:
        print(f"dvcgen: error: {validation_message}", file=stderr)
        return 2

    try:
        declarations = inspect_files(script_paths)
        write_files(declarations, dvc_path=dvc_path, params_path=params_path)
    except SyntaxError as syntax_error:
        print(
            f"dvcgen: error: failed to parse {syntax_error.filename}: {syntax_error.msg}",
            file=stderr,
        )
        return 2
    except (OSError, ValueError, TypeError) as error:
        # TypeError is what the YAML serializer raises for a parameter default
        # it cannot represent; treat it as bad input like the other cases.
        print(f"dvcgen: error: {error}", file=stderr)
        return 2

    print(f"Wrote {dvc_path} and {params_path}", file=stdout)
    return 0
93
+
94
+
95
def _validation_error(
    script_paths: Sequence[Path],
    output_dir: Path,
    output_paths: Sequence[Path],
    force: bool,
) -> Optional[str]:
    """Validate CLI inputs and make sure the output directory exists.

    Args:
        script_paths: Input scripts; each must exist, be a regular file and
            have a ``.py`` suffix.
        output_dir: Directory for generated files. It is created (including
            parents) when missing.
        output_paths: Files that are about to be written.
        force: When false, any already-existing entry of ``output_paths`` is
            reported as an error.

    Returns:
        A human-readable error message for the first problem found, or
        ``None`` when everything is valid.
    """
    for script_path in script_paths:
        if not script_path.exists():
            return f"input script not found: {script_path}"
        if not script_path.is_file():
            return f"input script is not a file: {script_path}"
        if script_path.suffix != ".py":
            return f"input script must be a .py file: {script_path}"

    if output_dir.exists() and not output_dir.is_dir():
        return f"output directory is not a directory: {output_dir}"
    try:
        output_dir.mkdir(parents=True, exist_ok=True)
    except OSError as error:
        # Creation can fail (permissions, a parent that is a regular file, ...).
        # Report it as a validation message rather than raising to the caller.
        return f"cannot create output directory {output_dir}: {error}"

    if not force:
        existing_paths = [path for path in output_paths if path.exists()]
        if existing_paths:
            joined_paths = ", ".join(str(path) for path in existing_paths)
            return f"refusing to overwrite existing file(s): {joined_paths}; use --force to replace them"

    return None
dvcgen/generate.py ADDED
@@ -0,0 +1,145 @@
1
+ """Generate DVC configuration files from extracted declarations."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from collections.abc import Iterable, Mapping, Sequence
7
+ from pathlib import Path
8
+ from typing import Any
9
+
10
+ from dvcgen.inspect import SourceDeclarations
11
+
12
+
13
def dvc_document(declarations: Iterable[SourceDeclarations]) -> dict[str, Any]:
    """Assemble the ``dvc.yaml`` structure for the given declarations.

    Sources are processed in stage-name order. Each stage runs
    ``python <source>`` and lists the script itself as its first dependency,
    followed by the declared dependencies. ``outs`` and ``params`` are only
    present when the source declares any.

    Raises:
        ValueError: If two sources map to the same stage name.
    """
    stages: dict[str, dict[str, Any]] = {}
    ordered_sources = sorted(declarations, key=_stage_name)

    for item in ordered_sources:
        name = _stage_name(item)
        if name in stages:
            raise ValueError(f"duplicate stage name: {name}")

        script = item.source
        dependencies = [script]
        dependencies.extend(declared.path for declared in item.deps)

        entry: dict[str, Any] = {"cmd": f"python {script}", "deps": dependencies}
        if item.outs:
            entry["outs"] = [declared.path for declared in item.outs]
        if item.params:
            entry["params"] = sorted(declared.name for declared in item.params)

        stages[name] = entry

    return {"stages": stages}
38
+
39
+
40
def params_document(declarations: Iterable[SourceDeclarations]) -> dict[str, Any]:
    """Assemble the ``params.yaml`` structure for the given declarations.

    Sources are visited in stage-name order and their parameters in name
    order; dotted parameter names become nested mappings.
    """
    document: dict[str, Any] = {}

    for item in sorted(declarations, key=_stage_name):
        ordered_params = sorted(item.params, key=lambda declared: declared.name)
        for declared in ordered_params:
            _assign_dotted(document, declared.name, declared.default)

    return document
49
+
50
+
51
def write_files(
    declarations: Sequence[SourceDeclarations],
    dvc_path: str | Path = "dvc.yaml",
    params_path: str | Path = "params.yaml",
) -> None:
    """Write dvc.yaml and params.yaml for the supplied declarations.

    Args:
        declarations: Declarations extracted from the pipeline scripts.
        dvc_path: Destination of the generated pipeline file.
        params_path: Destination of the generated parameters file.

    Raises:
        ValueError: If the documents cannot be built (for example duplicate
            stage names or invalid/conflicting parameter names).
        TypeError: If a value cannot be serialized.
        OSError: If a file cannot be written.
    """
    # Render both documents before touching the filesystem. A failure while
    # building params.yaml must not leave a freshly overwritten dvc.yaml
    # behind without its matching parameters file.
    dvc_text = dump_yaml(dvc_document(declarations))
    params_text = dump_yaml(params_document(declarations))

    Path(dvc_path).write_text(dvc_text, encoding="utf-8")
    Path(params_path).write_text(params_text, encoding="utf-8")
65
+
66
+
67
def dump_yaml(value: Any) -> str:
    """Render ``value`` as YAML text: one rendered line per row, newline-terminated."""
    rendered_lines = _yaml_lines(value, indent=0)
    body = "\n".join(rendered_lines)
    return f"{body}\n"
70
+
71
+
72
def _stage_name(declarations: SourceDeclarations) -> str:
    """Return the stage name: the source file name without its final suffix."""
    source_path = Path(declarations.source)
    return source_path.stem
74
+
75
+
76
def _assign_dotted(document: dict[str, Any], name: str, value: Any) -> None:
    """Store ``value`` in ``document`` under the dotted key ``name``.

    Every segment before the last one names a nested dictionary, created on
    demand. ``document`` is modified in place.

    Raises:
        ValueError: If ``name`` has an empty segment, if an intermediate
            segment already holds a non-dictionary value, or if the final
            segment already holds a dictionary.
    """
    segments = name.split(".")
    if "" in segments:
        raise ValueError(f"invalid parameter name: {name}")

    *parent_keys, leaf_key = segments

    node = document
    for key in parent_keys:
        if key not in node:
            node[key] = {}
        child = node[key]
        if not isinstance(child, dict):
            raise ValueError(f"conflicting parameter name: {name}")
        node = child

    # An existing dictionary at the leaf means other parameters live below it.
    if isinstance(node.get(leaf_key), dict):
        raise ValueError(f"conflicting parameter name: {name}")
    node[leaf_key] = value
92
+
93
+
94
def _yaml_lines(value: Any, indent: int) -> list[str]:
    """Render ``value`` as YAML lines, each indented by ``indent`` spaces.

    Mappings and sequences (lists and tuples) are rendered in block style;
    everything else is rendered as a single scalar line.

    Raises:
        TypeError: If ``value`` contains something that cannot be serialized.
    """
    if isinstance(value, Mapping):
        return _mapping_lines(value, indent)
    if isinstance(value, (list, tuple)):
        return _list_lines(value, indent)
    return [" " * indent + _scalar(value)]


def _is_nested_block(item: Any) -> bool:
    """Return True for a non-empty mapping or sequence, which needs its own block."""
    return isinstance(item, (Mapping, list, tuple)) and bool(item)


def _mapping_lines(value: Mapping[str, Any], indent: int) -> list[str]:
    """Render a mapping in block style with its keys in sorted order."""
    lines: list[str] = []
    prefix = " " * indent

    for key in sorted(value):
        item = value[key]
        yaml_key = _string(key)
        if _is_nested_block(item):
            lines.append(f"{prefix}{yaml_key}:")
            lines.extend(_yaml_lines(item, indent + 2))
        else:
            # Scalars and *empty* containers go on the key's own line. A bare
            # "key:" with nothing below it would be read back as null.
            lines.append(f"{prefix}{yaml_key}: {_scalar(item)}")

    return lines


def _list_lines(value: Sequence[Any], indent: int) -> list[str]:
    """Render a sequence in block style, one ``-`` entry per item."""
    lines: list[str] = []
    prefix = " " * indent

    for item in value:
        if _is_nested_block(item):
            lines.append(f"{prefix}-")
            lines.extend(_yaml_lines(item, indent + 2))
        else:
            lines.append(f"{prefix}- {_scalar(item)}")

    return lines


def _float(value: float) -> str:
    """Render a float so that YAML loaders read it back as a float."""
    if value != value:  # NaN is the only value that is not equal to itself.
        return ".nan"
    if value in (float("inf"), float("-inf")):
        return ".inf" if value > 0 else "-.inf"

    text = str(value)
    mantissa, marker, exponent = text.partition("e")
    if marker and "." not in mantissa:
        # "1e-05" has no decimal point, and YAML 1.1 loaders treat it as a
        # string; "1.0e-05" is a float for both YAML 1.1 and 1.2 loaders.
        text = f"{mantissa}.0e{exponent}"
    return text


def _scalar(value: Any) -> str:
    """Render a scalar, or an empty container in flow style, as YAML text.

    Raises:
        TypeError: If ``value`` is of an unsupported type.
    """
    # bool must be tested before int because bool is a subclass of int.
    if isinstance(value, bool):
        return "true" if value else "false"
    if value is None:
        return "null"
    if isinstance(value, float):
        return _float(value)
    if isinstance(value, int):
        return str(value)
    if isinstance(value, str):
        return _string(value)
    if isinstance(value, Mapping) and not value:
        return "{}"
    if isinstance(value, (list, tuple)) and not value:
        return "[]"
    raise TypeError(f"unsupported YAML value: {value!r}")


def _string(value: str) -> str:
    """Quote ``value`` as a JSON string literal for use as a YAML scalar or key."""
    return json.dumps(value)
dvcgen/inspect.py ADDED
@@ -0,0 +1,160 @@
1
+ """Inspect Python pipeline scripts for dvcgen declarations."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import ast
6
+ from dataclasses import dataclass
7
+ from pathlib import Path
8
+ from typing import Any, Iterable, Optional, Union
9
+
10
+
11
@dataclass(frozen=True)
class PathDeclaration:
    """Immutable record of one ``dep(...)`` or ``out(...)`` assignment.

    Attributes:
        target: Name of the variable the call result is assigned to.
        path: The literal path string passed to the call.
        lineno: Source line number of the call.
    """

    target: str
    path: str
    lineno: int
18
+
19
+
20
@dataclass(frozen=True)
class ParamDeclaration:
    """Immutable record of one ``param(...)`` assignment.

    Attributes:
        target: Name of the variable the call result is assigned to.
        name: The literal parameter name passed as the first argument.
        default: The literal default value passed as the second argument.
        lineno: Source line number of the call.
    """

    target: str
    name: str
    default: Any
    lineno: int
28
+
29
+
30
@dataclass(frozen=True)
class SourceDeclarations:
    """Immutable bundle of everything declared by one Python source file.

    Attributes:
        source: Identifier of the inspected source (the file path as a string
            when the source was read from a file).
        deps: Dependency declarations, in source order.
        outs: Output declarations, in source order.
        params: Parameter declarations, in source order.
    """

    source: str
    deps: tuple[PathDeclaration, ...]
    outs: tuple[PathDeclaration, ...]
    params: tuple[ParamDeclaration, ...]
38
+
39
+
40
+ PathLike = Union[str, Path]
41
+
42
+
43
def inspect_file(path: PathLike) -> SourceDeclarations:
    """Read the file at ``path`` as UTF-8 and extract its dvcgen declarations.

    The path, converted to a string, is recorded as the declarations' source.
    """
    script = Path(path)
    text = script.read_text(encoding="utf-8")
    return inspect_source(text, source=str(script))
50
+
51
+
52
def inspect_files(paths: Iterable[PathLike]) -> tuple[SourceDeclarations, ...]:
    """Inspect every path in ``paths`` and return one result per file, in input order."""
    return tuple(map(inspect_file, paths))
55
+
56
+
57
def inspect_source(source_code: str, source: str = "<string>") -> SourceDeclarations:
    """Extract dvcgen declarations from Python source text without running it.

    Only module-level assignments of the form ``NAME = dep(...)``,
    ``NAME = out(...)`` or ``NAME = param(...)`` (optionally annotated) are
    recognised. Calls whose arguments are not supported literals are skipped.

    Args:
        source_code: The Python source to parse.
        source: Name recorded in the result and passed to the parser as the
            file name.

    Raises:
        SyntaxError: If ``source_code`` cannot be parsed.
    """
    module = ast.parse(source_code, filename=source)
    found_deps: list[PathDeclaration] = []
    found_outs: list[PathDeclaration] = []
    found_params: list[ParamDeclaration] = []

    for node in module.body:
        name = _assignment_target(node)
        if name is None:
            continue

        call = _assignment_value(node)
        if not isinstance(call, ast.Call):
            continue

        helper = _simple_call_name(call)
        if helper in ("dep", "out"):
            declared_path = _path_declaration(name, call)
            if declared_path is not None:
                bucket = found_deps if helper == "dep" else found_outs
                bucket.append(declared_path)
        elif helper == "param":
            declared_param = _param_declaration(name, call)
            if declared_param is not None:
                found_params.append(declared_param)

    return SourceDeclarations(
        source=source,
        deps=tuple(found_deps),
        outs=tuple(found_outs),
        params=tuple(found_params),
    )
93
+
94
+
95
def _assignment_target(statement: ast.stmt) -> Optional[str]:
    """Return the variable name bound by a simple assignment, else ``None``.

    Accepted forms are a plain assignment with exactly one target and an
    annotated assignment flagged as simple; in both cases the target must be
    a bare name.
    """
    if isinstance(statement, ast.Assign):
        if len(statement.targets) != 1:
            return None
        candidate = statement.targets[0]
    elif isinstance(statement, ast.AnnAssign):
        if not statement.simple:
            return None
        candidate = statement.target
    else:
        return None

    return candidate.id if isinstance(candidate, ast.Name) else None
106
+
107
+
108
def _assignment_value(statement: ast.stmt) -> Optional[ast.expr]:
    """Return the right-hand side of a plain or annotated assignment.

    The result is ``None`` for other statements and for annotated
    assignments that carry no value.
    """
    if isinstance(statement, (ast.Assign, ast.AnnAssign)):
        return statement.value
    return None
114
+
115
+
116
def _simple_call_name(call: ast.Call) -> Optional[str]:
    """Return the called function's name when it is a bare identifier, else ``None``."""
    callee = call.func
    return callee.id if isinstance(callee, ast.Name) else None
120
+
121
+
122
def _path_declaration(target: str, call: ast.Call) -> Optional[PathDeclaration]:
    """Build a path declaration from a ``dep``/``out`` call, if it is supported.

    The call must have exactly one positional argument, no keyword arguments,
    and that argument must be a string literal; otherwise ``None`` is returned.
    """
    if call.keywords or len(call.args) != 1:
        return None

    (argument,) = call.args
    literal_path = _literal(argument)
    if isinstance(literal_path, str):
        return PathDeclaration(target=target, path=literal_path, lineno=call.lineno)
    return None
131
+
132
+
133
def _param_declaration(target: str, call: ast.Call) -> Optional[ParamDeclaration]:
    """Build a parameter declaration from a ``param`` call, if it is supported.

    The call must have exactly two positional arguments and no keyword
    arguments. The first must be a string literal (the parameter name) and
    the second any supported literal (the default); otherwise ``None`` is
    returned.
    """
    if call.keywords or len(call.args) != 2:
        return None

    name_node, default_node = call.args

    param_name = _literal(name_node)
    if not isinstance(param_name, str):
        return None

    default_value = _literal(default_node)
    if default_value is _UNSUPPORTED:
        return None

    return ParamDeclaration(
        target=target,
        name=param_name,
        default=default_value,
        lineno=call.lineno,
    )
151
+
152
+
153
# Unique marker returned by ``_literal`` when a node is not a supported literal.
# ``None`` cannot play this role because it is itself a valid literal value.
_UNSUPPORTED = object()


def _literal(node: ast.AST) -> Any:
    """Evaluate ``node`` as a Python literal without executing any code.

    Returns the literal's value, or ``_UNSUPPORTED`` when the node is not a
    literal that ``ast.literal_eval`` accepts.
    """
    try:
        evaluated = ast.literal_eval(node)
    except (ValueError, TypeError):
        return _UNSUPPORTED
    return evaluated
@@ -0,0 +1,175 @@
1
+ Metadata-Version: 2.4
2
+ Name: dvcgen
3
+ Version: 0.2.0
4
+ Summary: Generate DVC pipeline files from Python declarations.
5
+ Author: pillyshi
6
+ License: MIT
7
+ Keywords: dvc,params,pipeline,yaml
8
+ Classifier: Development Status :: 2 - Pre-Alpha
9
+ Classifier: Environment :: Console
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3 :: Only
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Programming Language :: Python :: 3.13
17
+ Classifier: Topic :: Software Development :: Build Tools
18
+ Requires-Python: >=3.10
19
+ Description-Content-Type: text/markdown
20
+
21
+ # dvcgen
22
+
23
+ Write your DVC pipeline once, in Python.
24
+
25
+ `dvcgen` is an early-stage command-line tool for generating DVC pipeline files
26
+ from lightweight declarations embedded in Python pipeline scripts.
27
+
28
+ ## Current Status
29
+
30
+ Implemented:
31
+
32
+ - A Python package named `dvcgen`
33
+ - A `dvcgen` console command
34
+ - CLI argument parsing for pipeline script paths
35
+ - CLI input validation and overwrite protection
36
+ - Public declaration helpers: `dep()`, `out()`, and `param()`
37
+ - Python script inspection for top-level literal declarations
38
+ - `dvc.yaml` generation
39
+ - `params.yaml` generation
40
+
41
+ ## Installation
42
+
43
+ ```bash
44
+ uv tool install dvcgen
45
+ ```
46
+
47
+ Or run without installing:
48
+
49
+ ```bash
50
+ uvx dvcgen --help
51
+ ```
52
+
53
+ ## Usage
54
+
55
+ Show CLI help:
56
+
57
+ ```bash
58
+ dvcgen --help
59
+ ```
60
+
61
+ Generate DVC files from one or more Python pipeline scripts:
62
+
63
+ ```bash
64
+ dvcgen pipeline/*.py
65
+ ```
66
+
67
+ The command writes `dvc.yaml` and `params.yaml` in the current directory.
68
+ Stage names are derived from input Python filenames. For example,
69
+ `pipeline/train.py` becomes the `train` stage.
70
+
71
+ By default, `dvcgen` refuses to overwrite existing `dvc.yaml` or `params.yaml`
72
+ files. Use `--force` when you intentionally want to replace them:
73
+
74
+ ```bash
75
+ dvcgen --force pipeline/*.py
76
+ ```
77
+
78
+ Write files to another directory with `--output-dir`:
79
+
80
+ ```bash
81
+ dvcgen --output-dir generated pipeline/*.py
82
+ ```
83
+
84
+ Bad inputs fail with an error message and a non-zero exit code. Successful runs
85
+ print the files that were written.
86
+
87
+ Inspect declarations from Python without executing the pipeline script:
88
+
89
+ ```python
90
+ from dvcgen.inspect import inspect_file
91
+
92
+ declarations = inspect_file("pipeline/train.py")
93
+ print(declarations.deps)
94
+ print(declarations.outs)
95
+ print(declarations.params)
96
+ ```
97
+
98
+ ## Release
99
+
100
+ Publishing is intentionally manual while the project is early stage. Build and
101
+ validate artifacts before uploading anything:
102
+
103
+ ```bash
104
+ uv run python -m build
105
+ uv run twine check dist/*
106
+ ```
107
+
108
+ Use TestPyPI first when rehearsing a release. Create a TestPyPI API token, then
109
+ upload with the token as the password:
110
+
111
+ ```bash
112
+ uv run twine upload --repository testpypi dist/*
113
+ ```
114
+
115
+ Use the production PyPI repository only when the version, changelog, and package
116
+ name decision are ready:
117
+
118
+ ```bash
119
+ uv run twine upload dist/*
120
+ ```
121
+
122
+ For both repositories, use `__token__` as the username and the repository API
123
+ token as the password. Avoid committing tokens or storing them in project files.
124
+
125
+ Before the first production upload, decide whether to publish the current
126
+ minimal release to reserve the `dvcgen` package name on PyPI. Once a version is
127
+ uploaded to PyPI or TestPyPI, that exact version cannot be uploaded again; bump
128
+ the version before retrying with changed artifacts.
129
+
130
+ ## Planned MVP
131
+
132
+ The intended MVP is:
133
+
134
+ 1. Pipeline scripts declare dependencies, outputs, and parameters in Python.
135
+ 2. `dvcgen` inspects those declarations without executing the scripts.
136
+ 3. `dvcgen` writes `dvc.yaml` and `params.yaml`.
137
+
138
+ Example API:
139
+
140
+ ```python
141
+ from dvcgen import dep, out, param
142
+
143
+ TRAIN_DATA = dep("data/processed.csv")
144
+ MODEL = out("models/model.pkl")
145
+
146
+ LR = param("train.lr", 0.001)
147
+ ```
148
+
149
+ Running:
150
+
151
+ ```bash
152
+ dvcgen pipeline/train.py
153
+ ```
154
+
155
+ Generates `dvc.yaml`:
156
+
157
+ ```yaml
158
+ "stages":
159
+ "train":
160
+ "cmd": "python pipeline/train.py"
161
+ "deps":
162
+ - "pipeline/train.py"
163
+ - "data/processed.csv"
164
+ "outs":
165
+ - "models/model.pkl"
166
+ "params":
167
+ - "train.lr"
168
+ ```
169
+
170
+ And `params.yaml`:
171
+
172
+ ```yaml
173
+ "train":
174
+ "lr": 0.001
175
+ ```
@@ -0,0 +1,8 @@
1
+ dvcgen/__init__.py,sha256=Mlag_PpW7bf5LYn21xI__c3VgGEhT4sUXYLzOmxRK8w,460
2
+ dvcgen/cli.py,sha256=9kTsfZB0do6IydRHDo6UhSWQItiXUA2H_UbA-Q2o_Kw,3718
3
+ dvcgen/generate.py,sha256=jB0Wcm3RVvClXOpTIUiIzYoC7F_x6YGo1tt5b1lYDmc,4450
4
+ dvcgen/inspect.py,sha256=hU5DH_i3AYJIYuGw9Izmbj8wQGNYjuQdA-r6254KC20,4365
5
+ dvcgen-0.2.0.dist-info/METADATA,sha256=vkLfDAi2peIR8h1Bi5QVjWRYfvHG3HdE3N0gS0-xoXI,4141
6
+ dvcgen-0.2.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
7
+ dvcgen-0.2.0.dist-info/entry_points.txt,sha256=94FKuP8KZnWqZT3IOLj1TIrd4O_w8ycrt7dR_sJSUOk,43
8
+ dvcgen-0.2.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.30.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ dvcgen = dvcgen.cli:main