pypdown 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pypdown-0.1.0/PKG-INFO +89 -0
- pypdown-0.1.0/README.md +79 -0
- pypdown-0.1.0/pyproject.toml +38 -0
- pypdown-0.1.0/src/pypdown/__init__.py +3 -0
- pypdown-0.1.0/src/pypdown/models.py +66 -0
- pypdown-0.1.0/src/pypdown/run.py +55 -0
- pypdown-0.1.0/tests/__init__.py +0 -0
- pypdown-0.1.0/tests/long_test.py +76 -0
- pypdown-0.1.0/tests/package_test.py +2 -0
- pypdown-0.1.0/tests/requirements.txt +2 -0
- pypdown-0.1.0/tests/simple_test.py +43 -0
pypdown-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: pypdown
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A Pydantic model-based approach to data pipelining with file I/O linting.
|
|
5
|
+
Author-Email: Louis Maddox <louismmx@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Requires-Python: >=3.12
|
|
8
|
+
Requires-Dist: pydantic>=2.8.2
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
|
|
11
|
+
# pypdown
|
|
12
|
+
|
|
13
|
+
A Pydantic model-based approach to data pipelining with file I/O linting.
|
|
14
|
+
|
|
15
|
+
[](https://badge.fury.io/py/pypdown)
|
|
16
|
+
[](https://pypi.org/project/pypdown/)
|
|
17
|
+
[](https://opensource.org/licenses/MIT)
|
|
18
|
+
[](https://pypdown.vercel.app/)
|
|
19
|
+
[](https://github.com/lmmx/pypdown/actions/workflows/ci.yml)
|
|
20
|
+
[](https://results.pre-commit.ci/latest/github/lmmx/pypdown/master)
|
|
21
|
+
|
|
22
|
+
## Features
|
|
23
|
+
|
|
24
|
+
- Pydantic model-based approach to data pipelining
|
|
25
|
+
- File I/O linting for robust pipeline execution
|
|
26
|
+
- Easy-to-use API for defining and running pipeline steps
|
|
27
|
+
- Support for callback functions and keyword argument-based file paths
|
|
28
|
+
|
|
29
|
+
## Installation
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
pip install pypdown
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
## Quick Start
|
|
36
|
+
|
|
37
|
+
```python
|
|
38
|
+
from pypdown import run_step
|
|
39
|
+
from pypdown.models import Step
|
|
40
|
+
from pydantic import BaseModel
|
|
41
|
+
from pathlib import Path
|
|
42
|
+
|
|
43
|
+
class StepParams(BaseModel):
|
|
44
|
+
input_file: Path = "input.txt"
|
|
45
|
+
output_file: Path = "output.txt"
|
|
46
|
+
final_file: Path = "final.txt"
|
|
47
|
+
|
|
48
|
+
def process_input(input_file: Path, output_file: Path, config: StepParams):
|
|
49
|
+
"""Process input file and create output file."""
|
|
50
|
+
output_file.write_text(input_file.read_text().upper())
|
|
51
|
+
|
|
52
|
+
def finalize_output(output_file: Path, final_file: Path, config: StepParams):
|
|
53
|
+
"""Process output file and create final file."""
|
|
54
|
+
final_file.write_text(f"Processed: {output_file.read_text()}")
|
|
55
|
+
|
|
56
|
+
config = StepParams()
|
|
57
|
+
|
|
58
|
+
# Define your pipeline tasks
|
|
59
|
+
tasks = [
|
|
60
|
+
{
|
|
61
|
+
"src": {"input_file": config.input_file},
|
|
62
|
+
"dst": {"output_file": config.output_file},
|
|
63
|
+
"fn": process_input,
|
|
64
|
+
},
|
|
65
|
+
{
|
|
66
|
+
"src": {"output_file": config.output_file},
|
|
67
|
+
"dst": {"final_file": config.final_file},
|
|
68
|
+
"fn": finalize_output,
|
|
69
|
+
},
|
|
70
|
+
]
|
|
71
|
+
|
|
72
|
+
# Create a Step
|
|
73
|
+
step = Step(name="Example Pipeline Step", tasks=tasks, config=config)
|
|
74
|
+
|
|
75
|
+
# Run the step
|
|
76
|
+
run_step(step)
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
## Documentation
|
|
80
|
+
|
|
81
|
+
For full documentation, please visit [pypdown.vercel.app](https://pypdown.vercel.app/).
|
|
82
|
+
|
|
83
|
+
## Contributing
|
|
84
|
+
|
|
85
|
+
Contributions are welcome! Please feel free to submit a Pull Request.
|
|
86
|
+
|
|
87
|
+
## License
|
|
88
|
+
|
|
89
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
pypdown-0.1.0/README.md
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
# pypdown
|
|
2
|
+
|
|
3
|
+
A Pydantic model-based approach to data pipelining with file I/O linting.
|
|
4
|
+
|
|
5
|
+
[](https://badge.fury.io/py/pypdown)
|
|
6
|
+
[](https://pypi.org/project/pypdown/)
|
|
7
|
+
[](https://opensource.org/licenses/MIT)
|
|
8
|
+
[](https://pypdown.vercel.app/)
|
|
9
|
+
[](https://github.com/lmmx/pypdown/actions/workflows/ci.yml)
|
|
10
|
+
[](https://results.pre-commit.ci/latest/github/lmmx/pypdown/master)
|
|
11
|
+
|
|
12
|
+
## Features
|
|
13
|
+
|
|
14
|
+
- Pydantic model-based approach to data pipelining
|
|
15
|
+
- File I/O linting for robust pipeline execution
|
|
16
|
+
- Easy-to-use API for defining and running pipeline steps
|
|
17
|
+
- Support for callback functions and keyword argument-based file paths
|
|
18
|
+
|
|
19
|
+
## Installation
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
pip install pypdown
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## Quick Start
|
|
26
|
+
|
|
27
|
+
```python
|
|
28
|
+
from pypdown import run_step
|
|
29
|
+
from pypdown.models import Step
|
|
30
|
+
from pydantic import BaseModel
|
|
31
|
+
from pathlib import Path
|
|
32
|
+
|
|
33
|
+
class StepParams(BaseModel):
|
|
34
|
+
input_file: Path = "input.txt"
|
|
35
|
+
output_file: Path = "output.txt"
|
|
36
|
+
final_file: Path = "final.txt"
|
|
37
|
+
|
|
38
|
+
def process_input(input_file: Path, output_file: Path, config: StepParams):
|
|
39
|
+
"""Process input file and create output file."""
|
|
40
|
+
output_file.write_text(input_file.read_text().upper())
|
|
41
|
+
|
|
42
|
+
def finalize_output(output_file: Path, final_file: Path, config: StepParams):
|
|
43
|
+
"""Process output file and create final file."""
|
|
44
|
+
final_file.write_text(f"Processed: {output_file.read_text()}")
|
|
45
|
+
|
|
46
|
+
config = StepParams()
|
|
47
|
+
|
|
48
|
+
# Define your pipeline tasks
|
|
49
|
+
tasks = [
|
|
50
|
+
{
|
|
51
|
+
"src": {"input_file": config.input_file},
|
|
52
|
+
"dst": {"output_file": config.output_file},
|
|
53
|
+
"fn": process_input,
|
|
54
|
+
},
|
|
55
|
+
{
|
|
56
|
+
"src": {"output_file": config.output_file},
|
|
57
|
+
"dst": {"final_file": config.final_file},
|
|
58
|
+
"fn": finalize_output,
|
|
59
|
+
},
|
|
60
|
+
]
|
|
61
|
+
|
|
62
|
+
# Create a Step
|
|
63
|
+
step = Step(name="Example Pipeline Step", tasks=tasks, config=config)
|
|
64
|
+
|
|
65
|
+
# Run the step
|
|
66
|
+
run_step(step)
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Documentation
|
|
70
|
+
|
|
71
|
+
For full documentation, please visit [pypdown.vercel.app](https://pypdown.vercel.app/).
|
|
72
|
+
|
|
73
|
+
## Contributing
|
|
74
|
+
|
|
75
|
+
Contributions are welcome! Please feel free to submit a Pull Request.
|
|
76
|
+
|
|
77
|
+
## License
|
|
78
|
+
|
|
79
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = [
|
|
3
|
+
"pdm-backend",
|
|
4
|
+
]
|
|
5
|
+
build-backend = "pdm.backend"
|
|
6
|
+
|
|
7
|
+
[project]
|
|
8
|
+
name = "pypdown"
|
|
9
|
+
version = "0.1.0"
|
|
10
|
+
description = "A Pydantic model-based approach to data pipelining with file I/O linting."
|
|
11
|
+
authors = [
|
|
12
|
+
{ name = "Louis Maddox", email = "louismmx@gmail.com" },
|
|
13
|
+
]
|
|
14
|
+
dependencies = [
|
|
15
|
+
"pydantic>=2.8.2",
|
|
16
|
+
]
|
|
17
|
+
requires-python = ">=3.12"
|
|
18
|
+
readme = "README.md"
|
|
19
|
+
|
|
20
|
+
[project.license]
|
|
21
|
+
text = "MIT"
|
|
22
|
+
|
|
23
|
+
[tool.pdm]
|
|
24
|
+
distribution = true
|
|
25
|
+
|
|
26
|
+
[tool.pdm.dev-dependencies]
|
|
27
|
+
docs = [
|
|
28
|
+
"mkdocs-material[recommended,imaging]>=9.5.2",
|
|
29
|
+
"mkdocs-section-index>=0.3.8",
|
|
30
|
+
"mkdocs>=1.5.3",
|
|
31
|
+
"mkdocstrings[python]>=0.24.0",
|
|
32
|
+
]
|
|
33
|
+
test = [
|
|
34
|
+
"pytest>=8.2.2",
|
|
35
|
+
]
|
|
36
|
+
vercel = [
|
|
37
|
+
"urllib3<2",
|
|
38
|
+
]
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""Pydantic models to represent the tasks within a step in a data pipeline."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import TypeVar
|
|
5
|
+
from collections.abc import Callable
|
|
6
|
+
|
|
7
|
+
from pydantic import BaseModel, FilePath, NewPath, OnErrorOmit, TypeAdapter
|
|
8
|
+
|
|
9
|
+
__all__ = [
|
|
10
|
+
"AvailableTask",
|
|
11
|
+
"CompletedTask",
|
|
12
|
+
"Task",
|
|
13
|
+
"Step",
|
|
14
|
+
"AvailableTA",
|
|
15
|
+
"CompletedTA",
|
|
16
|
+
"RunContext",
|
|
17
|
+
]
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class Executable(BaseModel):
    """All tasks must have an associated function to make them executable."""

    # Callback that performs the task; the runner invokes it with the task's
    # `src` and `dst` paths as keyword arguments plus the step config.
    fn: Callable
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class AvailableTask(Executable):
    """A task is available when its input files exist and its outputs don't."""

    # FilePath validates that every source file currently exists on disk.
    src: dict[str, FilePath]
    # NewPath validates that every destination path does NOT yet exist.
    dst: dict[str, NewPath]
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class CompletedTask(Executable):
    """A task is completed when its output files exist, whether inputs exist or not."""

    # Plain Path: inputs may or may not still exist after the task has run.
    src: dict[str, Path]
    # FilePath validates that every output file exists on disk.
    dst: dict[str, FilePath]
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class Task(Executable):
    """A task has zero or more input files and zero or more output files."""

    # Unvalidated paths: existence is only checked later, when the task is
    # revalidated against AvailableTask/CompletedTask during the run.
    src: dict[str, Path]
    dst: dict[str, Path]
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
# Type variable for the user-supplied configuration model attached to a Step.
C = TypeVar("C", bound=BaseModel)


class Step(BaseModel):
    """A named step in a data pipeline, split up into tasks with specified file I/O."""

    name: str
    tasks: list[Task]
    # NOTE(review): annotated with the bare TypeVar `C` rather than via
    # `Generic[C]`, so validation falls back to the TypeVar bound
    # (`BaseModel`) — confirm a generic Step was not intended.
    config: C
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
# `OnErrorOmit` silently drops list items that fail validation, so these
# adapters act as filters: validating `[task.model_dump()]` yields a
# non-empty list only when the task is in the corresponding state.
AvailableTA = TypeAdapter(list[OnErrorOmit[AvailableTask]])
CompletedTA = TypeAdapter(list[OnErrorOmit[CompletedTask]])
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class RunContext(BaseModel):
    """The context available to a task runner."""

    # The step being run (gives access to its config and full task list).
    step: Step
    # Index of the current task within `step.tasks`.
    idx: int
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
"""Control flow using the Pydantic runtime file I/O checks."""
|
|
2
|
+
|
|
3
|
+
from .models import AvailableTA, AvailableTask, CompletedTA, Step, RunContext
|
|
4
|
+
|
|
5
|
+
__all__ = ["run_step"]
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def task_runner(task: AvailableTask, context: RunContext) -> None:
    """Execute a single available task.

    The task's callback is invoked with its source and destination paths as
    keyword arguments, plus the step's config model under the ``config`` key.

    Args:
        task: A validated task whose inputs exist and whose outputs don't.
        context: The run context carrying the step (and its config).
    """
    # `exclude` must be a set of field names: the previous bare-string form
    # ('fn') is not a valid IncEx value, so the non-JSON-serializable
    # callable was not actually omitted from the dump.
    print(f"Hello world {task.model_dump(mode='json', exclude={'fn'})}")
    # Call the callback directly; explicit `.__call__` is redundant.
    task.fn(**task.src, **task.dst, config=context.step.config)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def run_step(step: Step) -> None:
    """Run a pipeline step's tasks based on the availability of task files.

    Tasks are iterated through, and the relevant in/output files' existence
    is checked when the task is reached in the loop (rather than at the start). This
    means that intermediate files can be created by tasks, and their existence will be
    checked when those output files become inputs to subsequent tasks.

    If any task's required input files are missing, the step bails out: no further tasks
    will run.

    Args:
        step: The pipeline step whose tasks are run in order.

    Raises:
        ValueError: If the step has no tasks assigned.
    """
    if step.tasks:
        print(f"Running step {step.name!r} with {len(step.tasks)} tasks")
    else:
        raise ValueError("No tasks were assigned")

    bail = False
    for idx, task in enumerate(step.tasks):
        # Past the first task (and while not already bailing), verify the
        # previous task completed, i.e. its output files now exist.
        if idx > 0 and not bail:
            prev_task = step.tasks[idx - 1]
            # CompletedTA omits tasks that fail validation, so an empty
            # result means the previous task's outputs are missing.
            prev_completed = CompletedTA.validate_python([prev_task.model_dump()])
            if not prev_completed:
                bail = True
                print("(!) Incomplete previous task detected, bailing")
        # Human-readable "src --> dst" summary of the task's file mapping.
        task_repr = " --> ".join(
            map(str, (task.model_dump(include=["src", "dst"], mode="json").values())),
        )
        print(f"\n--- Task {idx + 1} --- {task_repr}")
        if bail:
            # Once bailing, every remaining task is announced then skipped.
            print("(-) Bailing out of step, skipping task")
            continue

        # Revalidate the task against both state models: "available" means
        # inputs exist and outputs don't; "completed" means outputs exist.
        available = AvailableTA.validate_python([task.model_dump()])
        completed = CompletedTA.validate_python([task.model_dump()])

        if available:
            print("\033[92;1m>>>\033[0m Running available task")
            task_runner(task=task, context=RunContext(step=step, idx=idx))
        elif completed:
            print("(x) Task already completed, skipping")
        else:
            # Neither runnable nor done: a required input file is missing.
            print("(!) Task requisite missing, bailing")
            bail = True
|
|
File without changes
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
from pypdown import run_step
|
|
2
|
+
from pypdown.models import Step
|
|
3
|
+
from pydantic import BaseModel
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def test_long_example():
    """Seven-task chain mixing no-input tasks with dependent intermediates."""

    class StepParams(BaseModel):
        n1_o: Path = "nil1.out"
        n2_o: Path = "nil2.out"
        a_i: Path = "a.in"
        a_o: Path = "a.out"
        b_o: Path = "b.out"
        c_o: Path = "c.out"
        d_i: Path = "d.in"
        d_o: Path = "d.out"
        e_i: Path = "e.in"
        e_o: Path = "e.out"

    config = StepParams()

    def cb_n1(n1_o: Path, config: StepParams):
        n1_o.touch()
        print(f"Touched {n1_o=}")

    def cb_a(a_i: Path, a_o: Path, config: StepParams):
        assert a_i.exists()
        a_o.touch()
        print(f"Touched {a_o=}")

    def cb_b(a_o: Path, b_o: Path, config: StepParams):
        assert a_o.exists()
        b_o.touch()
        print(f"Touched {b_o=}")

    def cb_c(a_o: Path, b_o: Path, c_o: Path, config: StepParams):
        assert a_o.exists() and b_o.exists()
        c_o.touch()
        print(f"Touched {c_o=}")

    def cb_d(d_i: Path, d_o: Path, config: StepParams):
        assert d_i.exists()
        d_o.touch()
        print(f"Touched {d_o=}")

    def cb_e(e_i: Path, e_o: Path, config: StepParams):
        assert e_i.exists()
        e_o.touch()
        print(f"Touched {e_o=}")

    def cb_n2(n2_o: Path, config: StepParams):
        n2_o.touch()
        print(f"Touched {n2_o=}")

    def make_task(ins: list[str], outs: list[str], callback) -> dict:
        """Resolve config field names to their path values for one task."""
        return {
            "src": {name: getattr(config, name) for name in ins},
            "dst": {name: getattr(config, name) for name in outs},
            "fn": callback,
        }

    tasks = [
        make_task([], ["n1_o"], cb_n1),
        make_task(["a_i"], ["a_o"], cb_a),
        make_task(["a_o"], ["b_o"], cb_b),
        make_task(["a_o", "b_o"], ["c_o"], cb_c),
        make_task(["d_i"], ["d_o"], cb_d),
        make_task(["e_i"], ["e_o"], cb_e),
        make_task([], ["n2_o"], cb_n2),
    ]

    step = Step(name="Demo Step", tasks=tasks, config=config)
    run_step(step)
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
from pypdown import run_step
|
|
2
|
+
from pypdown.models import Step
|
|
3
|
+
from pydantic import BaseModel
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def test_simple_example():
    """Two-task chain: (a1.in, a2.in) -> a.out, then (a.out, b.in) -> b.out."""

    class StepParams(BaseModel):
        a1_i: Path = "a1.in"
        a2_i: Path = "a2.in"
        a_o: Path = "a.out"
        b_i: Path = "b.in"
        b_o: Path = "b.out"

    def cb_a(a1_i: Path, a2_i: Path, a_o: Path, config: StepParams):
        assert a1_i.exists() and a2_i.exists()
        a_o.touch()
        print(f"Touched {a_o=}")

    def cb_b(a_o: Path, b_i: Path, b_o: Path, config: StepParams):
        assert a_o.exists() and b_i.exists()
        b_o.touch()
        print(f"Touched {b_o=}")

    config = StepParams()

    def make_task(ins: list[str], outs: list[str], callback) -> dict:
        """Resolve config field names to their path values for one task."""
        return {
            "src": {name: getattr(config, name) for name in ins},
            "dst": {name: getattr(config, name) for name in outs},
            "fn": callback,
        }

    tasks = [
        make_task(["a1_i", "a2_i"], ["a_o"], cb_a),
        make_task(["a_o", "b_i"], ["b_o"], cb_b),
    ]

    step = Step(name="Demo Step", tasks=tasks, config=config)
    run_step(step)
|