makeprov 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- makeprov-0.1.1/PKG-INFO +126 -0
- makeprov-0.1.1/README.md +111 -0
- makeprov-0.1.1/pyproject.toml +25 -0
- makeprov-0.1.1/setup.cfg +4 -0
- makeprov-0.1.1/src/config.py +45 -0
- makeprov-0.1.1/src/makeprov.egg-info/PKG-INFO +126 -0
- makeprov-0.1.1/src/makeprov.egg-info/SOURCES.txt +9 -0
- makeprov-0.1.1/src/makeprov.egg-info/dependency_links.txt +1 -0
- makeprov-0.1.1/src/makeprov.egg-info/top_level.txt +2 -0
- makeprov-0.1.1/src/makeprov.py +593 -0
- makeprov-0.1.1/tests/test_makeprov.py +71 -0
makeprov-0.1.1/PKG-INFO
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: makeprov
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: An RDF provenance tracking library for simple Python workflows
|
|
5
|
+
Author-email: Benno Kruit <b.b.kruit@amsterdamumc.nl>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/bennokr/makeprov
|
|
8
|
+
Project-URL: Documentation, https://makeprov.readthedocs.io
|
|
9
|
+
Project-URL: Issue Tracker, https://github.com/bennokr/makeprov/issues
|
|
10
|
+
Keywords: provenance,rdf,workflow,python
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
|
|
16
|
+
# makeprov: Pythonic Provenance Tracking
|
|
17
|
+
|
|
18
|
+
This library provides a way to track file provenance in Python workflows using RDF and PROV (W3C Provenance) semantics. It supports defining input/output files via decorators and automatically generates provenance datasets.
|
|
19
|
+
|
|
20
|
+
## Features
|
|
21
|
+
|
|
22
|
+
- Use decorators to define rules for workflows.
|
|
23
|
+
- Automatically generate RDF-based provenance metadata.
|
|
24
|
+
- Handles input and output streams.
|
|
25
|
+
- Integrates with Python's type hints for easy configuration.
|
|
26
|
+
- Outputs provenance data in TRIG format.
|
|
27
|
+
|
|
28
|
+
## Installation
|
|
29
|
+
|
|
30
|
+
You can install the module directly from PyPI:
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
pip install makeprov
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## Usage
|
|
37
|
+
|
|
38
|
+
Here’s an example of how to use this package in your Python scripts:
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
from makeprov import rule, InFile, OutFile, build
|
|
42
|
+
|
|
43
|
+
@rule()
|
|
44
|
+
def process_data(
|
|
45
|
+
input_file: InFile = InFile('input.txt'),
|
|
46
|
+
output_file: OutFile = OutFile('output.txt')
|
|
47
|
+
):
|
|
48
|
+
with input_file.open('r') as infile, output_file.open('w') as outfile:
|
|
49
|
+
data = infile.read()
|
|
50
|
+
outfile.write(data.upper())
|
|
51
|
+
|
|
52
|
+
if __name__ == '__main__':
|
|
53
|
+
process_data()
|
|
54
|
+
|
|
55
|
+
# or as a command line interface
|
|
56
|
+
import defopt
|
|
57
|
+
defopt.run(process_data)
|
|
58
|
+
|
|
59
|
+
# or as a workflow graph that automatically (re)generates all dependencies
|
|
60
|
+
from makeprov import build
|
|
61
|
+
build('output.txt')
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
You can execute `example.py` via the CLI like so:
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
python example.py build-all
|
|
68
|
+
|
|
69
|
+
# Or set configuration through the CLI
|
|
70
|
+
python example.py build-all --conf='{"base_iri": "http://mybaseiri.org/", "prov_dir": "my_prov_directory"}' --force --input_file input.txt --output_file final_output.txt
|
|
71
|
+
|
|
72
|
+
# Or set configuration through a TOML file
|
|
73
|
+
python example.py build-all --conf=@my_config.toml
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
### Complex CSV-to-RDF Workflow
|
|
77
|
+
|
|
78
|
+
For a more involved scenario, see [`complex_example.py`](complex_example.py). It creates multiple CSV files, aggregates their contents, and emits an RDF graph that is both serialized to disk and embedded into the provenance dataset because the function returns an `rdflib.Graph`.
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
@rule()
|
|
82
|
+
def export_totals_graph(
|
|
83
|
+
totals_csv: InFile = InFile("data/region_totals.csv"),
|
|
84
|
+
graph_ttl: OutFile = OutFile("data/region_totals.ttl"),
|
|
85
|
+
) -> Graph:
|
|
86
|
+
graph = Graph()
|
|
87
|
+
graph.bind("sales", SALES)
|
|
88
|
+
|
|
89
|
+
with totals_csv.open("r", newline="") as handle:
|
|
90
|
+
for row in csv.DictReader(handle):
|
|
91
|
+
region_key = row["region"].lower().replace(" ", "-")
|
|
92
|
+
subject = SALES[f"region/{region_key}"]
|
|
93
|
+
|
|
94
|
+
graph.add((subject, RDF.type, SALES.RegionTotal))
|
|
95
|
+
graph.add((subject, SALES.regionName, Literal(row["region"])))
|
|
96
|
+
graph.add((subject, SALES.totalUnits, Literal(row["total_units"], datatype=XSD.integer)))
|
|
97
|
+
graph.add((subject, SALES.totalRevenue, Literal(row["total_revenue"], datatype=XSD.decimal)))
|
|
98
|
+
|
|
99
|
+
with graph_ttl.open("w") as handle:
|
|
100
|
+
handle.write(graph.serialize(format="turtle"))
|
|
101
|
+
|
|
102
|
+
return graph
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
Run the entire workflow, including CSV generation and RDF export, with:
|
|
106
|
+
|
|
107
|
+
```bash
|
|
108
|
+
python complex_example.py build-sales-report
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
### Configuration
|
|
112
|
+
|
|
113
|
+
You can customize the provenance tracking with the following options:
|
|
114
|
+
|
|
115
|
+
- `base_iri` (str): Base IRI for new resources
|
|
116
|
+
- `prov_dir` (str): Directory for writing PROV `.trig` files
|
|
117
|
+
- `force` (bool): Force running of dependencies
|
|
118
|
+
- `dry_run` (bool): Only check workflow, don't run anything
|
|
119
|
+
|
|
120
|
+
## Contributing
|
|
121
|
+
|
|
122
|
+
Contributions are welcome! Please open an issue or submit a pull request.
|
|
123
|
+
|
|
124
|
+
## License
|
|
125
|
+
|
|
126
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
makeprov-0.1.1/README.md
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
# makeprov: Pythonic Provenance Tracking
|
|
2
|
+
|
|
3
|
+
This library provides a way to track file provenance in Python workflows using RDF and PROV (W3C Provenance) semantics. It supports defining input/output files via decorators and automatically generates provenance datasets.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- Use decorators to define rules for workflows.
|
|
8
|
+
- Automatically generate RDF-based provenance metadata.
|
|
9
|
+
- Handles input and output streams.
|
|
10
|
+
- Integrates with Python's type hints for easy configuration.
|
|
11
|
+
- Outputs provenance data in TRIG format.
|
|
12
|
+
|
|
13
|
+
## Installation
|
|
14
|
+
|
|
15
|
+
You can install the module directly from PyPI:
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
pip install makeprov
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## Usage
|
|
22
|
+
|
|
23
|
+
Here’s an example of how to use this package in your Python scripts:
|
|
24
|
+
|
|
25
|
+
```python
|
|
26
|
+
from makeprov import rule, InFile, OutFile, build
|
|
27
|
+
|
|
28
|
+
@rule()
|
|
29
|
+
def process_data(
|
|
30
|
+
input_file: InFile = InFile('input.txt'),
|
|
31
|
+
output_file: OutFile = OutFile('output.txt')
|
|
32
|
+
):
|
|
33
|
+
with input_file.open('r') as infile, output_file.open('w') as outfile:
|
|
34
|
+
data = infile.read()
|
|
35
|
+
outfile.write(data.upper())
|
|
36
|
+
|
|
37
|
+
if __name__ == '__main__':
|
|
38
|
+
process_data()
|
|
39
|
+
|
|
40
|
+
# or as a command line interface
|
|
41
|
+
import defopt
|
|
42
|
+
defopt.run(process_data)
|
|
43
|
+
|
|
44
|
+
# or as a workflow graph that automatically (re)generates all dependencies
|
|
45
|
+
from makeprov import build
|
|
46
|
+
build('output.txt')
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
You can execute `example.py` via the CLI like so:
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
python example.py build-all
|
|
53
|
+
|
|
54
|
+
# Or set configuration through the CLI
|
|
55
|
+
python example.py build-all --conf='{"base_iri": "http://mybaseiri.org/", "prov_dir": "my_prov_directory"}' --force --input_file input.txt --output_file final_output.txt
|
|
56
|
+
|
|
57
|
+
# Or set configuration through a TOML file
|
|
58
|
+
python example.py build-all --conf=@my_config.toml
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
### Complex CSV-to-RDF Workflow
|
|
62
|
+
|
|
63
|
+
For a more involved scenario, see [`complex_example.py`](complex_example.py). It creates multiple CSV files, aggregates their contents, and emits an RDF graph that is both serialized to disk and embedded into the provenance dataset because the function returns an `rdflib.Graph`.
|
|
64
|
+
|
|
65
|
+
```python
|
|
66
|
+
@rule()
|
|
67
|
+
def export_totals_graph(
|
|
68
|
+
totals_csv: InFile = InFile("data/region_totals.csv"),
|
|
69
|
+
graph_ttl: OutFile = OutFile("data/region_totals.ttl"),
|
|
70
|
+
) -> Graph:
|
|
71
|
+
graph = Graph()
|
|
72
|
+
graph.bind("sales", SALES)
|
|
73
|
+
|
|
74
|
+
with totals_csv.open("r", newline="") as handle:
|
|
75
|
+
for row in csv.DictReader(handle):
|
|
76
|
+
region_key = row["region"].lower().replace(" ", "-")
|
|
77
|
+
subject = SALES[f"region/{region_key}"]
|
|
78
|
+
|
|
79
|
+
graph.add((subject, RDF.type, SALES.RegionTotal))
|
|
80
|
+
graph.add((subject, SALES.regionName, Literal(row["region"])))
|
|
81
|
+
graph.add((subject, SALES.totalUnits, Literal(row["total_units"], datatype=XSD.integer)))
|
|
82
|
+
graph.add((subject, SALES.totalRevenue, Literal(row["total_revenue"], datatype=XSD.decimal)))
|
|
83
|
+
|
|
84
|
+
with graph_ttl.open("w") as handle:
|
|
85
|
+
handle.write(graph.serialize(format="turtle"))
|
|
86
|
+
|
|
87
|
+
return graph
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
Run the entire workflow, including CSV generation and RDF export, with:
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
python complex_example.py build-sales-report
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
### Configuration
|
|
97
|
+
|
|
98
|
+
You can customize the provenance tracking with the following options:
|
|
99
|
+
|
|
100
|
+
- `base_iri` (str): Base IRI for new resources
|
|
101
|
+
- `prov_dir` (str): Directory for writing PROV `.trig` files
|
|
102
|
+
- `force` (bool): Force running of dependencies
|
|
103
|
+
- `dry_run` (bool): Only check workflow, don't run anything
|
|
104
|
+
|
|
105
|
+
## Contributing
|
|
106
|
+
|
|
107
|
+
Contributions are welcome! Please open an issue or submit a pull request.
|
|
108
|
+
|
|
109
|
+
## License
|
|
110
|
+
|
|
111
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "makeprov"
|
|
7
|
+
version = "0.1.1"
|
|
8
|
+
description = "An RDF provenance tracking library for simple Python workflows"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = { text = "MIT" }
|
|
11
|
+
authors = [{ name = "Benno Kruit", email = "b.b.kruit@amsterdamumc.nl" }]
|
|
12
|
+
keywords = ["provenance", "rdf", "workflow", "python"]
|
|
13
|
+
classifiers = [
|
|
14
|
+
"Programming Language :: Python :: 3",
|
|
15
|
+
"License :: OSI Approved :: MIT License",
|
|
16
|
+
"Operating System :: OS Independent",
|
|
17
|
+
]
|
|
18
|
+
|
|
19
|
+
[project.urls]
|
|
20
|
+
"Homepage" = "https://github.com/bennokr/makeprov"
|
|
21
|
+
"Documentation" = "https://makeprov.readthedocs.io"
|
|
22
|
+
"Issue Tracker" = "https://github.com/bennokr/makeprov/issues"
|
|
23
|
+
|
|
24
|
+
[tool.pytape]
|
|
25
|
+
test = "tests/test_makeprov.py"
|
makeprov-0.1.1/src/config.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
from dataclasses import fields, is_dataclass
|
|
2
|
+
import sys, logging, tomllib as toml, defopt
|
|
3
|
+
import argparse
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def main(subcommands, conf_obj, parsers=None):
    """Run *subcommands* as a defopt CLI with shared config/verbosity flags.

    Parameters
    ----------
    subcommands:
        Callable or list of callables passed through to ``defopt.run``.
    conf_obj:
        A (possibly nested) dataclass instance holding global configuration.
        ``--conf`` TOML snippets or ``@file`` references are merged into it
        in place before the subcommand runs.
    parsers:
        Optional mapping of extra argument parsers forwarded to defopt.
    """

    def conf(dc, params):
        # Recursively merge a dict of overrides into a (nested) dataclass.
        for f in fields(dc):
            if f.name in params:
                cur, new = getattr(dc, f.name), params[f.name]
                if is_dataclass(cur) and isinstance(new, dict):
                    conf(cur, new)  # descend into nested dataclass
                else:
                    setattr(dc, f.name, new)

    # Parent parser carries the flags shared by every subcommand.
    parent = argparse.ArgumentParser(add_help=False)
    parent.add_argument(
        "-c",
        "--conf",
        action="append",
        default=[],
        help="Set config param from TOML snippet or @file",
    )
    parent.add_argument(
        "-v", "--verbose", action="count", default=0, help="Show more logging output"
    )

    def apply_globals(argv):
        # Parse only the shared flags; defopt handles everything else later.
        ns, _ = parent.parse_known_args(argv)
        lvl = ("WARNING", "INFO", "DEBUG")[min(max(ns.verbose, 0), 2)]
        logging.basicConfig(level=getattr(logging, lvl))
        for t in ns.conf:
            logging.debug(f"Parsing config {t}")
            if t.startswith("@"):
                # BUGFIX: the file handle was previously opened and never
                # closed; tomllib requires a binary handle, so use `with`.
                with open(t[1:], "rb") as fh:
                    p = toml.load(fh)
            else:
                p = toml.loads(t)
            logging.debug(f"Setting config {p}")
            conf(conf_obj, p)

    apply_globals(sys.argv[1:])  # apply effects early
    logging.debug(f"Config: {conf_obj}")
    defopt.run(
        subcommands,
        parsers=parsers or {},
        argv=sys.argv[1:],
        argparse_kwargs={"parents": [parent]},
    )
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: makeprov
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: An RDF provenance tracking library for simple Python workflows
|
|
5
|
+
Author-email: Benno Kruit <b.b.kruit@amsterdamumc.nl>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/bennokr/makeprov
|
|
8
|
+
Project-URL: Documentation, https://makeprov.readthedocs.io
|
|
9
|
+
Project-URL: Issue Tracker, https://github.com/bennokr/makeprov/issues
|
|
10
|
+
Keywords: provenance,rdf,workflow,python
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
|
|
16
|
+
# makeprov: Pythonic Provenance Tracking
|
|
17
|
+
|
|
18
|
+
This library provides a way to track file provenance in Python workflows using RDF and PROV (W3C Provenance) semantics. It supports defining input/output files via decorators and automatically generates provenance datasets.
|
|
19
|
+
|
|
20
|
+
## Features
|
|
21
|
+
|
|
22
|
+
- Use decorators to define rules for workflows.
|
|
23
|
+
- Automatically generate RDF-based provenance metadata.
|
|
24
|
+
- Handles input and output streams.
|
|
25
|
+
- Integrates with Python's type hints for easy configuration.
|
|
26
|
+
- Outputs provenance data in TRIG format.
|
|
27
|
+
|
|
28
|
+
## Installation
|
|
29
|
+
|
|
30
|
+
You can install the module directly from PyPI:
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
pip install makeprov
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## Usage
|
|
37
|
+
|
|
38
|
+
Here’s an example of how to use this package in your Python scripts:
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
from makeprov import rule, InFile, OutFile, build
|
|
42
|
+
|
|
43
|
+
@rule()
|
|
44
|
+
def process_data(
|
|
45
|
+
input_file: InFile = InFile('input.txt'),
|
|
46
|
+
output_file: OutFile = OutFile('output.txt')
|
|
47
|
+
):
|
|
48
|
+
with input_file.open('r') as infile, output_file.open('w') as outfile:
|
|
49
|
+
data = infile.read()
|
|
50
|
+
outfile.write(data.upper())
|
|
51
|
+
|
|
52
|
+
if __name__ == '__main__':
|
|
53
|
+
process_data()
|
|
54
|
+
|
|
55
|
+
# or as a command line interface
|
|
56
|
+
import defopt
|
|
57
|
+
defopt.run(process_data)
|
|
58
|
+
|
|
59
|
+
# or as a workflow graph that automatically (re)generates all dependencies
|
|
60
|
+
from makeprov import build
|
|
61
|
+
build('output.txt')
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
You can execute `example.py` via the CLI like so:
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
python example.py build-all
|
|
68
|
+
|
|
69
|
+
# Or set configuration through the CLI
|
|
70
|
+
python example.py build-all --conf='{"base_iri": "http://mybaseiri.org/", "prov_dir": "my_prov_directory"}' --force --input_file input.txt --output_file final_output.txt
|
|
71
|
+
|
|
72
|
+
# Or set configuration through a TOML file
|
|
73
|
+
python example.py build-all --conf=@my_config.toml
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
### Complex CSV-to-RDF Workflow
|
|
77
|
+
|
|
78
|
+
For a more involved scenario, see [`complex_example.py`](complex_example.py). It creates multiple CSV files, aggregates their contents, and emits an RDF graph that is both serialized to disk and embedded into the provenance dataset because the function returns an `rdflib.Graph`.
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
@rule()
|
|
82
|
+
def export_totals_graph(
|
|
83
|
+
totals_csv: InFile = InFile("data/region_totals.csv"),
|
|
84
|
+
graph_ttl: OutFile = OutFile("data/region_totals.ttl"),
|
|
85
|
+
) -> Graph:
|
|
86
|
+
graph = Graph()
|
|
87
|
+
graph.bind("sales", SALES)
|
|
88
|
+
|
|
89
|
+
with totals_csv.open("r", newline="") as handle:
|
|
90
|
+
for row in csv.DictReader(handle):
|
|
91
|
+
region_key = row["region"].lower().replace(" ", "-")
|
|
92
|
+
subject = SALES[f"region/{region_key}"]
|
|
93
|
+
|
|
94
|
+
graph.add((subject, RDF.type, SALES.RegionTotal))
|
|
95
|
+
graph.add((subject, SALES.regionName, Literal(row["region"])))
|
|
96
|
+
graph.add((subject, SALES.totalUnits, Literal(row["total_units"], datatype=XSD.integer)))
|
|
97
|
+
graph.add((subject, SALES.totalRevenue, Literal(row["total_revenue"], datatype=XSD.decimal)))
|
|
98
|
+
|
|
99
|
+
with graph_ttl.open("w") as handle:
|
|
100
|
+
handle.write(graph.serialize(format="turtle"))
|
|
101
|
+
|
|
102
|
+
return graph
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
Run the entire workflow, including CSV generation and RDF export, with:
|
|
106
|
+
|
|
107
|
+
```bash
|
|
108
|
+
python complex_example.py build-sales-report
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
### Configuration
|
|
112
|
+
|
|
113
|
+
You can customize the provenance tracking with the following options:
|
|
114
|
+
|
|
115
|
+
- `base_iri` (str): Base IRI for new resources
|
|
116
|
+
- `prov_dir` (str): Directory for writing PROV `.trig` files
|
|
117
|
+
- `force` (bool): Force running of dependencies
|
|
118
|
+
- `dry_run` (bool): Only check workflow, don't run anything
|
|
119
|
+
|
|
120
|
+
## Contributing
|
|
121
|
+
|
|
122
|
+
Contributions are welcome! Please open an issue or submit a pull request.
|
|
123
|
+
|
|
124
|
+
## License
|
|
125
|
+
|
|
126
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,593 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import functools
|
|
4
|
+
import logging
|
|
5
|
+
import mimetypes
|
|
6
|
+
import subprocess
|
|
7
|
+
import sys
|
|
8
|
+
import inspect
|
|
9
|
+
import hashlib
|
|
10
|
+
import re
|
|
11
|
+
from dataclasses import dataclass
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from datetime import datetime, timezone
|
|
14
|
+
from typing import get_origin, get_args, get_type_hints, Any
|
|
15
|
+
from abc import ABC, abstractmethod
|
|
16
|
+
from collections.abc import Callable
|
|
17
|
+
import importlib.metadata as im
|
|
18
|
+
|
|
19
|
+
import rdflib
|
|
20
|
+
from rdflib import RDF, RDFS, XSD
|
|
21
|
+
from rdflib.namespace import DCTERMS as DCT
|
|
22
|
+
|
|
23
|
+
PROV = rdflib.Namespace("http://www.w3.org/ns/prov#")
|
|
24
|
+
|
|
25
|
+
# ----------------------------------------------------------------------
|
|
26
|
+
# Config
|
|
27
|
+
# ----------------------------------------------------------------------
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass
class ProvenanceConfig:
    """Global settings that control provenance tracking and builds."""

    # Base IRI under which new RDF resources are minted.
    base_iri: str = "http://example.org/"
    # Directory where PROV .trig files are written.
    prov_dir: str = "prov"
    # When True, rules run even when their outputs are up to date.
    force: bool = False
    # When True, the workflow is only checked and logged; nothing runs.
    dry_run: bool = False


# Module-wide configuration instance shared by all rules.
GLOBAL_CONFIG = ProvenanceConfig()
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
# ----------------------------------------------------------------------
|
|
42
|
+
# File marker hierarchy (with "-" as stdin/stdout)
|
|
43
|
+
# ----------------------------------------------------------------------
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class File(ABC):
    """Abstract base for InFile / OutFile.

    Common fields:
    - raw: original string
    - path: filesystem path or None (for streams)
    - is_stream: True if "-" (stdin/stdout)
    - stream_name: "stdin" / "stdout" / None
    """

    def __init__(self, path: str | Path, stream_name: str | None):
        self.raw = str(path)
        # "-" is the conventional marker for a standard stream.
        self.is_stream = self.raw == "-"
        if self.is_stream:
            self.stream_name = stream_name
            self.path: Path | None = None
        else:
            self.stream_name = None
            self.path = Path(path)

    def __fspath__(self):
        # os.fspath() support; streams have no filesystem location.
        if self.is_stream or self.path is None:
            raise TypeError(
                f"{self.__class__.__name__}('-') does not have a filesystem path"
            )
        return str(self.path)

    def __str__(self) -> str:
        return self.raw

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}({self.raw!r})"

    @abstractmethod
    def open(self, mode: str = "", *args, **kwargs):
        """Open underlying file or stream."""
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
class InFile(File):
    """Marker type for input files (for provenance + DAG).

    The string "-" stands for stdin.
    """

    def __init__(self, path: str):
        super().__init__(path, stream_name="stdin")

    def open(self, mode: str = "r", *args, **kwargs):
        """Open the input for reading; "-" maps to stdin."""
        if self.is_stream:
            # Binary modes get the raw buffer, text modes the text wrapper.
            stream = sys.stdin
            return stream.buffer if "b" in mode else stream
        if self.path is None:
            raise ValueError("InFile has no path")
        return self.path.open(mode, *args, **kwargs)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
class OutFile(File):
    """Marker type for output files (for provenance + DAG).

    The string "-" stands for stdout.
    """

    def __init__(self, path: str):
        super().__init__(path, stream_name="stdout")

    def as_infile(self) -> InFile:
        """Convert an OutFile path into a new InFile for downstream steps."""
        if self.is_stream or self.path is None:
            raise ValueError("Cannot convert a stream-based OutFile into an InFile")
        return InFile(str(self.path))

    def open(self, mode: str = "w", *args, **kwargs):
        """Open the output for writing; "-" maps to stdout."""
        if self.is_stream:
            # Binary modes get the raw buffer, text modes the text wrapper.
            stream = sys.stdout
            return stream.buffer if "b" in mode else stream
        if self.path is None:
            raise ValueError("OutFile has no path")
        # Make sure the parent directory exists before writing.
        self.path.parent.mkdir(parents=True, exist_ok=True)
        return self.path.open(mode, *args, **kwargs)
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
# ----------------------------------------------------------------------
|
|
132
|
+
# Registry + basic Make-like build
|
|
133
|
+
# ----------------------------------------------------------------------
|
|
134
|
+
|
|
135
|
+
RULES: dict[str, dict[str, Any]] = {}
|
|
136
|
+
COMMANDS: set[Callable] = set()
|
|
137
|
+
|
|
138
|
+
def _caller_script() -> Path:
|
|
139
|
+
mod = sys.modules.get("__main__")
|
|
140
|
+
if getattr(mod, "__file__", None):
|
|
141
|
+
return Path(mod.__file__).resolve()
|
|
142
|
+
|
|
143
|
+
if sys.argv and sys.argv[0]:
|
|
144
|
+
p = Path(sys.argv[0])
|
|
145
|
+
if p.exists():
|
|
146
|
+
return p.resolve()
|
|
147
|
+
|
|
148
|
+
for f in reversed(inspect.stack()):
|
|
149
|
+
p = Path(f.filename)
|
|
150
|
+
if p.suffix in {".py", ""}:
|
|
151
|
+
return p.resolve()
|
|
152
|
+
|
|
153
|
+
return Path("unknown")
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def _safe_cmd(argv: list[str]) -> str | None:
|
|
157
|
+
try:
|
|
158
|
+
return subprocess.run(
|
|
159
|
+
argv, check=True, capture_output=True, text=True
|
|
160
|
+
).stdout.strip()
|
|
161
|
+
except Exception: # noqa: BLE001
|
|
162
|
+
return None
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def needs_update(outputs, deps) -> bool:
    """Return True if outputs missing or older than any dependency.

    A rule with no outputs is always considered stale, and dependency
    files that do not exist are ignored in the timestamp comparison.
    """
    outs = [Path(o) for o in outputs]
    if not outs:
        return True  # nothing to check against: always rebuild

    if not all(o.exists() for o in outs):
        return True  # at least one output is missing

    oldest_output = min(o.stat().st_mtime for o in outs)
    dep_mtimes = [p.stat().st_mtime for p in map(Path, deps) if p.exists()]
    if not dep_mtimes:
        return False  # no (existing) dependencies: outputs are fine

    return max(dep_mtimes) > oldest_output
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def build(target, _seen=None, _done=None):
    """
    Recursively build target after its dependencies, if needed.

    `target` is a path (string/Path). Only rules with default
    OutFile paths are part of this DAG.

    Raises:
        RuntimeError: if the dependency graph contains a cycle.
        KeyError: if no rule is registered for `target`.
    """
    if _seen is None:
        _seen = set()
    if _done is None:
        _done = set()
    target = str(target)

    # BUGFIX: already-built targets must be skipped. Previously, a shared
    # ("diamond") dependency stayed in _seen after its first build and was
    # falsely reported as a cycle when reached via a second consumer.
    if target in _done:
        return

    if target in _seen:
        raise RuntimeError(f"Cycle in build graph at {target!r}")
    _seen.add(target)

    rule = RULES[target]

    for dep in rule["deps"]:
        if dep in RULES:
            build(dep, _seen, _done)

    # Target leaves the active DFS path once its subtree is processed.
    _seen.discard(target)
    _done.add(target)

    rule["func"]()
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
# ----------------------------------------------------------------------
|
|
210
|
+
# PROV helpers
|
|
211
|
+
# ----------------------------------------------------------------------
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def _describe_file(g: rdflib.Graph, base: rdflib.Namespace, path: Path, kind: str):
    """Add Dublin Core / PROV metadata for *path* to graph *g*.

    Mints and returns an entity IRI of the form ``{base}{kind}/{path}``.
    For source files (kind == "src") a sha256 content hash is recorded
    when the file can be read.
    """
    iri = base[f"{kind}/{path.as_posix()}"]

    media_type = mimetypes.guess_type(path.name)[0] or "application/octet-stream"
    size = path.stat().st_size if path.exists() else 0
    mtime = None
    if path.exists():
        mtime = datetime.fromtimestamp(path.stat().st_mtime, tz=timezone.utc).isoformat()

    g.add((iri, RDF.type, PROV.Entity))
    g.add((iri, DCT.format, rdflib.Literal(media_type)))
    g.add((iri, DCT.extent, rdflib.Literal(size, datatype=XSD.integer)))
    if mtime:
        g.add((iri, DCT.modified, rdflib.Literal(mtime, datatype=XSD.dateTime)))

    if kind == "src" and path.exists():
        try:
            digest = hashlib.sha256(path.read_bytes()).hexdigest()
            g.add((iri, DCT.identifier, rdflib.Literal(f"sha256:{digest}")))
        except Exception:  # noqa: BLE001 - the hash is best-effort metadata
            pass

    return iri
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def project_metadata(dist_name: str | None = None):
    """
    Return (name, version, requires) using importlib.metadata.

    If dist_name is None, tries to infer it from the caller's top-level
    package. Returns (None, None, []) when the distribution cannot be
    determined or is not installed.
    """
    if dist_name is None:
        # Crude default: top-level package of the calling module.
        caller = inspect.stack()[1]
        module = inspect.getmodule(caller[0])
        if not (module and module.__package__):
            return None, None, []
        dist_name = module.__package__.split(".", 1)[0]

    try:
        dist = im.distribution(dist_name)
    except im.PackageNotFoundError:
        return None, None, []

    return dist.metadata.get("Name"), dist.version, dist.requires or []
|
|
265
|
+
|
|
266
|
+
def pep503_normalize(name: str) -> str:
    """PEP 503 normalization: lowercase, runs of [-_.] -> '-'."""
    return re.sub(r"[-_.]+", "-", name.strip().lower())
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def add_deps_to_env(
    D: rdflib.Graph,
    env_iri: rdflib.term.Identifier,
    deps_specs: list[str],
):
    """
    Attach dependencies from deps_specs to env_iri using PyPI-based IRIs.

    deps_specs: e.g. ["rdflib>=6.0.0", "pydantic==2.0.3"]
    """
    for spec in deps_specs:
        spec_str = spec.strip()
        if not spec_str:
            continue

        pkg = spec_str.split()[0]
        # Keep only the name part before any version operator, extras
        # bracket, or environment-marker separator.
        pkg_name = re.split(r"[<>=!~;\[ ]", pkg, maxsplit=1)[0]
        norm = pep503_normalize(pkg_name)

        dep_iri = rdflib.URIRef(f"https://pypi.org/project/{norm}/")

        # BUGFIX: rdflib's RDF namespace is a closed/defined namespace with
        # no `Resource` term (that class lives in RDFS), so `RDF.Resource`
        # raised AttributeError at runtime.
        D.add((dep_iri, RDF.type, RDFS.Resource))
        D.add((dep_iri, RDFS.label, rdflib.Literal(spec_str)))
        # link environment -> dependency
        D.add((env_iri, DCT.requires, dep_iri))
|
|
299
|
+
|
|
300
|
+
def _augment_with_metadata(
    D: rdflib.Graph,
    base: rdflib.Namespace,
    activity: rdflib.term.Identifier,
    run_id: str,
):
    """Add metadata + dependency info to provenance, if possible."""
    name, version, deps_specs = project_metadata()

    if not (name or version or deps_specs):
        # Nothing known about the running project: add no environment node.
        return

    # Describe the Python environment as a PROV collection used by the run.
    env = base[f"env/{run_id}"]
    D.add((env, RDF.type, PROV.Entity))
    D.add((env, RDF.type, PROV.Collection))
    D.add((env, RDFS.label, rdflib.Literal("Python environment")))
    D.add((activity, PROV.used, env))

    if name:
        D.add((env, DCT.title, rdflib.Literal(name)))
    if version:
        D.add((env, DCT.hasVersion, rdflib.Literal(version)))

    add_deps_to_env(D, env, deps_specs)
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
def _write_provenance_dataset(
    base_iri: str,
    name: str,
    prov_path: str | Path,
    deps: list[str],
    outputs: list[str],
    t0: datetime,
    t1: datetime,
    data_graph: rdflib.Graph | None = None,
    success: bool = True,
):
    """
    Build a Dataset with:
      - default graph: PROV metadata
      - named graph: data_graph (if provided)
    and serialize as Trig to prov_path.

    Parameters:
        base_iri: namespace prefix under which run/agent/graph IRIs are minted.
        name: logical rule name, used in the activity and graph IRIs.
        prov_path: destination file for the TriG output (parents are created).
        deps: input file paths; paths that don't exist are skipped.
        outputs: output file paths; likewise skipped when missing.
        t0, t1: activity start/end timestamps, serialized as xsd:dateTime.
        data_graph: optional RDF data produced by the rule, copied into a
            named graph of the dataset.
        success: when False, an rdfs:comment marks the activity as failed.
    """
    base = rdflib.Namespace(base_iri)
    ds = rdflib.Dataset()
    # The default graph holds all PROV metadata triples.
    D = ds.default_context

    ds.bind("", base)
    ds.bind("prov", PROV)
    ds.bind("dcterms", DCT)

    # One activity per run, keyed by a second-resolution timestamp.
    run_id = t0.strftime("%Y%m%dT%H%M%S")
    script = _caller_script()

    activity = base[f"run/{name}/{run_id}"]
    agent = base[f"agent/{script.name}"]
    graph_iri = base[f"graph/{name}"]

    # Best-effort git info; presumably _safe_cmd returns a falsy value on
    # failure (no git, no repo) — both uses below guard on truthiness.
    commit = _safe_cmd(["git", "rev-parse", "HEAD"])
    origin = _safe_cmd(["git", "config", "--get", "remote.origin.url"])

    D.add((activity, RDF.type, PROV.Activity))
    t0_term = rdflib.Literal(t0.isoformat(), datatype=XSD.dateTime)
    D.add((activity, PROV.startedAtTime, t0_term))
    t1_term = rdflib.Literal(t1.isoformat(), datatype=XSD.dateTime)
    D.add((activity, PROV.endedAtTime, t1_term))

    # The calling script acts as the software agent for this run.
    D.add((agent, RDF.type, PROV.SoftwareAgent))
    D.add((agent, RDFS.label, rdflib.Literal(script.name)))
    if commit:
        D.add((agent, DCT.hasVersion, rdflib.Literal(commit)))
    if origin:
        D.add((agent, DCT.source, rdflib.URIRef(origin)))
    D.add((activity, PROV.wasAssociatedWith, agent))

    if data_graph is not None:
        # Copy the rule's triples into a named graph of the dataset.
        # NOTE(review): rule() may also pass an rdflib.Dataset here; its
        # iteration semantics differ from a plain Graph — confirm.
        gx = ds.get_context(graph_iri)
        for triple in data_graph:
            gx.add(triple)

        # Describe the named graph itself as a PROV entity.
        D.add((graph_iri, RDF.type, PROV.Entity))
        D.add((graph_iri, PROV.wasGeneratedBy, activity))
        D.add((graph_iri, PROV.wasAttributedTo, agent))
        D.add((graph_iri, PROV.generatedAtTime, t1_term))

    # Inputs: only files that exist on disk are described.
    for d in deps:
        p = Path(d)
        if not p.exists():
            continue
        src = _describe_file(D, base, p, "src")
        D.add((activity, PROV.used, src))

    # Outputs: skip paths the rule never actually produced.
    for o in outputs:
        p = Path(o)
        if not p.exists():
            continue
        ent = _describe_file(D, base, p, "out")
        D.add((ent, PROV.wasGeneratedBy, activity))

    if not success:
        D.add((activity, RDFS.comment, rdflib.Literal("task failed")))

    # Add pyproject + dependency information, if available
    _augment_with_metadata(D, base, activity, run_id)

    prov_path = Path(prov_path)
    prov_path.parent.mkdir(parents=True, exist_ok=True)
    logging.info("Writing provenance dataset %s", prov_path)
    ds.serialize(prov_path, format="trig")
|
|
407
|
+
|
|
408
|
+
|
|
409
|
+
# ----------------------------------------------------------------------
|
|
410
|
+
# Annotation helpers (supports Optional[InFile], OutFile | None, etc.)
|
|
411
|
+
# ----------------------------------------------------------------------
|
|
412
|
+
|
|
413
|
+
|
|
414
|
+
def _is_kind_annotation(ann: Any, cls: type) -> bool:
|
|
415
|
+
if ann is cls:
|
|
416
|
+
return True
|
|
417
|
+
|
|
418
|
+
origin = get_origin(ann)
|
|
419
|
+
if origin is None:
|
|
420
|
+
return False
|
|
421
|
+
|
|
422
|
+
return any(a is cls for a in get_args(ann))
|
|
423
|
+
|
|
424
|
+
|
|
425
|
+
# ----------------------------------------------------------------------
|
|
426
|
+
# Decorator: infer inputs/outputs from signature
|
|
427
|
+
# ----------------------------------------------------------------------
|
|
428
|
+
|
|
429
|
+
|
|
430
|
+
def rule(
    *,
    name: str | None = None,
    base_iri: str | None = None,
    prov_dir: str | None = None,
    prov_path: str | None = None,
    force: bool | None = None,
    dry_run: bool | None = None,
    config: ProvenanceConfig | None = None,
):
    """Decorator turning a function into a provenance-tracked build rule.

    Input and output files are inferred from parameters annotated with
    InFile / OutFile (including Optional[...] / union forms). Each call
    is skipped when outputs are up to date (unless forced), honors
    dry-run mode, and writes a PROV dataset describing the run.

    Keyword arguments override the corresponding fields of *config*
    (or GLOBAL_CONFIG when no config is given); *name* overrides the
    function name as the logical rule name, and *prov_path* overrides
    the default "<prov_dir>/<name>.trig" provenance file location.

    Raises:
        ValueError: if the decorated function has no OutFile parameter.
    """
    base_config = config or GLOBAL_CONFIG

    # Resolve per-rule settings: explicit keyword beats the base config.
    rule_config = ProvenanceConfig(
        base_iri=base_iri if base_iri is not None else base_config.base_iri,
        prov_dir=prov_dir if prov_dir is not None else base_config.prov_dir,
        force=force if force is not None else base_config.force,
        dry_run=dry_run if dry_run is not None else base_config.dry_run,
    )

    def decorator(func):
        sig = inspect.signature(func)
        # get_type_hints resolves string/postponed annotations; fall back
        # to the raw annotation when a parameter has no resolvable hint.
        hints = get_type_hints(func)

        # Parameter names classified by annotation kind.
        in_params: list[str] = []
        out_params: list[str] = []

        for p in sig.parameters.values():
            ann = hints.get(p.name, p.annotation)
            if _is_kind_annotation(ann, InFile):
                in_params.append(p.name)
            if _is_kind_annotation(ann, OutFile):
                out_params.append(p.name)

        if not out_params:
            raise ValueError(
                f"Function {func.__name__} must have at least one "
                f"OutFile (possibly Optional[OutFile]) parameter"
            )

        # Static deps/outputs collected from parameter *defaults* — these
        # feed the build registry (RULES), not the per-call tracking.
        deps: list[str] = []
        outputs: list[str] = []

        for p in sig.parameters.values():
            if p.name in in_params and p.default is not inspect._empty:
                val = p.default
                if isinstance(val, InFile):
                    # Streams (stdin) have no file to depend on.
                    if getattr(val, "is_stream", False):
                        pass
                    elif val.path is not None:
                        deps.append(str(val.path))
                elif isinstance(val, (str, Path)):
                    # "-" conventionally means stdin/stdout — not a file.
                    if str(val) != "-":
                        deps.append(str(val))

            if p.name in out_params and p.default is not inspect._empty:
                val = p.default
                if isinstance(val, OutFile):
                    if getattr(val, "is_stream", False):
                        pass
                    elif val.path is not None:
                        outputs.append(str(val.path))
                elif isinstance(val, (str, Path)):
                    if str(val) != "-":
                        outputs.append(str(val))

        # Only rules with a statically-known output can be built by target.
        register_for_build = bool(outputs)
        logical_name = name or func.__name__

        if prov_path is not None:
            rule_prov_path = prov_path
        else:
            rule_prov_path = str(Path(rule_config.prov_dir) / f"{logical_name}.trig")

        @functools.wraps(func)
        def wrapped(*args, **kwargs):
            # Bind the actual call arguments so in/out files can be
            # resolved from whatever the caller passed (not just defaults).
            bound = sig.bind_partial(*args, **kwargs)
            bound.apply_defaults()

            in_files: list[Path] = []
            out_files: list[Path] = []

            for pname in in_params:
                val = bound.arguments.get(pname)
                if isinstance(val, InFile):
                    if val.is_stream or val.path is None:
                        continue
                    in_files.append(val.path)
                elif val is None:
                    continue
                else:
                    # Raw str/Path argument; "-" means a stream, skip it.
                    if str(val) != "-":
                        in_files.append(Path(val))

            for pname in out_params:
                val = bound.arguments.get(pname)
                if isinstance(val, OutFile):
                    if val.is_stream or val.path is None:
                        continue
                    out_files.append(val.path)
                elif val is None:
                    continue
                else:
                    if str(val) != "-":
                        out_files.append(Path(val))

            # Make-style freshness check: skip when outputs are newer
            # than all inputs, unless forced.
            if not rule_config.force and not needs_update(out_files, in_files):
                logging.info("Skipping %s (up to date)", logical_name)
                return None

            if rule_config.dry_run:
                logging.info(
                    "Dry-run %s: would run with %s -> %s",
                    logical_name,
                    in_files,
                    out_files,
                )
                return None

            t0 = datetime.now(timezone.utc)
            exc: Exception | None = None
            data_graph: rdflib.Graph | None = None
            result = None

            try:
                result = func(*bound.args, **bound.kwargs)
                # RDF results are additionally captured as a named graph
                # in the provenance dataset.
                if isinstance(result, (rdflib.Graph, rdflib.Dataset)):
                    data_graph = result
                return result
            except Exception as e:
                exc = e
                raise
            finally:
                # Provenance is written whether the rule succeeded or not;
                # a provenance failure must never mask the rule's outcome.
                t1 = datetime.now(timezone.utc)
                try:
                    _write_provenance_dataset(
                        base_iri=rule_config.base_iri,
                        name=logical_name,
                        prov_path=rule_prov_path,
                        deps=[str(p) for p in in_files],
                        outputs=[str(p) for p in out_files],
                        t0=t0,
                        t1=t1,
                        data_graph=data_graph,
                        success=exc is None,
                    )
                except Exception as prov_exc:  # noqa: BLE001
                    logging.warning(
                        "Failed to write provenance for %s: %s",
                        logical_name,
                        prov_exc,
                    )

        # Register for CLI discovery; build registry only for rules whose
        # first static output can serve as the target key.
        COMMANDS.add(wrapped)
        if register_for_build:
            target = outputs[0]
            RULES[target] = {
                "deps": deps,
                "outputs": outputs,
                "func": wrapped,
            }

        return wrapped

    return decorator
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
import tempfile
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
from rdflib import Graph, Literal, Namespace
|
|
5
|
+
from rdflib.namespace import RDF, XSD
|
|
6
|
+
|
|
7
|
+
from makeprov import InFile, OutFile, ProvenanceConfig, rule
|
|
8
|
+
|
|
9
|
+
@rule(name="test_process_data")
def process_data(input_file: InFile, output_file: OutFile):
    """Copy the contents of *input_file* to *output_file* verbatim."""
    with input_file.open('r') as infile, output_file.open('w') as outfile:
        outfile.write(infile.read())
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# Namespace for the synthetic sales data used by these tests.
SALES_NS = Namespace("http://example.org/test/")
# Fresh temporary directory so provenance files written during the test
# run don't pollute the working tree.
TEST_PROV_DIR = Path(tempfile.mkdtemp(prefix="makeprov-tests-"))
TEST_PROV_CONFIG = ProvenanceConfig(prov_dir=str(TEST_PROV_DIR))
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@rule(name="test_totals_graph", config=TEST_PROV_CONFIG)
def totals_graph(input_csv: InFile, graph_out: OutFile) -> Graph:
    """Build an RDF graph of per-region sales totals from a CSV file,
    serialize it as Turtle to *graph_out*, and return the graph."""
    result = Graph()
    result.bind("sales", SALES_NS)

    if graph_out.path is None:
        raise ValueError("graph_out must have a filesystem path")
    graph_out.path.parent.mkdir(parents=True, exist_ok=True)

    with input_csv.open('r') as handle:
        rows = handle.read().strip().splitlines()

    # Skip the header row; each data row is region,total_units,total_revenue.
    for row in rows[1:]:
        region, units, revenue = row.split(',')
        node = SALES_NS[f"region/{region.lower()}"]
        result.add((node, RDF.type, SALES_NS.RegionTotal))
        result.add((node, SALES_NS.regionName, Literal(region)))
        result.add((node, SALES_NS.totalUnits, Literal(units, datatype=XSD.integer)))
        result.add((node, SALES_NS.totalRevenue, Literal(revenue, datatype=XSD.decimal)))

    with graph_out.open('w') as handle:
        handle.write(result.serialize(format='turtle'))

    return result
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def test_process_data(tmp_path):
    """The copy rule reproduces the input file at the output path."""
    src = tmp_path / "input.txt"
    dst = tmp_path / "output.txt"

    src.write_text("Hello, world!")

    # Invoke the decorated rule directly.
    process_data(InFile(str(src)), OutFile(str(dst)))

    # The output must exist and mirror the input exactly.
    assert dst.exists()
    assert dst.read_text() == "Hello, world!"
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def test_rule_returns_graph(tmp_path):
    """A rule returning an rdflib Graph yields both the data file and a
    provenance record in the configured prov_dir."""
    csv_path = tmp_path / "region_totals.csv"
    ttl_path = tmp_path / "region_totals.ttl"
    csv_path.write_text("region,total_units,total_revenue\nNorth,6,119.94\n")

    graph = totals_graph(InFile(str(csv_path)), OutFile(str(ttl_path)))

    assert isinstance(graph, Graph)
    assert ttl_path.exists()
    assert "North" in ttl_path.read_text()

    # Provenance lands under TEST_PROV_DIR; remove it so reruns start clean.
    prov_file = TEST_PROV_DIR / "test_totals_graph.trig"
    assert prov_file.exists()
    prov_file.unlink()
|