makeprov 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,126 @@
1
+ Metadata-Version: 2.4
2
+ Name: makeprov
3
+ Version: 0.1.1
4
+ Summary: An RDF provenance tracking library for simple Python workflows
5
+ Author-email: Benno Kruit <b.b.kruit@amsterdamumc.nl>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/bennokr/makeprov
8
+ Project-URL: Documentation, https://makeprov.readthedocs.io
9
+ Project-URL: Issue Tracker, https://github.com/bennokr/makeprov/issues
10
+ Keywords: provenance,rdf,workflow,python
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Operating System :: OS Independent
14
+ Description-Content-Type: text/markdown
15
+
16
+ # makeprov: Pythonic Provenance Tracking
17
+
18
+ This library provides a way to track file provenance in Python workflows using RDF and PROV (W3C Provenance) semantics. It supports defining input/output files via decorators and automatically generates provenance datasets.
19
+
20
+ ## Features
21
+
22
+ - Use decorators to define rules for workflows.
23
+ - Automatically generate RDF-based provenance metadata.
24
+ - Handles input and output streams.
25
+ - Integrates with Python's type hints for easy configuration.
26
+ - Outputs provenance data in TriG format.
27
+
28
+ ## Installation
29
+
30
+ You can install the module directly from PyPI:
31
+
32
+ ```bash
33
+ pip install makeprov
34
+ ```
35
+
36
+ ## Usage
37
+
38
+ Here’s an example of how to use this package in your Python scripts:
39
+
40
+ ```python
41
+ from makeprov import rule, InFile, OutFile, build
42
+
43
+ @rule()
44
+ def process_data(
45
+ input_file: InFile = InFile('input.txt'),
46
+ output_file: OutFile = OutFile('output.txt')
47
+ ):
48
+ with input_file.open('r') as infile, output_file.open('w') as outfile:
49
+ data = infile.read()
50
+ outfile.write(data.upper())
51
+
52
+ if __name__ == '__main__':
53
+ process_data()
54
+
55
+ # or as a command line interface
56
+ import defopt
57
+ defopt.run(process_data)
58
+
59
+ # or as a workflow graph that automatically (re)generates all dependencies
60
+ from makeprov import build
61
+ build('output.txt')
62
+ ```
63
+
64
+ You can execute `example.py` via the CLI like so:
65
+
66
+ ```bash
67
+ python example.py build-all
68
+
69
+ # Or set configuration through the CLI
70
+ python example.py build-all --conf='base_iri = "http://mybaseiri.org/"' --conf='prov_dir = "my_prov_directory"' --force --input_file input.txt --output_file final_output.txt
71
+
72
+ # Or set configuration through a TOML file
73
+ python example.py build-all --conf=@my_config.toml
74
+ ```
75
+
76
+ ### Complex CSV-to-RDF Workflow
77
+
78
+ For a more involved scenario, see [`complex_example.py`](complex_example.py). It creates multiple CSV files, aggregates their contents, and emits an RDF graph that is both serialized to disk and embedded into the provenance dataset because the function returns an `rdflib.Graph`.
79
+
80
+ ```python
81
+ @rule()
82
+ def export_totals_graph(
83
+ totals_csv: InFile = InFile("data/region_totals.csv"),
84
+ graph_ttl: OutFile = OutFile("data/region_totals.ttl"),
85
+ ) -> Graph:
86
+ graph = Graph()
87
+ graph.bind("sales", SALES)
88
+
89
+ with totals_csv.open("r", newline="") as handle:
90
+ for row in csv.DictReader(handle):
91
+ region_key = row["region"].lower().replace(" ", "-")
92
+ subject = SALES[f"region/{region_key}"]
93
+
94
+ graph.add((subject, RDF.type, SALES.RegionTotal))
95
+ graph.add((subject, SALES.regionName, Literal(row["region"])))
96
+ graph.add((subject, SALES.totalUnits, Literal(row["total_units"], datatype=XSD.integer)))
97
+ graph.add((subject, SALES.totalRevenue, Literal(row["total_revenue"], datatype=XSD.decimal)))
98
+
99
+ with graph_ttl.open("w") as handle:
100
+ handle.write(graph.serialize(format="turtle"))
101
+
102
+ return graph
103
+ ```
104
+
105
+ Run the entire workflow, including CSV generation and RDF export, with:
106
+
107
+ ```bash
108
+ python complex_example.py build-sales-report
109
+ ```
110
+
111
+ ### Configuration
112
+
113
+ You can customize the provenance tracking with the following options:
114
+
115
+ - `base_iri` (str): Base IRI for new resources
116
+ - `prov_dir` (str): Directory for writing PROV `.trig` files
117
+ - `force` (bool): Force running of dependencies
118
+ - `dry_run` (bool): Only check workflow, don't run anything
119
+
120
+ ## Contributing
121
+
122
+ Contributions are welcome! Please open an issue or submit a pull request.
123
+
124
+ ## License
125
+
126
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
@@ -0,0 +1,111 @@
1
+ # makeprov: Pythonic Provenance Tracking
2
+
3
+ This library provides a way to track file provenance in Python workflows using RDF and PROV (W3C Provenance) semantics. It supports defining input/output files via decorators and automatically generates provenance datasets.
4
+
5
+ ## Features
6
+
7
+ - Use decorators to define rules for workflows.
8
+ - Automatically generate RDF-based provenance metadata.
9
+ - Handles input and output streams.
10
+ - Integrates with Python's type hints for easy configuration.
11
+ - Outputs provenance data in TriG format.
12
+
13
+ ## Installation
14
+
15
+ You can install the module directly from PyPI:
16
+
17
+ ```bash
18
+ pip install makeprov
19
+ ```
20
+
21
+ ## Usage
22
+
23
+ Here’s an example of how to use this package in your Python scripts:
24
+
25
+ ```python
26
+ from makeprov import rule, InFile, OutFile, build
27
+
28
+ @rule()
29
+ def process_data(
30
+ input_file: InFile = InFile('input.txt'),
31
+ output_file: OutFile = OutFile('output.txt')
32
+ ):
33
+ with input_file.open('r') as infile, output_file.open('w') as outfile:
34
+ data = infile.read()
35
+ outfile.write(data.upper())
36
+
37
+ if __name__ == '__main__':
38
+ process_data()
39
+
40
+ # or as a command line interface
41
+ import defopt
42
+ defopt.run(process_data)
43
+
44
+ # or as a workflow graph that automatically (re)generates all dependencies
45
+ from makeprov import build
46
+ build('output.txt')
47
+ ```
48
+
49
+ You can execute `example.py` via the CLI like so:
50
+
51
+ ```bash
52
+ python example.py build-all
53
+
54
+ # Or set configuration through the CLI
55
+ python example.py build-all --conf='base_iri = "http://mybaseiri.org/"' --conf='prov_dir = "my_prov_directory"' --force --input_file input.txt --output_file final_output.txt
56
+
57
+ # Or set configuration through a TOML file
58
+ python example.py build-all --conf=@my_config.toml
59
+ ```
60
+
61
+ ### Complex CSV-to-RDF Workflow
62
+
63
+ For a more involved scenario, see [`complex_example.py`](complex_example.py). It creates multiple CSV files, aggregates their contents, and emits an RDF graph that is both serialized to disk and embedded into the provenance dataset because the function returns an `rdflib.Graph`.
64
+
65
+ ```python
66
+ @rule()
67
+ def export_totals_graph(
68
+ totals_csv: InFile = InFile("data/region_totals.csv"),
69
+ graph_ttl: OutFile = OutFile("data/region_totals.ttl"),
70
+ ) -> Graph:
71
+ graph = Graph()
72
+ graph.bind("sales", SALES)
73
+
74
+ with totals_csv.open("r", newline="") as handle:
75
+ for row in csv.DictReader(handle):
76
+ region_key = row["region"].lower().replace(" ", "-")
77
+ subject = SALES[f"region/{region_key}"]
78
+
79
+ graph.add((subject, RDF.type, SALES.RegionTotal))
80
+ graph.add((subject, SALES.regionName, Literal(row["region"])))
81
+ graph.add((subject, SALES.totalUnits, Literal(row["total_units"], datatype=XSD.integer)))
82
+ graph.add((subject, SALES.totalRevenue, Literal(row["total_revenue"], datatype=XSD.decimal)))
83
+
84
+ with graph_ttl.open("w") as handle:
85
+ handle.write(graph.serialize(format="turtle"))
86
+
87
+ return graph
88
+ ```
89
+
90
+ Run the entire workflow, including CSV generation and RDF export, with:
91
+
92
+ ```bash
93
+ python complex_example.py build-sales-report
94
+ ```
95
+
96
+ ### Configuration
97
+
98
+ You can customize the provenance tracking with the following options:
99
+
100
+ - `base_iri` (str): Base IRI for new resources
101
+ - `prov_dir` (str): Directory for writing PROV `.trig` files
102
+ - `force` (bool): Force running of dependencies
103
+ - `dry_run` (bool): Only check workflow, don't run anything
104
+
105
+ ## Contributing
106
+
107
+ Contributions are welcome! Please open an issue or submit a pull request.
108
+
109
+ ## License
110
+
111
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
@@ -0,0 +1,25 @@
1
+ [build-system]
2
+ requires = ["setuptools", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "makeprov"
7
+ version = "0.1.1"
8
+ description = "An RDF provenance tracking library for simple Python workflows"
9
+ readme = "README.md"
10
+ license = { text = "MIT" }
11
+ authors = [{ name = "Benno Kruit", email = "b.b.kruit@amsterdamumc.nl" }]
12
+ keywords = ["provenance", "rdf", "workflow", "python"]
13
+ classifiers = [
14
+ "Programming Language :: Python :: 3",
15
+ "License :: OSI Approved :: MIT License",
16
+ "Operating System :: OS Independent",
17
+ ]
18
+
19
+ [project.urls]
20
+ "Homepage" = "https://github.com/bennokr/makeprov"
21
+ "Documentation" = "https://makeprov.readthedocs.io"
22
+ "Issue Tracker" = "https://github.com/bennokr/makeprov/issues"
23
+
24
+ [tool.pytape]
25
+ test = "tests/test_makeprov.py"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,45 @@
1
+ from dataclasses import fields, is_dataclass
2
+ import sys, logging, tomllib as toml, defopt
3
+ import argparse
4
+
5
+
6
def main(subcommands, conf_obj, parsers=None):
    """Dispatch *subcommands* via defopt after applying global options early.

    Parses ``-c/--conf`` (TOML snippet or ``@file`` reference) and
    ``-v/--verbose`` from ``sys.argv`` before defopt runs, configures
    logging, and mutates *conf_obj* (a dataclass) in place.

    Args:
        subcommands: callable(s) passed straight to ``defopt.run``.
        conf_obj: dataclass instance updated from the parsed TOML.
        parsers: optional custom type-parsers forwarded to defopt.
    """

    def conf(dc, params):
        # Recursively overwrite dataclass fields from a (possibly nested) dict;
        # nested dataclasses are merged rather than replaced wholesale.
        for f in fields(dc):
            if f.name in params:
                cur, new = getattr(dc, f.name), params[f.name]
                if is_dataclass(cur) and isinstance(new, dict):
                    conf(cur, new)
                else:
                    setattr(dc, f.name, new)

    parent = argparse.ArgumentParser(add_help=False)
    parent.add_argument(
        "-c",
        "--conf",
        action="append",
        default=[],
        help="Set config param from TOML snippet or @file",
    )
    parent.add_argument(
        "-v", "--verbose", action="count", default=0, help="Show more logging output"
    )

    def apply_globals(argv):
        ns, _ = parent.parse_known_args(argv)
        # Clamp -v count into the three supported levels.
        lvl = ("WARNING", "INFO", "DEBUG")[min(max(ns.verbose, 0), 2)]
        logging.basicConfig(level=getattr(logging, lvl))
        for t in ns.conf:
            logging.debug(f"Parsing config {t}")
            if t.startswith("@"):
                # FIX: close the TOML file instead of leaking the handle
                # (original used toml.load(open(...)) without closing).
                with open(t[1:], "rb") as fh:
                    p = toml.load(fh)
            else:
                p = toml.loads(t)
            logging.debug(f"Setting config {p}")
            conf(conf_obj, p)

    apply_globals(sys.argv[1:])  # apply effects early
    logging.debug(f"Config: {conf_obj}")
    defopt.run(
        subcommands,
        parsers=parsers or {},
        argv=sys.argv[1:],
        argparse_kwargs={"parents": [parent]},
    )
@@ -0,0 +1,126 @@
1
+ Metadata-Version: 2.4
2
+ Name: makeprov
3
+ Version: 0.1.1
4
+ Summary: An RDF provenance tracking library for simple Python workflows
5
+ Author-email: Benno Kruit <b.b.kruit@amsterdamumc.nl>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/bennokr/makeprov
8
+ Project-URL: Documentation, https://makeprov.readthedocs.io
9
+ Project-URL: Issue Tracker, https://github.com/bennokr/makeprov/issues
10
+ Keywords: provenance,rdf,workflow,python
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Operating System :: OS Independent
14
+ Description-Content-Type: text/markdown
15
+
16
+ # makeprov: Pythonic Provenance Tracking
17
+
18
+ This library provides a way to track file provenance in Python workflows using RDF and PROV (W3C Provenance) semantics. It supports defining input/output files via decorators and automatically generates provenance datasets.
19
+
20
+ ## Features
21
+
22
+ - Use decorators to define rules for workflows.
23
+ - Automatically generate RDF-based provenance metadata.
24
+ - Handles input and output streams.
25
+ - Integrates with Python's type hints for easy configuration.
26
+ - Outputs provenance data in TriG format.
27
+
28
+ ## Installation
29
+
30
+ You can install the module directly from PyPI:
31
+
32
+ ```bash
33
+ pip install makeprov
34
+ ```
35
+
36
+ ## Usage
37
+
38
+ Here’s an example of how to use this package in your Python scripts:
39
+
40
+ ```python
41
+ from makeprov import rule, InFile, OutFile, build
42
+
43
+ @rule()
44
+ def process_data(
45
+ input_file: InFile = InFile('input.txt'),
46
+ output_file: OutFile = OutFile('output.txt')
47
+ ):
48
+ with input_file.open('r') as infile, output_file.open('w') as outfile:
49
+ data = infile.read()
50
+ outfile.write(data.upper())
51
+
52
+ if __name__ == '__main__':
53
+ process_data()
54
+
55
+ # or as a command line interface
56
+ import defopt
57
+ defopt.run(process_data)
58
+
59
+ # or as a workflow graph that automatically (re)generates all dependencies
60
+ from makeprov import build
61
+ build('output.txt')
62
+ ```
63
+
64
+ You can execute `example.py` via the CLI like so:
65
+
66
+ ```bash
67
+ python example.py build-all
68
+
69
+ # Or set configuration through the CLI
70
+ python example.py build-all --conf='base_iri = "http://mybaseiri.org/"' --conf='prov_dir = "my_prov_directory"' --force --input_file input.txt --output_file final_output.txt
71
+
72
+ # Or set configuration through a TOML file
73
+ python example.py build-all --conf=@my_config.toml
74
+ ```
75
+
76
+ ### Complex CSV-to-RDF Workflow
77
+
78
+ For a more involved scenario, see [`complex_example.py`](complex_example.py). It creates multiple CSV files, aggregates their contents, and emits an RDF graph that is both serialized to disk and embedded into the provenance dataset because the function returns an `rdflib.Graph`.
79
+
80
+ ```python
81
+ @rule()
82
+ def export_totals_graph(
83
+ totals_csv: InFile = InFile("data/region_totals.csv"),
84
+ graph_ttl: OutFile = OutFile("data/region_totals.ttl"),
85
+ ) -> Graph:
86
+ graph = Graph()
87
+ graph.bind("sales", SALES)
88
+
89
+ with totals_csv.open("r", newline="") as handle:
90
+ for row in csv.DictReader(handle):
91
+ region_key = row["region"].lower().replace(" ", "-")
92
+ subject = SALES[f"region/{region_key}"]
93
+
94
+ graph.add((subject, RDF.type, SALES.RegionTotal))
95
+ graph.add((subject, SALES.regionName, Literal(row["region"])))
96
+ graph.add((subject, SALES.totalUnits, Literal(row["total_units"], datatype=XSD.integer)))
97
+ graph.add((subject, SALES.totalRevenue, Literal(row["total_revenue"], datatype=XSD.decimal)))
98
+
99
+ with graph_ttl.open("w") as handle:
100
+ handle.write(graph.serialize(format="turtle"))
101
+
102
+ return graph
103
+ ```
104
+
105
+ Run the entire workflow, including CSV generation and RDF export, with:
106
+
107
+ ```bash
108
+ python complex_example.py build-sales-report
109
+ ```
110
+
111
+ ### Configuration
112
+
113
+ You can customize the provenance tracking with the following options:
114
+
115
+ - `base_iri` (str): Base IRI for new resources
116
+ - `prov_dir` (str): Directory for writing PROV `.trig` files
117
+ - `force` (bool): Force running of dependencies
118
+ - `dry_run` (bool): Only check workflow, don't run anything
119
+
120
+ ## Contributing
121
+
122
+ Contributions are welcome! Please open an issue or submit a pull request.
123
+
124
+ ## License
125
+
126
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
@@ -0,0 +1,9 @@
1
+ README.md
2
+ pyproject.toml
3
+ src/config.py
4
+ src/makeprov.py
5
+ src/makeprov.egg-info/PKG-INFO
6
+ src/makeprov.egg-info/SOURCES.txt
7
+ src/makeprov.egg-info/dependency_links.txt
8
+ src/makeprov.egg-info/top_level.txt
9
+ tests/test_makeprov.py
@@ -0,0 +1,2 @@
1
+ config
2
+ makeprov
@@ -0,0 +1,593 @@
1
+ from __future__ import annotations
2
+
3
+ import functools
4
+ import logging
5
+ import mimetypes
6
+ import subprocess
7
+ import sys
8
+ import inspect
9
+ import hashlib
10
+ import re
11
+ from dataclasses import dataclass
12
+ from pathlib import Path
13
+ from datetime import datetime, timezone
14
+ from typing import get_origin, get_args, get_type_hints, Any
15
+ from abc import ABC, abstractmethod
16
+ from collections.abc import Callable
17
+ import importlib.metadata as im
18
+
19
+ import rdflib
20
+ from rdflib import RDF, RDFS, XSD
21
+ from rdflib.namespace import DCTERMS as DCT
22
+
23
+ PROV = rdflib.Namespace("http://www.w3.org/ns/prov#")
24
+
25
+ # ----------------------------------------------------------------------
26
+ # Config
27
+ # ----------------------------------------------------------------------
28
+
29
+
30
@dataclass
class ProvenanceConfig:
    """Runtime settings shared by provenance-tracked rules.

    A module-level instance (``GLOBAL_CONFIG``) serves as the default;
    ``rule(...)`` copies and selectively overrides these fields per rule.
    """

    base_iri: str = "http://example.org/"  # base IRI for minted PROV resources
    prov_dir: str = "prov"  # directory where per-rule .trig files are written
    force: bool = False  # if True, run even when up to date
    dry_run: bool = False  # if True, do not run, just log
36
+
37
+
38
+ GLOBAL_CONFIG = ProvenanceConfig()
39
+
40
+
41
+ # ----------------------------------------------------------------------
42
+ # File marker hierarchy (with "-" as stdin/stdout)
43
+ # ----------------------------------------------------------------------
44
+
45
+
46
class File(ABC):
    """Abstract base for InFile / OutFile.

    Common fields:
    - raw: original string
    - path: filesystem path or None (for streams)
    - is_stream: True if "-" (stdin/stdout)
    - stream_name: "stdin" / "stdout" / None

    Instances with a real path are os.PathLike (see ``__fspath__``), so
    they can be passed to ``open()``, ``Path()`` etc. directly.
    """

    def __init__(self, path: str | Path, stream_name: str | None):
        raw = str(path)
        self.raw = raw

        # The literal "-" selects the stream variant; anything else is a path.
        if raw == "-":
            self.is_stream = True
            self.stream_name = stream_name
            self.path: Path | None = None
        else:
            self.is_stream = False
            self.stream_name = None
            self.path = Path(path)

    def __fspath__(self):
        # os.PathLike protocol; deliberately rejects stream markers so
        # a "-" file cannot silently become the literal filename "-".
        if self.is_stream or self.path is None:
            raise TypeError(
                f"{self.__class__.__name__}('-') does not have a filesystem path"
            )
        return str(self.path)

    def __str__(self) -> str:
        return self.raw

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}({self.raw!r})"

    @abstractmethod
    def open(self, mode: str = "", *args, **kwargs):
        """Open underlying file or stream."""
        ...
86
+
87
+
88
class InFile(File):
    """Input-file marker used for provenance tracking and the build DAG.

    The special string "-" denotes standard input.
    """

    def __init__(self, path: str):
        super().__init__(path, stream_name="stdin")

    def open(self, mode: str = "r", *args, **kwargs):
        """Open the input for reading; stdin (text or binary) for "-"."""
        if not self.is_stream:
            if self.path is None:
                raise ValueError("InFile has no path")
            return self.path.open(mode, *args, **kwargs)
        # Binary modes get the raw buffer, text modes the text wrapper.
        return sys.stdin.buffer if "b" in mode else sys.stdin
104
+
105
+
106
class OutFile(File):
    """Output-file marker used for provenance tracking and the build DAG.

    The special string "-" denotes standard output.
    """

    def __init__(self, path: str):
        super().__init__(path, stream_name="stdout")

    def as_infile(self) -> InFile:
        """Convert an OutFile path into a new InFile for downstream steps."""
        if self.is_stream or self.path is None:
            raise ValueError("Cannot convert a stream-based OutFile into an InFile")
        return InFile(str(self.path))

    def open(self, mode: str = "w", *args, **kwargs):
        """Open the output for writing; stdout (text or binary) for "-".

        Parent directories are created on demand for real paths.
        """
        if not self.is_stream:
            if self.path is None:
                raise ValueError("OutFile has no path")
            # Make sure the destination directory exists before opening.
            self.path.parent.mkdir(parents=True, exist_ok=True)
            return self.path.open(mode, *args, **kwargs)
        return sys.stdout.buffer if "b" in mode else sys.stdout
129
+
130
+
131
+ # ----------------------------------------------------------------------
132
+ # Registry + basic Make-like build
133
+ # ----------------------------------------------------------------------
134
+
135
+ RULES: dict[str, dict[str, Any]] = {}
136
+ COMMANDS: set[Callable] = set()
137
+
138
def _caller_script() -> Path:
    """Best-effort resolution of the script driving this run.

    Tried in order:
    1. the ``__main__`` module's ``__file__``,
    2. ``sys.argv[0]`` if it exists on disk,
    3. the outermost stack frame with a plausible filename.

    Falls back to ``Path("unknown")`` when nothing resolves (e.g. in a REPL).
    """
    mod = sys.modules.get("__main__")
    if getattr(mod, "__file__", None):
        return Path(mod.__file__).resolve()

    if sys.argv and sys.argv[0]:
        p = Path(sys.argv[0])
        if p.exists():
            return p.resolve()

    # reversed(): the entry point sits at the bottom of the call stack.
    # Suffix "" is allowed to cover extensionless launcher scripts.
    for f in reversed(inspect.stack()):
        p = Path(f.filename)
        if p.suffix in {".py", ""}:
            return p.resolve()

    return Path("unknown")
154
+
155
+
156
+ def _safe_cmd(argv: list[str]) -> str | None:
157
+ try:
158
+ return subprocess.run(
159
+ argv, check=True, capture_output=True, text=True
160
+ ).stdout.strip()
161
+ except Exception: # noqa: BLE001
162
+ return None
163
+
164
+
165
def needs_update(outputs, deps) -> bool:
    """Return True if outputs missing or older than any dependency.

    Make-style freshness check: a rule with no outputs always runs; a rule
    whose dependencies are all absent (or empty) is considered up to date
    once its outputs exist.
    """
    outs = [Path(p) for p in outputs]
    if not outs:
        return True
    if not all(p.exists() for p in outs):
        return True

    oldest_output = min(p.stat().st_mtime for p in outs)
    existing_deps = [Path(p) for p in deps if Path(p).exists()]
    if not existing_deps:
        return False

    return max(p.stat().st_mtime for p in existing_deps) > oldest_output
183
+
184
+
185
def build(target, _seen=None):
    """
    Recursively build target after its dependencies, if needed.

    `target` is a path (string/Path). Only rules with default
    OutFile paths are part of this DAG.

    Raises:
        KeyError: if no registered rule produces *target*.
        RuntimeError: if the dependency graph contains a cycle.
    """
    if _seen is None:
        _seen = set()
    target = str(target)

    if target in _seen:
        raise RuntimeError(f"Cycle in build graph at {target!r}")
    if target not in RULES:
        # FIX: fail with an explicit message instead of a bare KeyError.
        raise KeyError(f"No rule produces target {target!r}")

    _seen.add(target)
    try:
        rule = RULES[target]

        for dep in rule["deps"]:
            if dep in RULES:
                build(dep, _seen)

        rule["func"]()
    finally:
        # FIX: remove the node once finished so diamond-shaped graphs
        # (two parents sharing one dependency) are not misreported as
        # cycles; only an *active* ancestor chain constitutes a cycle.
        # Re-running a shared dep is cheap: the wrapped rule skips itself
        # when already up to date.
        _seen.discard(target)
207
+
208
+
209
+ # ----------------------------------------------------------------------
210
+ # PROV helpers
211
+ # ----------------------------------------------------------------------
212
+
213
+
214
def _describe_file(g: rdflib.Graph, base: rdflib.Namespace, path: Path, kind: str):
    """Describe *path* as a PROV entity in graph *g* and return its IRI.

    Args:
        g: graph to add triples to (the dataset's default context).
        base: namespace used to mint the entity IRI.
        path: file to describe; may no longer exist.
        kind: "src" for inputs, "out" for outputs. Inputs additionally get
            a sha256 content identifier.
    """
    iri = base[f"{kind}/{path.as_posix()}"]

    mtype = mimetypes.guess_type(path.name)[0] or "application/octet-stream"

    # FIX: stat once instead of exists()+stat()+stat(), so size and mtime
    # come from the same snapshot and the exists/stat race is avoided.
    try:
        st = path.stat()
        size = st.st_size
        mtime = datetime.fromtimestamp(st.st_mtime, tz=timezone.utc).isoformat()
    except OSError:
        size = 0
        mtime = None

    g.add((iri, RDF.type, PROV.Entity))
    g.add((iri, DCT.format, rdflib.Literal(mtype)))
    g.add((iri, DCT.extent, rdflib.Literal(size, datatype=XSD.integer)))
    if mtime:
        g.add((iri, DCT.modified, rdflib.Literal(mtime, datatype=XSD.dateTime)))

    if kind == "src":
        # Content hash is best-effort; unreadable/vanished files are skipped.
        try:
            sha = hashlib.sha256(path.read_bytes()).hexdigest()
            g.add((iri, DCT.identifier, rdflib.Literal(f"sha256:{sha}")))
        except Exception:  # noqa: BLE001
            pass

    return iri
239
+
240
+
241
def project_metadata(dist_name: str | None = None):
    """
    Return (name, version, requires) using importlib.metadata.

    If dist_name is None, tries to infer from the caller module.
    Returns (None, None, []) whenever no installed distribution can be
    resolved, so callers never have to handle exceptions.
    """
    if dist_name is None:
        # crude default: module name of the caller
        # (stack()[1] is the frame that invoked project_metadata)
        frame = inspect.stack()[1]
        module = inspect.getmodule(frame[0])
        if module and module.__package__:
            # top-level package name, e.g. "pkg" from "pkg.sub.mod"
            dist_name = module.__package__.split(".", 1)[0]
        else:
            return None, None, []

    try:
        dist = im.distribution(dist_name)
    except im.PackageNotFoundError:
        return None, None, []

    name = dist.metadata.get("Name")
    version = dist.version
    requires = dist.requires or []
    return name, version, requires
265
+
266
def pep503_normalize(name: str) -> str:
    """PEP 503 normalization: lowercase, runs of [-_.] -> '-'."""
    cleaned = name.strip().lower()
    return re.sub(r"[-_.]+", "-", cleaned)
271
+
272
+
273
def add_deps_to_env(
    D: rdflib.Graph,
    env_iri: rdflib.term.Identifier,
    deps_specs: list[str],
):
    """
    Attach dependencies from deps_specs to env_iri using PyPI-based IRIs.

    deps_specs: e.g. ["rdflib>=6.0.0", "pydantic==2.0.3"]

    Each spec becomes an rdfs:Resource labelled with the raw spec string,
    linked from the environment via dcterms:requires.
    """
    for spec in deps_specs:
        spec_str = spec.strip()
        if not spec_str:
            continue

        # Drop environment markers etc. after the first whitespace.
        pkg = spec_str.split()[0]
        # keep only the name part before version operator
        # (maxsplit passed by keyword: positional form is deprecated in 3.13)
        pkg_name = re.split(r"[<>=!~ ]", pkg, maxsplit=1)[0]
        norm = pep503_normalize(pkg_name)

        dep_iri = rdflib.URIRef(f"https://pypi.org/project/{norm}/")

        # BUG FIX: rdflib's RDF namespace is closed and has no `Resource`
        # term — accessing RDF.Resource raises AttributeError at runtime.
        # The intended class is rdfs:Resource.
        D.add((dep_iri, RDF.type, RDFS.Resource))
        D.add((dep_iri, RDFS.label, rdflib.Literal(spec_str)))
        # link environment -> dependency
        D.add((env_iri, DCT.requires, dep_iri))
299
+
300
def _augment_with_metadata(
    D: rdflib.Graph,
    base: rdflib.Namespace,
    activity: rdflib.term.Identifier,
    run_id: str,
):
    """Add metadata + dependency info to provenance, if possible.

    Creates an "environment" prov:Collection entity (one per run_id) holding
    the running distribution's name/version, links it to *activity* via
    prov:used, and attaches its declared dependencies. Silently does
    nothing when no distribution metadata can be resolved.
    """
    name, version, deps_specs = project_metadata()

    if name or version or deps_specs:
        env = base[f"env/{run_id}"]
        D.add((env, RDF.type, PROV.Entity))
        D.add((env, RDF.type, PROV.Collection))
        D.add((env, RDFS.label, rdflib.Literal("Python environment")))
        D.add((activity, PROV.used, env))

        if name:
            D.add((env, DCT.title, rdflib.Literal(name)))
        if version:
            D.add((env, DCT.hasVersion, rdflib.Literal(version)))

        add_deps_to_env(D, env, deps_specs)
322
+
323
+
324
def _write_provenance_dataset(
    base_iri: str,
    name: str,
    prov_path: str | Path,
    deps: list[str],
    outputs: list[str],
    t0: datetime,
    t1: datetime,
    data_graph: rdflib.Graph | None = None,
    success: bool = True,
):
    """
    Build a Dataset with:
    - default graph: PROV metadata
    - named graph: data_graph (if provided)
    and serialize as Trig to prov_path.

    Args:
        base_iri: namespace under which run/agent/graph/file IRIs are minted.
        name: logical rule name (used in the activity and graph IRIs).
        prov_path: destination .trig file; parent dirs are created.
        deps / outputs: input/output file paths; missing files are skipped.
        t0 / t1: activity start/end timestamps.
        data_graph: optional RDF result of the rule, embedded as a named graph.
        success: False marks the activity with a "task failed" comment.
    """
    base = rdflib.Namespace(base_iri)
    ds = rdflib.Dataset()
    D = ds.default_context

    ds.bind("", base)
    ds.bind("prov", PROV)
    ds.bind("dcterms", DCT)

    # run_id makes each invocation's activity (and env) IRI unique per second.
    run_id = t0.strftime("%Y%m%dT%H%M%S")
    script = _caller_script()

    activity = base[f"run/{name}/{run_id}"]
    agent = base[f"agent/{script.name}"]
    graph_iri = base[f"graph/{name}"]

    # Best-effort git info; None when not in a repo or git is unavailable.
    commit = _safe_cmd(["git", "rev-parse", "HEAD"])
    origin = _safe_cmd(["git", "config", "--get", "remote.origin.url"])

    D.add((activity, RDF.type, PROV.Activity))
    t0_term = rdflib.Literal(t0.isoformat(), datatype=XSD.dateTime)
    D.add((activity, PROV.startedAtTime, t0_term))
    t1_term = rdflib.Literal(t1.isoformat(), datatype=XSD.dateTime)
    D.add((activity, PROV.endedAtTime, t1_term))

    D.add((agent, RDF.type, PROV.SoftwareAgent))
    D.add((agent, RDFS.label, rdflib.Literal(script.name)))
    if commit:
        D.add((agent, DCT.hasVersion, rdflib.Literal(commit)))
    if origin:
        D.add((agent, DCT.source, rdflib.URIRef(origin)))
    D.add((activity, PROV.wasAssociatedWith, agent))

    if data_graph is not None:
        # Copy the rule's result triples into a named graph of the dataset.
        gx = ds.get_context(graph_iri)
        for triple in data_graph:
            gx.add(triple)

        D.add((graph_iri, RDF.type, PROV.Entity))
        D.add((graph_iri, PROV.wasGeneratedBy, activity))
        D.add((graph_iri, PROV.wasAttributedTo, agent))
        D.add((graph_iri, PROV.generatedAtTime, t1_term))

    for d in deps:
        p = Path(d)
        if not p.exists():
            continue
        src = _describe_file(D, base, p, "src")
        D.add((activity, PROV.used, src))

    for o in outputs:
        p = Path(o)
        if not p.exists():
            continue
        ent = _describe_file(D, base, p, "out")
        D.add((ent, PROV.wasGeneratedBy, activity))

    if not success:
        D.add((activity, RDFS.comment, rdflib.Literal("task failed")))

    # Add pyproject + dependency information, if available
    _augment_with_metadata(D, base, activity, run_id)

    prov_path = Path(prov_path)
    prov_path.parent.mkdir(parents=True, exist_ok=True)
    logging.info("Writing provenance dataset %s", prov_path)
    ds.serialize(prov_path, format="trig")
407
+
408
+
409
+ # ----------------------------------------------------------------------
410
+ # Annotation helpers (supports Optional[InFile], OutFile | None, etc.)
411
+ # ----------------------------------------------------------------------
412
+
413
+
414
+ def _is_kind_annotation(ann: Any, cls: type) -> bool:
415
+ if ann is cls:
416
+ return True
417
+
418
+ origin = get_origin(ann)
419
+ if origin is None:
420
+ return False
421
+
422
+ return any(a is cls for a in get_args(ann))
423
+
424
+
425
+ # ----------------------------------------------------------------------
426
+ # Decorator: infer inputs/outputs from signature
427
+ # ----------------------------------------------------------------------
428
+
429
+
430
def rule(
    *,
    name: str | None = None,
    base_iri: str | None = None,
    prov_dir: str | None = None,
    prov_path: str | None = None,
    force: bool | None = None,
    dry_run: bool | None = None,
    config: ProvenanceConfig | None = None,
):
    """Decorator factory turning a function into a provenance-tracked rule.

    InFile/OutFile parameters are discovered from the signature (including
    Optional[...] / union forms). On each call the wrapped function:
    skips itself when outputs are up to date (unless forced), honours
    dry_run, and always writes a PROV .trig dataset — even on failure.
    Rules with default OutFile paths are also registered for `build()`.

    Keyword-only overrides (base_iri, prov_dir, force, dry_run) take
    precedence over *config*, which defaults to GLOBAL_CONFIG.
    """
    base_config = config or GLOBAL_CONFIG

    # Per-rule config: explicit keyword overrides win over base_config.
    rule_config = ProvenanceConfig(
        base_iri=base_iri if base_iri is not None else base_config.base_iri,
        prov_dir=prov_dir if prov_dir is not None else base_config.prov_dir,
        force=force if force is not None else base_config.force,
        dry_run=dry_run if dry_run is not None else base_config.dry_run,
    )

    def decorator(func):
        sig = inspect.signature(func)
        hints = get_type_hints(func)

        # Names of parameters annotated as inputs / outputs.
        in_params: list[str] = []
        out_params: list[str] = []

        for p in sig.parameters.values():
            ann = hints.get(p.name, p.annotation)
            if _is_kind_annotation(ann, InFile):
                in_params.append(p.name)
            if _is_kind_annotation(ann, OutFile):
                out_params.append(p.name)

        if not out_params:
            raise ValueError(
                f"Function {func.__name__} must have at least one "
                f"OutFile (possibly Optional[OutFile]) parameter"
            )

        # Static deps/outputs taken from *default* values only — these
        # define the rule's place in the build DAG. Stream markers ("-")
        # are excluded. NOTE: inspect._empty is private; the public
        # spelling is inspect.Parameter.empty.
        deps: list[str] = []
        outputs: list[str] = []

        for p in sig.parameters.values():
            if p.name in in_params and p.default is not inspect._empty:
                val = p.default
                if isinstance(val, InFile):
                    if getattr(val, "is_stream", False):
                        pass
                    elif val.path is not None:
                        deps.append(str(val.path))
                elif isinstance(val, (str, Path)):
                    if str(val) != "-":
                        deps.append(str(val))

            if p.name in out_params and p.default is not inspect._empty:
                val = p.default
                if isinstance(val, OutFile):
                    if getattr(val, "is_stream", False):
                        pass
                    elif val.path is not None:
                        outputs.append(str(val.path))
                elif isinstance(val, (str, Path)):
                    if str(val) != "-":
                        outputs.append(str(val))

        # Only rules with at least one concrete default output join the DAG.
        register_for_build = bool(outputs)
        logical_name = name or func.__name__

        if prov_path is not None:
            rule_prov_path = prov_path
        else:
            rule_prov_path = str(Path(rule_config.prov_dir) / f"{logical_name}.trig")

        @functools.wraps(func)
        def wrapped(*args, **kwargs):
            bound = sig.bind_partial(*args, **kwargs)
            bound.apply_defaults()

            # Dynamic (per-call) file lists; may differ from the static
            # deps/outputs when callers pass non-default paths.
            in_files: list[Path] = []
            out_files: list[Path] = []

            for pname in in_params:
                val = bound.arguments.get(pname)
                if isinstance(val, InFile):
                    if val.is_stream or val.path is None:
                        continue
                    in_files.append(val.path)
                elif val is None:
                    continue
                else:
                    if str(val) != "-":
                        in_files.append(Path(val))

            for pname in out_params:
                val = bound.arguments.get(pname)
                if isinstance(val, OutFile):
                    if val.is_stream or val.path is None:
                        continue
                    out_files.append(val.path)
                elif val is None:
                    continue
                else:
                    if str(val) != "-":
                        out_files.append(Path(val))

            # Make-style short-circuit: skip when outputs are fresh.
            if not rule_config.force and not needs_update(out_files, in_files):
                logging.info("Skipping %s (up to date)", logical_name)
                return None

            if rule_config.dry_run:
                logging.info(
                    "Dry-run %s: would run with %s -> %s",
                    logical_name,
                    in_files,
                    out_files,
                )
                return None

            t0 = datetime.now(timezone.utc)
            exc: Exception | None = None
            data_graph: rdflib.Graph | None = None
            result = None

            try:
                result = func(*bound.args, **bound.kwargs)
                # An RDF result is embedded into the provenance dataset.
                if isinstance(result, (rdflib.Graph, rdflib.Dataset)):
                    data_graph = result
                return result
            except Exception as e:
                exc = e
                raise
            finally:
                # Provenance is recorded for failures too (success flag);
                # a failure to *write* provenance never masks the result.
                t1 = datetime.now(timezone.utc)
                try:
                    _write_provenance_dataset(
                        base_iri=rule_config.base_iri,
                        name=logical_name,
                        prov_path=rule_prov_path,
                        deps=[str(p) for p in in_files],
                        outputs=[str(p) for p in out_files],
                        t0=t0,
                        t1=t1,
                        data_graph=data_graph,
                        success=exc is None,
                    )
                except Exception as prov_exc:  # noqa: BLE001
                    logging.warning(
                        "Failed to write provenance for %s: %s",
                        logical_name,
                        prov_exc,
                    )

        COMMANDS.add(wrapped)
        if register_for_build:
            # First default output is the canonical build target.
            target = outputs[0]
            RULES[target] = {
                "deps": deps,
                "outputs": outputs,
                "func": wrapped,
            }

        return wrapped

    return decorator
@@ -0,0 +1,71 @@
1
+ import tempfile
2
+ from pathlib import Path
3
+
4
+ from rdflib import Graph, Literal, Namespace
5
+ from rdflib.namespace import RDF, XSD
6
+
7
+ from makeprov import InFile, OutFile, ProvenanceConfig, rule
8
+
9
@rule(name="test_process_data")
def process_data(input_file: InFile, output_file: OutFile):
    """Copy the contents of *input_file* to *output_file* unchanged."""
    with input_file.open('r') as src, output_file.open('w') as dst:
        dst.write(src.read())
14
+
15
+
16
# Namespace shared by all triples the test rules emit.
SALES_NS = Namespace("http://example.org/test/")
# Dedicated temp directory so provenance .trig files don't pollute the CWD.
# NOTE(review): mkdtemp leaves the directory behind after the run — the
# tests only unlink the files they create inside it.
TEST_PROV_DIR = Path(tempfile.mkdtemp(prefix="makeprov-tests-"))
TEST_PROV_CONFIG = ProvenanceConfig(prov_dir=str(TEST_PROV_DIR))
19
+
20
+
21
@rule(name="test_totals_graph", config=TEST_PROV_CONFIG)
def totals_graph(input_csv: InFile, graph_out: OutFile) -> Graph:
    """Build an RDF graph of per-region sales totals from a CSV file.

    Each data row of *input_csv* is expected to hold
    ``region,total_units,total_revenue``. The resulting graph is
    serialized to *graph_out* as Turtle and also returned, so the
    ``@rule`` wrapper can capture it as a provenance data graph.

    Raises:
        ValueError: if *graph_out* is a stream without a filesystem path.
    """
    import csv  # local import: only this rule needs it

    graph = Graph()
    graph.bind("sales", SALES_NS)

    if graph_out.path is None:
        raise ValueError("graph_out must have a filesystem path")
    graph_out.path.parent.mkdir(parents=True, exist_ok=True)

    with input_csv.open('r') as handle:
        # csv.reader (vs. naive str.split(',')) correctly handles quoted
        # fields that contain commas, and normalizes line endings.
        reader = csv.reader(handle)
        next(reader, None)  # skip the header row
        for row in reader:
            if not row:  # tolerate blank lines (e.g. trailing newline)
                continue
            region, units, revenue = row
            subject = SALES_NS[f"region/{region.lower()}"]
            graph.add((subject, RDF.type, SALES_NS.RegionTotal))
            graph.add((subject, SALES_NS.regionName, Literal(region)))
            graph.add((subject, SALES_NS.totalUnits, Literal(units, datatype=XSD.integer)))
            graph.add((subject, SALES_NS.totalRevenue, Literal(revenue, datatype=XSD.decimal)))

    with graph_out.open('w') as handle:
        handle.write(graph.serialize(format='turtle'))

    return graph
43
+
44
+
45
def test_process_data(tmp_path):
    """The copy rule must reproduce the input file's contents exactly."""
    input_file = tmp_path / "input.txt"
    output_file = tmp_path / "output.txt"

    input_file.write_text("Hello, world!")

    # Run the rule; we only care about its side effect on the filesystem,
    # so the (previously unused) return value is deliberately discarded.
    process_data(InFile(str(input_file)), OutFile(str(output_file)))

    # The output file must exist and mirror the input byte-for-byte.
    assert output_file.exists()
    assert output_file.read_text() == "Hello, world!"
57
+
58
+
59
def test_rule_returns_graph(tmp_path):
    """A rule that returns a Graph hands it back and records provenance."""
    csv_path = tmp_path / "region_totals.csv"
    ttl_path = tmp_path / "region_totals.ttl"
    csv_path.write_text("region,total_units,total_revenue\nNorth,6,119.94\n")

    graph = totals_graph(InFile(str(csv_path)), OutFile(str(ttl_path)))

    # The rule returns the graph it built...
    assert isinstance(graph, Graph)
    # ...and serialized it to the requested Turtle file.
    assert ttl_path.exists()
    assert "North" in ttl_path.read_text()

    # Provenance lands in the directory configured by TEST_PROV_CONFIG;
    # remove the file so repeated runs start from a clean slate.
    prov_file = TEST_PROV_DIR / "test_totals_graph.trig"
    assert prov_file.exists()
    prov_file.unlink()