makeprov 0.2.0__tar.gz → 0.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {makeprov-0.2.0 → makeprov-0.2.2}/PKG-INFO +1 -1
- {makeprov-0.2.0 → makeprov-0.2.2}/pyproject.toml +1 -1
- {makeprov-0.2.0 → makeprov-0.2.2}/src/makeprov/config.py +24 -8
- {makeprov-0.2.0 → makeprov-0.2.2}/src/makeprov/core.py +53 -9
- {makeprov-0.2.0 → makeprov-0.2.2}/src/makeprov/paths.py +10 -7
- {makeprov-0.2.0 → makeprov-0.2.2}/src/makeprov/prov.py +98 -26
- {makeprov-0.2.0 → makeprov-0.2.2}/src/makeprov.egg-info/PKG-INFO +1 -1
- makeprov-0.2.2/tests/test_makeprov.py +160 -0
- makeprov-0.2.0/tests/test_makeprov.py +0 -67
- {makeprov-0.2.0 → makeprov-0.2.2}/README.md +0 -0
- {makeprov-0.2.0 → makeprov-0.2.2}/setup.cfg +0 -0
- {makeprov-0.2.0 → makeprov-0.2.2}/src/makeprov/__init__.py +0 -0
- {makeprov-0.2.0 → makeprov-0.2.2}/src/makeprov/jsonld.py +0 -0
- {makeprov-0.2.0 → makeprov-0.2.2}/src/makeprov.egg-info/SOURCES.txt +0 -0
- {makeprov-0.2.0 → makeprov-0.2.2}/src/makeprov.egg-info/dependency_links.txt +0 -0
- {makeprov-0.2.0 → makeprov-0.2.2}/src/makeprov.egg-info/requires.txt +0 -0
- {makeprov-0.2.0 → makeprov-0.2.2}/src/makeprov.egg-info/top_level.txt +0 -0
- {makeprov-0.2.0 → makeprov-0.2.2}/tests/test_prov_shacl.py +0 -0
|
@@ -10,14 +10,16 @@ ProvFormat = Literal["json", "trig"]
|
|
|
10
10
|
class ProvenanceConfig:
|
|
11
11
|
base_iri: str = "http://example.org/"
|
|
12
12
|
prov_dir: str = "prov"
|
|
13
|
+
prov_path: str | None = None
|
|
13
14
|
force: bool = False
|
|
14
15
|
dry_run: bool = False
|
|
15
16
|
out_fmt: ProvFormat = "json"
|
|
17
|
+
jsonld_with_context: bool = False
|
|
16
18
|
|
|
17
19
|
GLOBAL_CONFIG = ProvenanceConfig()
|
|
18
20
|
|
|
19
21
|
def main(subcommands=None, conf_obj=None, parsers=None):
|
|
20
|
-
from .core import COMMANDS
|
|
22
|
+
from .core import COMMANDS, flush_prov_buffer, start_prov_buffer
|
|
21
23
|
|
|
22
24
|
subcommands = subcommands or COMMANDS
|
|
23
25
|
conf_obj = conf_obj or GLOBAL_CONFIG
|
|
@@ -53,11 +55,25 @@ def main(subcommands=None, conf_obj=None, parsers=None):
|
|
|
53
55
|
logging.debug(f"Setting config {p}")
|
|
54
56
|
conf(conf_obj, p)
|
|
55
57
|
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
argparse_kwargs={"parents": [parent]},
|
|
58
|
+
return ns
|
|
59
|
+
|
|
60
|
+
parent.add_argument(
|
|
61
|
+
"--merge-prov",
|
|
62
|
+
action="store_true",
|
|
63
|
+
help="Merge provenance from invoked commands into a single output",
|
|
63
64
|
)
|
|
65
|
+
|
|
66
|
+
ns = apply_globals(sys.argv[1:]) # apply effects early
|
|
67
|
+
logging.debug(f"Config: {conf_obj}")
|
|
68
|
+
try:
|
|
69
|
+
if ns.merge_prov:
|
|
70
|
+
start_prov_buffer()
|
|
71
|
+
defopt.run(
|
|
72
|
+
subcommands,
|
|
73
|
+
parsers=parsers or {},
|
|
74
|
+
argv=sys.argv[1:],
|
|
75
|
+
argparse_kwargs={"parents": [parent]},
|
|
76
|
+
)
|
|
77
|
+
finally:
|
|
78
|
+
if ns.merge_prov:
|
|
79
|
+
flush_prov_buffer()
|
|
@@ -10,7 +10,7 @@ from collections.abc import Callable
|
|
|
10
10
|
|
|
11
11
|
from .config import ProvenanceConfig, ProvFormat, GLOBAL_CONFIG
|
|
12
12
|
from .paths import InPath, OutPath
|
|
13
|
-
from .prov import Prov
|
|
13
|
+
from .prov import Prov, ProvResult, write_combined_prov
|
|
14
14
|
|
|
15
15
|
try:
|
|
16
16
|
import rdflib # optional
|
|
@@ -21,6 +21,29 @@ except Exception:
|
|
|
21
21
|
RULES: dict[str, dict[str, Any]] = {}
|
|
22
22
|
COMMANDS: set[Callable] = set()
|
|
23
23
|
|
|
24
|
+
|
|
25
|
+
PROV_BUFFER: list[ProvResult] | None = None
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def start_prov_buffer() -> None:
    """Switch provenance handling into buffered mode.

    Rebinds the module-level ``PROV_BUFFER`` to a new empty list, discarding
    whatever it referenced before. While the buffer is a list, the rule
    wrapper appends ``ProvResult`` entries to it instead of writing one
    provenance file per rule.
    """
    global PROV_BUFFER
    fresh_buffer: list[ProvResult] = []
    PROV_BUFFER = fresh_buffer
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def flush_prov_buffer() -> None:
    """Write any buffered provenance as one combined file, then stop buffering.

    Nothing is written when the buffer is ``None`` or empty. The destination
    is ``GLOBAL_CONFIG.prov_path`` when that is truthy, otherwise
    ``<GLOBAL_CONFIG.prov_dir>/combined``; format and JSON-LD context options
    are also taken from ``GLOBAL_CONFIG``. ``PROV_BUFFER`` is reset to
    ``None`` even if writing raises.
    """
    global PROV_BUFFER
    try:
        buffered = PROV_BUFFER
        if buffered:
            destination = GLOBAL_CONFIG.prov_path
            if not destination:
                destination = Path(GLOBAL_CONFIG.prov_dir) / "combined"
            write_combined_prov(
                buffered,
                prov_path=destination,
                fmt=GLOBAL_CONFIG.out_fmt,
                jsonld_with_context=GLOBAL_CONFIG.jsonld_with_context,
            )
    finally:
        # Always leave buffered mode so later rules write their own files again.
        PROV_BUFFER = None
|
|
46
|
+
|
|
24
47
|
def needs_update(outputs, deps) -> bool:
|
|
25
48
|
"""Return True if any output missing or older than any dependency."""
|
|
26
49
|
out_paths = [Path(o) for o in outputs]
|
|
@@ -43,6 +66,7 @@ def build(target, _seen=None):
|
|
|
43
66
|
Recursively build target after its dependencies, if needed.
|
|
44
67
|
`target` is a path (string/Path). Only rules with default OutPath are in DAG.
|
|
45
68
|
"""
|
|
69
|
+
top_level = _seen is None
|
|
46
70
|
if _seen is None:
|
|
47
71
|
_seen = set()
|
|
48
72
|
target = str(target)
|
|
@@ -50,12 +74,18 @@ def build(target, _seen=None):
|
|
|
50
74
|
raise RuntimeError(f"Cycle in build graph at {target!r}")
|
|
51
75
|
_seen.add(target)
|
|
52
76
|
|
|
77
|
+
if top_level:
|
|
78
|
+
start_prov_buffer()
|
|
79
|
+
|
|
53
80
|
rule = RULES[target]
|
|
54
81
|
for dep in rule["deps"]:
|
|
55
82
|
if dep in RULES:
|
|
56
83
|
build(dep, _seen)
|
|
57
84
|
rule["func"]()
|
|
58
85
|
|
|
86
|
+
if top_level:
|
|
87
|
+
flush_prov_buffer()
|
|
88
|
+
|
|
59
89
|
def _is_kind_annotation(ann: Any, cls: type) -> bool:
|
|
60
90
|
if ann is cls:
|
|
61
91
|
return True
|
|
@@ -74,7 +104,7 @@ def rule(
|
|
|
74
104
|
dry_run: bool | None = None,
|
|
75
105
|
out_fmt: ProvFormat | None = None,
|
|
76
106
|
config: ProvenanceConfig | None = None,
|
|
77
|
-
jsonld_with_context: bool =
|
|
107
|
+
jsonld_with_context: bool | None = None,
|
|
78
108
|
):
|
|
79
109
|
"""
|
|
80
110
|
Decorator that infers inputs/outputs from type annotations
|
|
@@ -84,9 +114,11 @@ def rule(
|
|
|
84
114
|
rule_config = ProvenanceConfig(
|
|
85
115
|
base_iri=base_iri if base_iri is not None else base_config.base_iri,
|
|
86
116
|
prov_dir=prov_dir if prov_dir is not None else base_config.prov_dir,
|
|
117
|
+
prov_path=base_config.prov_path,
|
|
87
118
|
force=force if force is not None else base_config.force,
|
|
88
119
|
dry_run=dry_run if dry_run is not None else base_config.dry_run,
|
|
89
120
|
out_fmt=out_fmt if out_fmt is not None else base_config.out_fmt,
|
|
121
|
+
jsonld_with_context=base_config.jsonld_with_context,
|
|
90
122
|
)
|
|
91
123
|
|
|
92
124
|
def decorator(func):
|
|
@@ -136,6 +168,12 @@ def rule(
|
|
|
136
168
|
bound = sig.bind_partial(*args, **kwargs)
|
|
137
169
|
bound.apply_defaults()
|
|
138
170
|
|
|
171
|
+
effective_jsonld_with_context = (
|
|
172
|
+
jsonld_with_context
|
|
173
|
+
if jsonld_with_context is not None
|
|
174
|
+
else rule_config.jsonld_with_context
|
|
175
|
+
)
|
|
176
|
+
|
|
139
177
|
in_files: list[Path] = []
|
|
140
178
|
out_files: list[Path] = []
|
|
141
179
|
|
|
@@ -199,14 +237,20 @@ def rule(
|
|
|
199
237
|
)
|
|
200
238
|
if prov_path is not None:
|
|
201
239
|
rule_prov_path = prov_path
|
|
240
|
+
elif rule_config.prov_path is not None:
|
|
241
|
+
rule_prov_path = rule_config.prov_path
|
|
202
242
|
else:
|
|
203
243
|
rule_prov_path = Path(rule_config.prov_dir) / logical_name
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
244
|
+
|
|
245
|
+
if PROV_BUFFER is not None:
|
|
246
|
+
PROV_BUFFER.append(ProvResult(prov, result))
|
|
247
|
+
else:
|
|
248
|
+
prov.write(
|
|
249
|
+
rule_prov_path,
|
|
250
|
+
fmt=rule_config.out_fmt,
|
|
251
|
+
result=result,
|
|
252
|
+
jsonld_with_context=effective_jsonld_with_context,
|
|
253
|
+
)
|
|
210
254
|
except Exception as prov_exc: # noqa: BLE001
|
|
211
255
|
logging.warning("Failed to write provenance for %s: %s", logical_name, prov_exc)
|
|
212
256
|
|
|
@@ -221,4 +265,4 @@ def rule(
|
|
|
221
265
|
|
|
222
266
|
return wrapped
|
|
223
267
|
|
|
224
|
-
return decorator
|
|
268
|
+
return decorator
|
|
@@ -1,21 +1,24 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
import os
|
|
3
4
|
import sys
|
|
4
5
|
from pathlib import Path
|
|
5
6
|
|
|
6
7
|
# Platform-appropriate base class for Path subclassing
|
|
7
8
|
_BasePath = type(Path())
|
|
8
9
|
|
|
10
|
+
|
|
9
11
|
class ProvPath(_BasePath):
|
|
10
12
|
"""
|
|
11
13
|
A Path subclass that understands '-' as a special stream path.
|
|
12
14
|
For subclasses InPath and OutPath, '-' maps to stdin/stdout, respectively.
|
|
13
15
|
"""
|
|
14
16
|
|
|
15
|
-
def __new__(cls,
|
|
16
|
-
|
|
17
|
+
def __new__(cls, *paths: str | bytes | "ProvPath"):
    """Build the path and record whether it is the special '-' stream path.

    The instance is flagged as a stream only when exactly one segment was
    given and its ``os.fspath`` value equals ``"-"``. ``_stream_name`` starts
    as ``None``.
    """
    segments = [os.fspath(part) for part in paths]
    instance = super().__new__(cls, *paths)
    # We store stream flags on the instance. Path is immutable, but allows attributes.
    only_dash = len(segments) == 1 and segments[0] == "-"
    instance._is_stream = only_dash
    instance._stream_name = None
    return instance
|
|
21
24
|
|
|
@@ -47,8 +50,8 @@ class ProvPath(_BasePath):
|
|
|
47
50
|
|
|
48
51
|
class InPath(ProvPath):
|
|
49
52
|
"""Marker for input paths. '-' means stdin."""
|
|
50
|
-
def __new__(cls,
|
|
51
|
-
self = super().__new__(cls,
|
|
53
|
+
def __new__(cls, *paths: str | bytes | ProvPath):
    """Create the input path; a stream path ('-') is labelled "stdin"."""
    instance = super().__new__(cls, *paths)
    if instance.is_stream:
        instance._stream_name = "stdin"
    return instance
|
|
@@ -61,8 +64,8 @@ class InPath(ProvPath):
|
|
|
61
64
|
|
|
62
65
|
class OutPath(ProvPath):
|
|
63
66
|
"""Marker for output paths. '-' means stdout."""
|
|
64
|
-
def __new__(cls,
|
|
65
|
-
self = super().__new__(cls,
|
|
67
|
+
def __new__(cls, *paths: str | bytes | ProvPath):
    """Create the output path; a stream path ('-') is labelled "stdout"."""
    instance = super().__new__(cls, *paths)
    if instance.is_stream:
        instance._stream_name = "stdout"
    return instance
|
|
@@ -93,6 +93,12 @@ class ProvDoc(JSONLDMixin):
|
|
|
93
93
|
provenance: list[Any] = field(default_factory=list)
|
|
94
94
|
__context__ = COMMON_CONTEXT
|
|
95
95
|
|
|
96
|
+
|
|
97
|
+
@dataclass
class ProvResult:
    """A provenance record paired with the value its rule produced."""

    # Provenance object; the combined writer calls ``prov.to_doc()`` and reads
    # ``prov.base_iri`` / ``prov.graph_id`` from it.
    prov: "Prov"
    # Value returned by the rule. The combined writer embeds it only when it is
    # a JSONLDMixin (JSON output) or an rdflib Graph/Dataset (TriG output).
    result: Any | None = None
|
|
101
|
+
|
|
96
102
|
# ---------- helpers ----------
|
|
97
103
|
|
|
98
104
|
def _safe_cmd(argv: list[str]) -> str | None:
|
|
@@ -275,7 +281,7 @@ class Prov:
|
|
|
275
281
|
norm = pep503_normalize(pkg_name)
|
|
276
282
|
dep_iri = f"https://pypi.org/project/{norm}/"
|
|
277
283
|
reqs.append(DepNode(id=dep_iri, type="rdfs:Resource", label=spec_str))
|
|
278
|
-
|
|
284
|
+
self.env_node = EnvNode(
|
|
279
285
|
id=env_id,
|
|
280
286
|
type=["prov:Entity", "prov:Collection"],
|
|
281
287
|
label="Python environment",
|
|
@@ -288,44 +294,110 @@ class Prov:
|
|
|
288
294
|
self.activity.used = []
|
|
289
295
|
self.activity.used.append(env_id)
|
|
290
296
|
|
|
297
|
+
def to_doc(self, *, include_graph_meta: bool = False) -> ProvDoc:
    """Assemble this record's nodes into a ``ProvDoc``.

    Node order is: activity, agent, every output node, then the environment
    node when one is set. ``self.graph_meta`` is appended last only when
    ``include_graph_meta`` is true.
    """
    nodes: list[Any] = [self.activity, self.agent]
    nodes.extend(self.output_nodes)
    if self.env_node:
        nodes.append(self.env_node)
    if include_graph_meta:
        nodes.append(self.graph_meta)
    return ProvDoc(provenance=nodes)
|
|
309
|
+
|
|
310
|
+
def to_dataset(self, result=None):
    """Build an rdflib ``Dataset`` holding this provenance record.

    The provenance document (including graph metadata) is added to the
    dataset's default graph. When ``result`` is an rdflib ``Graph`` or
    ``Dataset``, its statements are copied into the named graph
    ``self.graph_id``; any other ``result`` is ignored.

    Returns:
        The populated ``rdflib.Dataset``.
    """
    import rdflib

    ds = rdflib.Dataset()
    ds.bind("", self.base_iri)
    default_graph = ds.default_context

    for triple in self.to_doc(include_graph_meta=True).to_graph():
        default_graph.add(triple)

    if isinstance(result, (rdflib.Graph, rdflib.Dataset)):
        gx = ds.get_context(self.graph_id)
        for statement in result:
            # Iterating an rdflib Dataset can yield (s, p, o, graph) quads,
            # which Graph.add() cannot unpack. Keep only the triple part so
            # both Graph and Dataset results are copied.
            s, p, o = statement[:3]
            gx.add((s, p, o))

    return ds
|
|
291
326
|
|
|
292
327
|
def write(self, prov_path: str | Path, result=None, fmt="json", jsonld_with_context=False) -> Path:
|
|
293
328
|
out = Path(prov_path)
|
|
294
329
|
out.parent.mkdir(parents=True, exist_ok=True)
|
|
295
|
-
# Assemble document
|
|
296
|
-
doc = ProvDoc(
|
|
297
|
-
provenance=[
|
|
298
|
-
self.activity,
|
|
299
|
-
self.agent,
|
|
300
|
-
*self.output_nodes,
|
|
301
|
-
*([self.env_node] if self.env_node else [])
|
|
302
|
-
]
|
|
303
|
-
)
|
|
304
330
|
if fmt == "json":
|
|
305
|
-
data =
|
|
331
|
+
data = self.to_doc().to_jsonld(with_context=jsonld_with_context)
|
|
332
|
+
if result is not None and isinstance(result, JSONLDMixin):
|
|
333
|
+
data["result"] = [result.to_jsonld(with_context=jsonld_with_context)]
|
|
306
334
|
final = out.with_suffix(".json")
|
|
307
335
|
logging.info("Writing JSON-LD provenance %s", final)
|
|
308
336
|
final.write_text(json.dumps(data, indent=2), encoding="utf-8")
|
|
309
337
|
return final
|
|
310
338
|
elif fmt == "trig":
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
ds = rdflib.Dataset()
|
|
314
|
-
ds.bind("", self.base_iri)
|
|
315
|
-
D = ds.default_context
|
|
316
|
-
doc.provenance.append(self.graph_meta)
|
|
317
|
-
for triple in doc.to_graph():
|
|
318
|
-
D.add(triple)
|
|
319
|
-
|
|
320
|
-
if result is not None:
|
|
321
|
-
if isinstance(result, (rdflib.Graph, rdflib.Dataset)):
|
|
322
|
-
gx = ds.get_context(self.graph_id)
|
|
323
|
-
for triple in result:
|
|
324
|
-
gx.add(triple)
|
|
339
|
+
ds = self.to_dataset(result=result)
|
|
325
340
|
|
|
326
341
|
final = out.with_suffix(".trig")
|
|
327
342
|
logging.info("Writing TRIG provenance %s", final)
|
|
328
343
|
ds.serialize(final, format="trig")
|
|
344
|
+
return final
|
|
329
345
|
|
|
330
346
|
else:
|
|
331
|
-
raise Exception(f"No handler to write Prov object in format '{fmt}'")
|
|
347
|
+
raise Exception(f"No handler to write Prov object in format '{fmt}'")
|
|
348
|
+
|
|
349
|
+
|
|
350
|
+
def write_combined_prov(
    provs: list[ProvResult],
    prov_path: str | Path,
    fmt: str = "json",
    jsonld_with_context: bool = False,
) -> Path:
    """Write several provenance records into one combined file.

    Args:
        provs: Buffered records to merge; must be non-empty.
        prov_path: Output location; its suffix is replaced by ``.json`` or
            ``.trig`` depending on ``fmt``. The parent directory is created.
        fmt: ``"json"`` for JSON-LD or ``"trig"`` for TriG.
        jsonld_with_context: Forwarded to ``to_jsonld`` for JSON output.

    Returns:
        The path of the file that was written.

    Raises:
        ValueError: if ``provs`` is empty.
        Exception: if ``fmt`` is neither ``"json"`` nor ``"trig"``.
    """
    if not provs:
        raise ValueError("No provenance objects provided for combination")

    out = Path(prov_path)
    out.parent.mkdir(parents=True, exist_ok=True)

    if fmt == "json":
        combined_doc = ProvDoc(provenance=[])
        for prov_result in provs:
            combined_doc.provenance.extend(prov_result.prov.to_doc().provenance)

        data = combined_doc.to_jsonld(with_context=jsonld_with_context)
        # Only results that know how to serialize themselves are embedded.
        data["result"] = [
            prov_result.result.to_jsonld(with_context=jsonld_with_context)
            for prov_result in provs
            if isinstance(prov_result.result, JSONLDMixin)
        ]

        final = out.with_suffix(".json")
        logging.info("Writing combined JSON-LD provenance %s", final)
        final.write_text(json.dumps(data, indent=2), encoding="utf-8")
        return final

    if fmt == "trig":
        import rdflib

        ds = rdflib.Dataset()
        # The default graph is the same for every record; look it up once.
        default_graph = ds.default_context

        for prov_result in provs:
            ds.bind("", prov_result.prov.base_iri)

            for triple in prov_result.prov.to_doc(include_graph_meta=True).to_graph():
                default_graph.add(triple)

            if isinstance(prov_result.result, (rdflib.Graph, rdflib.Dataset)):
                gx = ds.get_context(prov_result.prov.graph_id)
                for statement in prov_result.result:
                    # Iterating an rdflib Dataset can yield (s, p, o, graph)
                    # quads, which Graph.add() cannot unpack. Keep only the
                    # triple part so Graph and Dataset results both work.
                    s, p, o = statement[:3]
                    gx.add((s, p, o))

        final = out.with_suffix(".trig")
        logging.info("Writing combined TRIG provenance %s", final)
        ds.serialize(final, format="trig")
        return final

    raise Exception(f"No handler to write combined Prov objects in format '{fmt}'")
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import sys
|
|
3
|
+
import tempfile
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from rdflib import Graph, Literal, Namespace
|
|
7
|
+
from rdflib.namespace import RDF, XSD
|
|
8
|
+
|
|
9
|
+
from makeprov import InPath, OutPath, ProvenanceConfig, build, main, rule
|
|
10
|
+
|
|
11
|
+
@rule(name="test_process_data")
def process_data(input_file: InPath, output_file: OutPath):
    """Copy the full text of ``input_file`` into ``output_file`` unchanged."""
    with input_file.open('r') as source, output_file.open('w') as sink:
        sink.write(source.read())
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# RDF namespace for the sales test vocabulary and the region subjects.
SALES_NS = Namespace("http://example.org/test/")
# Fresh temporary directory created at import time; nothing visible here removes it.
TEST_PROV_DIR = Path(tempfile.mkdtemp(prefix="makeprov-tests-"))
# Rule configuration whose provenance directory is TEST_PROV_DIR.
TEST_PROV_CONFIG = ProvenanceConfig(prov_dir=str(TEST_PROV_DIR))
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@rule(name="test_totals_graph", config=TEST_PROV_CONFIG)
def totals_graph(input_csv: InPath, graph_out: OutPath) -> Graph:
    """Turn a region-totals CSV into an RDF graph and save it as Turtle.

    The first CSV line is treated as a header and skipped. Every other line
    must have exactly three comma-separated fields: region, units, revenue.
    The graph is written to ``graph_out`` and also returned.
    """
    totals = Graph()
    totals.bind("sales", SALES_NS)

    with input_csv.open('r') as handle:
        data_rows = handle.read().strip().splitlines()[1:]

    for row in data_rows:
        region, units, revenue = row.split(',')
        subject = SALES_NS[f"region/{region.lower()}"]
        statements = (
            (RDF.type, SALES_NS.RegionTotal),
            (SALES_NS.regionName, Literal(region)),
            (SALES_NS.totalUnits, Literal(units, datatype=XSD.integer)),
            (SALES_NS.totalRevenue, Literal(revenue, datatype=XSD.decimal)),
        )
        for predicate, value in statements:
            totals.add((subject, predicate, value))

    turtle_text = totals.serialize(format='turtle')
    with graph_out.open('w') as handle:
        handle.write(turtle_text)

    return totals
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def test_process_data(tmp_path):
    """The copy rule must create the output file with the input's exact text."""
    input_file = tmp_path / "input.txt"
    output_file = tmp_path / "output.txt"

    input_file.write_text("Hello, world!")

    # Run the rule; its return value is not part of what this test checks.
    process_data(InPath(str(input_file)), OutPath(str(output_file)))

    # Check that the output file was created and contains the correct data
    assert output_file.exists()
    assert output_file.read_text() == "Hello, world!"
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def test_rule_returns_graph(tmp_path):
    """A rule returning a Graph must return it, write Turtle, and leave provenance."""
    input_csv = tmp_path / "region_totals.csv"
    graph_ttl = tmp_path / "region_totals.ttl"
    input_csv.write_text("region,total_units,total_revenue\nNorth,6,119.94\n")

    result = totals_graph(InPath(str(input_csv)), OutPath(str(graph_ttl)))

    assert isinstance(result, Graph)
    assert graph_ttl.exists()
    assert "North" in graph_ttl.read_text()

    # Glob once and report the directory on failure instead of printing on every run.
    prov_files = list(TEST_PROV_DIR.glob('*'))
    assert prov_files, f"no provenance files written to {TEST_PROV_DIR}"
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def test_build_combines_provenance(tmp_path, monkeypatch):
    """Building a two-step chain must yield one provenance file with two activities."""
    prov_dir = tmp_path / "prov"
    config = ProvenanceConfig(prov_dir=str(prov_dir))

    @rule(name="combine_step_one", config=config)
    def step_one(
        source: InPath = InPath("combine-source.txt"),
        mid: OutPath = OutPath("combine-mid.txt"),
    ):
        with source.open("r") as src, mid.open("w") as dst:
            dst.write(src.read() + " step1")

    @rule(name="combine_step_two", config=config)
    def step_two(
        mid: InPath = InPath("combine-mid.txt"),
        final: OutPath = OutPath("combine-final.txt"),
    ):
        with mid.open("r") as src, final.open("w") as dst:
            dst.write(src.read() + " step2")

    def is_activity(node):
        # "type" may be a single string or a list of type names.
        kind = node.get("type")
        if isinstance(kind, list):
            return "prov:Activity" in kind
        return kind == "prov:Activity"

    # The rules use relative default paths, so run from the temporary directory.
    monkeypatch.chdir(tmp_path)
    (tmp_path / "combine-source.txt").write_text("data")

    build("combine-final.txt")

    built = tmp_path / "combine-final.txt"
    assert built.exists()
    assert built.read_text() == "data step1 step2"

    written = list(prov_dir.glob("*"))
    assert len(written) == 1

    document = json.loads(written[0].read_text())
    activities = list(filter(is_activity, document["provenance"]))

    assert len(activities) == 2
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def test_cli_merge_prov(tmp_path, monkeypatch):
    """``--merge-prov`` must merge both steps' provenance into a single file."""
    prov_dir = tmp_path / "prov"
    intermediate = tmp_path / "cli-mid.txt"
    final_path = tmp_path / "cli-final.txt"
    config = ProvenanceConfig(prov_dir=str(prov_dir))

    @rule(name="cli_merge_one", config=config)
    def step_one(mid: OutPath = OutPath(intermediate)):
        with mid.open("w") as dst:
            dst.write("stage1")

    @rule(name="cli_merge_two", config=config)
    def step_two(mid: InPath = InPath(intermediate), final: OutPath = OutPath(final_path)):
        with mid.open("r") as src, final.open("w") as dst:
            dst.write(src.read() + " stage2")

    def run_pipeline():
        step_one()
        step_two()

    def is_activity(node):
        # "type" may be a single string or a list of type names.
        kind = node.get("type")
        if isinstance(kind, list):
            return "prov:Activity" in kind
        return kind == "prov:Activity"

    monkeypatch.chdir(tmp_path)
    monkeypatch.setattr(sys, "argv", ["prog", "--merge-prov", "run-pipeline"])

    main(subcommands=[run_pipeline])

    assert final_path.exists()
    assert final_path.read_text() == "stage1 stage2"

    written = list(prov_dir.glob("*"))
    assert len(written) == 1

    document = json.loads(written[0].read_text())
    activities = list(filter(is_activity, document["provenance"]))

    assert len(activities) == 2
|
|
@@ -1,67 +0,0 @@
|
|
|
1
|
-
import tempfile
|
|
2
|
-
from pathlib import Path
|
|
3
|
-
|
|
4
|
-
from rdflib import Graph, Literal, Namespace
|
|
5
|
-
from rdflib.namespace import RDF, XSD
|
|
6
|
-
|
|
7
|
-
from makeprov import InPath, OutPath, ProvenanceConfig, rule
|
|
8
|
-
|
|
9
|
-
@rule(name="test_process_data")
|
|
10
|
-
def process_data(input_file: InPath, output_file: OutPath):
|
|
11
|
-
with input_file.open('r') as infile, output_file.open('w') as outfile:
|
|
12
|
-
data = infile.read()
|
|
13
|
-
outfile.write(data)
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
SALES_NS = Namespace("http://example.org/test/")
|
|
17
|
-
TEST_PROV_DIR = Path(tempfile.mkdtemp(prefix="makeprov-tests-"))
|
|
18
|
-
TEST_PROV_CONFIG = ProvenanceConfig(prov_dir=str(TEST_PROV_DIR))
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
@rule(name="test_totals_graph", config=TEST_PROV_CONFIG)
|
|
22
|
-
def totals_graph(input_csv: InPath, graph_out: OutPath) -> Graph:
|
|
23
|
-
graph = Graph()
|
|
24
|
-
graph.bind("sales", SALES_NS)
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
with input_csv.open('r') as handle:
|
|
28
|
-
for line in handle.read().strip().splitlines()[1:]:
|
|
29
|
-
region, units, revenue = line.split(',')
|
|
30
|
-
subject = SALES_NS[f"region/{region.lower()}"]
|
|
31
|
-
graph.add((subject, RDF.type, SALES_NS.RegionTotal))
|
|
32
|
-
graph.add((subject, SALES_NS.regionName, Literal(region)))
|
|
33
|
-
graph.add((subject, SALES_NS.totalUnits, Literal(units, datatype=XSD.integer)))
|
|
34
|
-
graph.add((subject, SALES_NS.totalRevenue, Literal(revenue, datatype=XSD.decimal)))
|
|
35
|
-
|
|
36
|
-
with graph_out.open('w') as handle:
|
|
37
|
-
handle.write(graph.serialize(format='turtle'))
|
|
38
|
-
|
|
39
|
-
return graph
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
def test_process_data(tmp_path):
|
|
43
|
-
input_file = tmp_path / "input.txt"
|
|
44
|
-
output_file = tmp_path / "output.txt"
|
|
45
|
-
|
|
46
|
-
input_file.write_text("Hello, world!")
|
|
47
|
-
|
|
48
|
-
# Run the process_data function
|
|
49
|
-
result = process_data(InPath(str(input_file)), OutPath(str(output_file)))
|
|
50
|
-
|
|
51
|
-
# Check that the output file was created and contains the correct data
|
|
52
|
-
assert output_file.exists()
|
|
53
|
-
assert output_file.read_text() == "Hello, world!"
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
def test_rule_returns_graph(tmp_path):
|
|
57
|
-
input_csv = tmp_path / "region_totals.csv"
|
|
58
|
-
graph_ttl = tmp_path / "region_totals.ttl"
|
|
59
|
-
input_csv.write_text("region,total_units,total_revenue\nNorth,6,119.94\n")
|
|
60
|
-
|
|
61
|
-
result = totals_graph(InPath(str(input_csv)), OutPath(str(graph_ttl)))
|
|
62
|
-
|
|
63
|
-
assert isinstance(result, Graph)
|
|
64
|
-
assert graph_ttl.exists()
|
|
65
|
-
assert "North" in graph_ttl.read_text()
|
|
66
|
-
print(*TEST_PROV_DIR.glob('*'))
|
|
67
|
-
assert list(TEST_PROV_DIR.glob('*'))
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|