PyPI - makeprov - Versions diffs - 0.2.0__tar.gz → 0.2.2__tar.gz - Mend

makeprov 0.2.0.tar.gz → 0.2.2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18)

{makeprov-0.2.0 → makeprov-0.2.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: makeprov
-Version: 0.2.0
+Version: 0.2.2
 Summary: An provenance tracking library for simple Python workflows
 Author-email: Benno Kruit <b.b.kruit@amsterdamumc.nl>
 License: MIT

{makeprov-0.2.0 → makeprov-0.2.2}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "makeprov"
-version = "0.2.0"
+version = "0.2.2"
 description = "An provenance tracking library for simple Python workflows"
 readme = "README.md"
 license = { text = "MIT" }

{makeprov-0.2.0 → makeprov-0.2.2}/src/makeprov/config.py RENAMED Viewed

@@ -10,14 +10,16 @@ ProvFormat = Literal["json", "trig"]
 class ProvenanceConfig:
     base_iri: str = "http://example.org/"
     prov_dir: str = "prov"
+    prov_path: str | None = None
     force: bool = False
     dry_run: bool = False
     out_fmt: ProvFormat = "json"
+    jsonld_with_context: bool = False
 GLOBAL_CONFIG = ProvenanceConfig()
 def main(subcommands=None, conf_obj=None, parsers=None):
-    from .core import COMMANDS
+    from .core import COMMANDS, flush_prov_buffer, start_prov_buffer
     subcommands = subcommands or COMMANDS
     conf_obj = conf_obj or GLOBAL_CONFIG
@@ -53,11 +55,25 @@ def main(subcommands=None, conf_obj=None, parsers=None):
             logging.debug(f"Setting config {p}")
             conf(conf_obj, p)
-    apply_globals(sys.argv[1:])  # apply effects early
-    logging.debug(f"Config: {conf_obj}")
-    defopt.run(
-        subcommands,
-        parsers=parsers or {},
-        argv=sys.argv[1:],
-        argparse_kwargs={"parents": [parent]},
+        return ns
+    parent.add_argument(
+        "--merge-prov",
+        action="store_true",
+        help="Merge provenance from invoked commands into a single output",
     )
+    ns = apply_globals(sys.argv[1:])  # apply effects early
+    logging.debug(f"Config: {conf_obj}")
+    try:
+        if ns.merge_prov:
+            start_prov_buffer()
+        defopt.run(
+            subcommands,
+            parsers=parsers or {},
+            argv=sys.argv[1:],
+            argparse_kwargs={"parents": [parent]},
+        )
+    finally:
+        if ns.merge_prov:
+            flush_prov_buffer()

{makeprov-0.2.0 → makeprov-0.2.2}/src/makeprov/core.py RENAMED Viewed

@@ -10,7 +10,7 @@ from collections.abc import Callable
 from .config import ProvenanceConfig, ProvFormat, GLOBAL_CONFIG
 from .paths import InPath, OutPath
-from .prov import Prov
+from .prov import Prov, ProvResult, write_combined_prov
 try:
     import rdflib  # optional
@@ -21,6 +21,29 @@ except Exception:
 RULES: dict[str, dict[str, Any]] = {}
 COMMANDS: set[Callable] = set()
+PROV_BUFFER: list[ProvResult] | None = None
+def start_prov_buffer() -> None:
+    global PROV_BUFFER
+    PROV_BUFFER = []
+def flush_prov_buffer() -> None:
+    global PROV_BUFFER
+    try:
+        if PROV_BUFFER:
+            write_combined_prov(
+                PROV_BUFFER,
+                prov_path=GLOBAL_CONFIG.prov_path or Path(GLOBAL_CONFIG.prov_dir)
+                / "combined",
+                fmt=GLOBAL_CONFIG.out_fmt,
+                jsonld_with_context=GLOBAL_CONFIG.jsonld_with_context,
+            )
+    finally:
+        PROV_BUFFER = None
 def needs_update(outputs, deps) -> bool:
     """Return True if any output missing or older than any dependency."""
     out_paths = [Path(o) for o in outputs]
@@ -43,6 +66,7 @@ def build(target, _seen=None):
     Recursively build target after its dependencies, if needed.
     `target` is a path (string/Path). Only rules with default OutPath are in DAG.
     """
+    top_level = _seen is None
     if _seen is None:
         _seen = set()
     target = str(target)
@@ -50,12 +74,18 @@ def build(target, _seen=None):
         raise RuntimeError(f"Cycle in build graph at {target!r}")
     _seen.add(target)
+    if top_level:
+        start_prov_buffer()
     rule = RULES[target]
     for dep in rule["deps"]:
         if dep in RULES:
             build(dep, _seen)
     rule["func"]()
+    if top_level:
+        flush_prov_buffer()
 def _is_kind_annotation(ann: Any, cls: type) -> bool:
     if ann is cls:
         return True
@@ -74,7 +104,7 @@ def rule(
     dry_run: bool | None = None,
     out_fmt: ProvFormat | None = None,
     config: ProvenanceConfig | None = None,
-    jsonld_with_context: bool = False,
+    jsonld_with_context: bool | None = None,
 ):
     """
     Decorator that infers inputs/outputs from type annotations
@@ -84,9 +114,11 @@ def rule(
     rule_config = ProvenanceConfig(
         base_iri=base_iri if base_iri is not None else base_config.base_iri,
         prov_dir=prov_dir if prov_dir is not None else base_config.prov_dir,
+        prov_path=base_config.prov_path,
         force=force if force is not None else base_config.force,
         dry_run=dry_run if dry_run is not None else base_config.dry_run,
         out_fmt=out_fmt if out_fmt is not None else base_config.out_fmt,
+        jsonld_with_context=base_config.jsonld_with_context,
     )
     def decorator(func):
@@ -136,6 +168,12 @@ def rule(
             bound = sig.bind_partial(*args, **kwargs)
             bound.apply_defaults()
+            effective_jsonld_with_context = (
+                jsonld_with_context
+                if jsonld_with_context is not None
+                else rule_config.jsonld_with_context
+            )
             in_files: list[Path] = []
             out_files: list[Path] = []
@@ -199,14 +237,20 @@ def rule(
                     )
                     if prov_path is not None:
                         rule_prov_path = prov_path
+                    elif rule_config.prov_path is not None:
+                        rule_prov_path = rule_config.prov_path
                     else:
                         rule_prov_path = Path(rule_config.prov_dir) / logical_name
-                    prov.write(
-                        rule_prov_path,
-                        fmt=rule_config.out_fmt,
-                        result=result,
-                        jsonld_with_context=jsonld_with_context
-                    )
+                    if PROV_BUFFER is not None:
+                        PROV_BUFFER.append(ProvResult(prov, result))
+                    else:
+                        prov.write(
+                            rule_prov_path,
+                            fmt=rule_config.out_fmt,
+                            result=result,
+                            jsonld_with_context=effective_jsonld_with_context,
+                        )
                 except Exception as prov_exc:  # noqa: BLE001
                     logging.warning("Failed to write provenance for %s: %s", logical_name, prov_exc)
@@ -221,4 +265,4 @@ def rule(
         return wrapped
-    return decorator
+    return decorator

{makeprov-0.2.0 → makeprov-0.2.2}/src/makeprov/paths.py RENAMED Viewed

@@ -1,21 +1,24 @@
 from __future__ import annotations
+import os
 import sys
 from pathlib import Path
 # Platform-appropriate base class for Path subclassing
 _BasePath = type(Path())
 class ProvPath(_BasePath):
     """
     A Path subclass that understands '-' as a special stream path.
     For subclasses InPath and OutPath, '-' maps to stdin/stdout, respectively.
     """
-    def __new__(cls, path: str | bytes | "ProvPath"):
-        self = super().__new__(cls, path)
+    def __new__(cls, *paths: str | bytes | "ProvPath"):
+        raw_paths = [os.fspath(p) for p in paths]
+        self = super().__new__(cls, *paths)
         # We store stream flags on the instance. Path is immutable, but allows attributes.
-        self._is_stream = str(self) == "-"
+        self._is_stream = len(raw_paths) == 1 and raw_paths[0] == "-"
         self._stream_name = None
         return self
@@ -47,8 +50,8 @@ class ProvPath(_BasePath):
 class InPath(ProvPath):
     """Marker for input paths. '-' means stdin."""
-    def __new__(cls, path: str | bytes | ProvPath):
-        self = super().__new__(cls, path)
+    def __new__(cls, *paths: str | bytes | ProvPath):
+        self = super().__new__(cls, *paths)
         if self.is_stream:
             self._stream_name = "stdin"
         return self
@@ -61,8 +64,8 @@ class InPath(ProvPath):
 class OutPath(ProvPath):
     """Marker for output paths. '-' means stdout."""
-    def __new__(cls, path: str | bytes | ProvPath):
-        self = super().__new__(cls, path)
+    def __new__(cls, *paths: str | bytes | ProvPath):
+        self = super().__new__(cls, *paths)
         if self.is_stream:
             self._stream_name = "stdout"
         return self

{makeprov-0.2.0 → makeprov-0.2.2}/src/makeprov/prov.py RENAMED Viewed

@@ -93,6 +93,12 @@ class ProvDoc(JSONLDMixin):
     provenance: list[Any] = field(default_factory=list)
     __context__ = COMMON_CONTEXT
+@dataclass
+class ProvResult:
+    prov: "Prov"
+    result: Any | None = None
 # ---------- helpers ----------
 def _safe_cmd(argv: list[str]) -> str | None:
@@ -275,7 +281,7 @@ class Prov:
                 norm = pep503_normalize(pkg_name)
                 dep_iri = f"https://pypi.org/project/{norm}/"
                 reqs.append(DepNode(id=dep_iri, type="rdfs:Resource", label=spec_str))
-                self.env_node = EnvNode(
+            self.env_node = EnvNode(
                 id=env_id,
                 type=["prov:Entity", "prov:Collection"],
                 label="Python environment",
@@ -288,44 +294,110 @@ class Prov:
                 self.activity.used = []
             self.activity.used.append(env_id)
+    def to_doc(self, *, include_graph_meta: bool = False) -> ProvDoc:
+        provenance: list[Any] = [
+            self.activity,
+            self.agent,
+            *self.output_nodes,
+            *([self.env_node] if self.env_node else []),
+        ]
+        if include_graph_meta:
+            provenance.append(self.graph_meta)
+        return ProvDoc(provenance=provenance)
+    def to_dataset(self, result=None):
+        import rdflib
+        ds = rdflib.Dataset()
+        ds.bind("", self.base_iri)
+        default_graph = ds.default_context
+        for triple in self.to_doc(include_graph_meta=True).to_graph():
+            default_graph.add(triple)
+        if result is not None and isinstance(result, (rdflib.Graph, rdflib.Dataset)):
+            gx = ds.get_context(self.graph_id)
+            for triple in result:
+                gx.add(triple)
+        return ds
     def write(self, prov_path: str | Path, result=None, fmt="json", jsonld_with_context=False) -> Path:
         out = Path(prov_path)
         out.parent.mkdir(parents=True, exist_ok=True)
-        # Assemble document
-        doc = ProvDoc(
-            provenance=[
-                self.activity,
-                self.agent,
-                *self.output_nodes,
-                *([self.env_node] if self.env_node else [])
-            ]
-        )
         if fmt == "json":
-            data = doc.to_jsonld(with_context=jsonld_with_context)
+            data = self.to_doc().to_jsonld(with_context=jsonld_with_context)
+            if result is not None and isinstance(result, JSONLDMixin):
+                data["result"] = [result.to_jsonld(with_context=jsonld_with_context)]
             final = out.with_suffix(".json")
             logging.info("Writing JSON-LD provenance %s", final)
             final.write_text(json.dumps(data, indent=2), encoding="utf-8")
             return final
         elif fmt == "trig":
-            import rdflib
-            ds = rdflib.Dataset()
-            ds.bind("", self.base_iri)
-            D = ds.default_context
-            doc.provenance.append(self.graph_meta)
-            for triple in doc.to_graph():
-                D.add(triple)
-            if result is not None:
-                if isinstance(result, (rdflib.Graph, rdflib.Dataset)):
-                    gx = ds.get_context(self.graph_id)
-                    for triple in result:
-                        gx.add(triple)
+            ds = self.to_dataset(result=result)
             final = out.with_suffix(".trig")
             logging.info("Writing TRIG provenance %s", final)
             ds.serialize(final, format="trig")
+            return final
         else:
-            raise Exception(f"No handler to write Prov object in format '{fmt}'")
+            raise Exception(f"No handler to write Prov object in format '{fmt}'")
+def write_combined_prov(
+    provs: list[ProvResult],
+    prov_path: str | Path,
+    fmt: str = "json",
+    jsonld_with_context: bool = False,
+):
+    if not provs:
+        raise ValueError("No provenance objects provided for combination")
+    out = Path(prov_path)
+    out.parent.mkdir(parents=True, exist_ok=True)
+    if fmt == "json":
+        combined_doc = ProvDoc(provenance=[])
+        for prov_result in provs:
+            combined_doc.provenance.extend(prov_result.prov.to_doc().provenance)
+        data = combined_doc.to_jsonld(with_context=jsonld_with_context)
+        data["result"] = []
+        for prov_result in provs:
+            if isinstance(prov_result.result, JSONLDMixin):
+                data["result"].append(
+                    prov_result.result.to_jsonld(with_context=jsonld_with_context)
+                )
+        final = out.with_suffix(".json")
+        logging.info("Writing combined JSON-LD provenance %s", final)
+        final.write_text(json.dumps(data, indent=2), encoding="utf-8")
+        return final
+    if fmt == "trig":
+        import rdflib
+        ds = rdflib.Dataset()
+        for prov_result in provs:
+            ds.bind("", prov_result.prov.base_iri)
+            default_graph = ds.default_context
+            for triple in prov_result.prov.to_doc(include_graph_meta=True).to_graph():
+                default_graph.add(triple)
+            if isinstance(prov_result.result, (rdflib.Graph, rdflib.Dataset)):
+                gx = ds.get_context(prov_result.prov.graph_id)
+                for triple in prov_result.result:
+                    gx.add(triple)
+        final = out.with_suffix(".trig")
+        logging.info("Writing combined TRIG provenance %s", final)
+        ds.serialize(final, format="trig")
+        return final
+    raise Exception(f"No handler to write combined Prov objects in format '{fmt}'")

{makeprov-0.2.0 → makeprov-0.2.2}/src/makeprov.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: makeprov
-Version: 0.2.0
+Version: 0.2.2
 Summary: An provenance tracking library for simple Python workflows
 Author-email: Benno Kruit <b.b.kruit@amsterdamumc.nl>
 License: MIT

makeprov-0.2.2/tests/test_makeprov.py ADDED Viewed

@@ -0,0 +1,160 @@
+import json
+import sys
+import tempfile
+from pathlib import Path
+from rdflib import Graph, Literal, Namespace
+from rdflib.namespace import RDF, XSD
+from makeprov import InPath, OutPath, ProvenanceConfig, build, main, rule
+@rule(name="test_process_data")
+def process_data(input_file: InPath, output_file: OutPath):
+    with input_file.open('r') as infile, output_file.open('w') as outfile:
+        data = infile.read()
+        outfile.write(data)
+SALES_NS = Namespace("http://example.org/test/")
+TEST_PROV_DIR = Path(tempfile.mkdtemp(prefix="makeprov-tests-"))
+TEST_PROV_CONFIG = ProvenanceConfig(prov_dir=str(TEST_PROV_DIR))
+@rule(name="test_totals_graph", config=TEST_PROV_CONFIG)
+def totals_graph(input_csv: InPath, graph_out: OutPath) -> Graph:
+    graph = Graph()
+    graph.bind("sales", SALES_NS)
+    with input_csv.open('r') as handle:
+        for line in handle.read().strip().splitlines()[1:]:
+            region, units, revenue = line.split(',')
+            subject = SALES_NS[f"region/{region.lower()}"]
+            graph.add((subject, RDF.type, SALES_NS.RegionTotal))
+            graph.add((subject, SALES_NS.regionName, Literal(region)))
+            graph.add((subject, SALES_NS.totalUnits, Literal(units, datatype=XSD.integer)))
+            graph.add((subject, SALES_NS.totalRevenue, Literal(revenue, datatype=XSD.decimal)))
+    with graph_out.open('w') as handle:
+        handle.write(graph.serialize(format='turtle'))
+    return graph
+def test_process_data(tmp_path):
+    input_file = tmp_path / "input.txt"
+    output_file = tmp_path / "output.txt"
+    input_file.write_text("Hello, world!")
+    # Run the process_data function
+    result = process_data(InPath(str(input_file)), OutPath(str(output_file)))
+    # Check that the output file was created and contains the correct data
+    assert output_file.exists()
+    assert output_file.read_text() == "Hello, world!"
+def test_rule_returns_graph(tmp_path):
+    input_csv = tmp_path / "region_totals.csv"
+    graph_ttl = tmp_path / "region_totals.ttl"
+    input_csv.write_text("region,total_units,total_revenue\nNorth,6,119.94\n")
+    result = totals_graph(InPath(str(input_csv)), OutPath(str(graph_ttl)))
+    assert isinstance(result, Graph)
+    assert graph_ttl.exists()
+    assert "North" in graph_ttl.read_text()
+    print(*TEST_PROV_DIR.glob('*'))
+    assert list(TEST_PROV_DIR.glob('*'))
+def test_build_combines_provenance(tmp_path, monkeypatch):
+    prov_dir = tmp_path / "prov"
+    config = ProvenanceConfig(prov_dir=str(prov_dir))
+    @rule(name="combine_step_one", config=config)
+    def step_one(
+        source: InPath = InPath("combine-source.txt"),
+        mid: OutPath = OutPath("combine-mid.txt"),
+    ):
+        with source.open("r") as src, mid.open("w") as dst:
+            dst.write(src.read() + " step1")
+    @rule(name="combine_step_two", config=config)
+    def step_two(
+        mid: InPath = InPath("combine-mid.txt"),
+        final: OutPath = OutPath("combine-final.txt"),
+    ):
+        with mid.open("r") as src, final.open("w") as dst:
+            dst.write(src.read() + " step2")
+    monkeypatch.chdir(tmp_path)
+    (tmp_path / "combine-source.txt").write_text("data")
+    build("combine-final.txt")
+    final_output = tmp_path / "combine-final.txt"
+    assert final_output.exists()
+    assert final_output.read_text() == "data step1 step2"
+    prov_files = list(prov_dir.glob("*"))
+    assert len(prov_files) == 1
+    prov_json = json.loads(prov_files[0].read_text())
+    activities = [
+        node
+        for node in prov_json["provenance"]
+        if node.get("type") == "prov:Activity"
+        or (
+            isinstance(node.get("type"), list)
+            and "prov:Activity" in node.get("type", [])
+        )
+    ]
+    assert len(activities) == 2
+def test_cli_merge_prov(tmp_path, monkeypatch):
+    prov_dir = tmp_path / "prov"
+    intermediate = tmp_path / "cli-mid.txt"
+    final = tmp_path / "cli-final.txt"
+    config = ProvenanceConfig(prov_dir=str(prov_dir))
+    @rule(name="cli_merge_one", config=config)
+    def step_one(mid: OutPath = OutPath(intermediate)):
+        with mid.open("w") as dst:
+            dst.write("stage1")
+    @rule(name="cli_merge_two", config=config)
+    def step_two(mid: InPath = InPath(intermediate), final: OutPath = OutPath(final)):
+        with mid.open("r") as src, final.open("w") as dst:
+            dst.write(src.read() + " stage2")
+    def run_pipeline():
+        step_one()
+        step_two()
+    monkeypatch.chdir(tmp_path)
+    monkeypatch.setattr(sys, "argv", ["prog", "--merge-prov", "run-pipeline"])
+    main(subcommands=[run_pipeline])
+    assert final.exists()
+    assert final.read_text() == "stage1 stage2"
+    prov_files = list(prov_dir.glob("*"))
+    assert len(prov_files) == 1
+    prov_json = json.loads(prov_files[0].read_text())
+    activities = [
+        node
+        for node in prov_json["provenance"]
+        if node.get("type") == "prov:Activity"
+        or (
+            isinstance(node.get("type"), list)
+            and "prov:Activity" in node.get("type", [])
+        )
+    ]
+    assert len(activities) == 2

makeprov-0.2.0/tests/test_makeprov.py DELETED Viewed

@@ -1,67 +0,0 @@
-import tempfile
-from pathlib import Path
-from rdflib import Graph, Literal, Namespace
-from rdflib.namespace import RDF, XSD
-from makeprov import InPath, OutPath, ProvenanceConfig, rule
-@rule(name="test_process_data")
-def process_data(input_file: InPath, output_file: OutPath):
-    with input_file.open('r') as infile, output_file.open('w') as outfile:
-        data = infile.read()
-        outfile.write(data)
-SALES_NS = Namespace("http://example.org/test/")
-TEST_PROV_DIR = Path(tempfile.mkdtemp(prefix="makeprov-tests-"))
-TEST_PROV_CONFIG = ProvenanceConfig(prov_dir=str(TEST_PROV_DIR))
-@rule(name="test_totals_graph", config=TEST_PROV_CONFIG)
-def totals_graph(input_csv: InPath, graph_out: OutPath) -> Graph:
-    graph = Graph()
-    graph.bind("sales", SALES_NS)
-    with input_csv.open('r') as handle:
-        for line in handle.read().strip().splitlines()[1:]:
-            region, units, revenue = line.split(',')
-            subject = SALES_NS[f"region/{region.lower()}"]
-            graph.add((subject, RDF.type, SALES_NS.RegionTotal))
-            graph.add((subject, SALES_NS.regionName, Literal(region)))
-            graph.add((subject, SALES_NS.totalUnits, Literal(units, datatype=XSD.integer)))
-            graph.add((subject, SALES_NS.totalRevenue, Literal(revenue, datatype=XSD.decimal)))
-    with graph_out.open('w') as handle:
-        handle.write(graph.serialize(format='turtle'))
-    return graph
-def test_process_data(tmp_path):
-    input_file = tmp_path / "input.txt"
-    output_file = tmp_path / "output.txt"
-    input_file.write_text("Hello, world!")
-    # Run the process_data function
-    result = process_data(InPath(str(input_file)), OutPath(str(output_file)))
-    # Check that the output file was created and contains the correct data
-    assert output_file.exists()
-    assert output_file.read_text() == "Hello, world!"
-def test_rule_returns_graph(tmp_path):
-    input_csv = tmp_path / "region_totals.csv"
-    graph_ttl = tmp_path / "region_totals.ttl"
-    input_csv.write_text("region,total_units,total_revenue\nNorth,6,119.94\n")
-    result = totals_graph(InPath(str(input_csv)), OutPath(str(graph_ttl)))
-    assert isinstance(result, Graph)
-    assert graph_ttl.exists()
-    assert "North" in graph_ttl.read_text()
-    print(*TEST_PROV_DIR.glob('*'))
-    assert list(TEST_PROV_DIR.glob('*'))