data-annotations 2.2.0__tar.gz → 2.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_annotations-2.2.0 → data_annotations-2.3.0}/PKG-INFO +33 -3
- {data_annotations-2.2.0 → data_annotations-2.3.0}/README.md +32 -2
- {data_annotations-2.2.0 → data_annotations-2.3.0}/pyproject.toml +1 -1
- {data_annotations-2.2.0 → data_annotations-2.3.0}/src/data_annotations/annotations/models.py +2 -2
- {data_annotations-2.2.0 → data_annotations-2.3.0}/src/data_annotations/annotations/writers.py +7 -6
- data_annotations-2.3.0/src/data_annotations/cli_app/provenance_commands.py +321 -0
- {data_annotations-2.2.0 → data_annotations-2.3.0}/src/data_annotations/provenance/__init__.py +14 -1
- {data_annotations-2.2.0 → data_annotations-2.3.0}/src/data_annotations/provenance/models.py +10 -0
- {data_annotations-2.2.0 → data_annotations-2.3.0}/src/data_annotations/provenance/recovery.py +326 -4
- {data_annotations-2.2.0 → data_annotations-2.3.0}/src/data_annotations/provenance/writers.py +119 -10
- data_annotations-2.2.0/src/data_annotations/cli_app/provenance_commands.py +0 -107
- {data_annotations-2.2.0 → data_annotations-2.3.0}/LICENSE +0 -0
- {data_annotations-2.2.0 → data_annotations-2.3.0}/src/data_annotations/__init__.py +0 -0
- {data_annotations-2.2.0 → data_annotations-2.3.0}/src/data_annotations/_decorators.py +0 -0
- {data_annotations-2.2.0 → data_annotations-2.3.0}/src/data_annotations/annotations/__init__.py +0 -0
- {data_annotations-2.2.0 → data_annotations-2.3.0}/src/data_annotations/annotations/decorators.py +0 -0
- {data_annotations-2.2.0 → data_annotations-2.3.0}/src/data_annotations/cli.py +0 -0
- {data_annotations-2.2.0 → data_annotations-2.3.0}/src/data_annotations/cli_app/__init__.py +0 -0
- {data_annotations-2.2.0 → data_annotations-2.3.0}/src/data_annotations/cli_app/annotate.py +0 -0
- {data_annotations-2.2.0 → data_annotations-2.3.0}/src/data_annotations/cli_app/common.py +0 -0
- {data_annotations-2.2.0 → data_annotations-2.3.0}/src/data_annotations/cli_app/prompts.py +0 -0
- {data_annotations-2.2.0 → data_annotations-2.3.0}/src/data_annotations/description/__init__.py +0 -0
- {data_annotations-2.2.0 → data_annotations-2.3.0}/src/data_annotations/description/decorators.py +0 -0
- {data_annotations-2.2.0 → data_annotations-2.3.0}/src/data_annotations/description/models.py +0 -0
- {data_annotations-2.2.0 → data_annotations-2.3.0}/src/data_annotations/description/writers.py +0 -0
- {data_annotations-2.2.0 → data_annotations-2.3.0}/src/data_annotations/provenance/decorators.py +0 -0
- {data_annotations-2.2.0 → data_annotations-2.3.0}/src/data_annotations/provenance/git.py +0 -0
- {data_annotations-2.2.0 → data_annotations-2.3.0}/src/data_annotations/provenance/runtime.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: data-annotations
|
|
3
|
-
Version: 2.2.0
|
|
3
|
+
Version: 2.3.0
|
|
4
4
|
Summary: Annotate generated data artifacts
|
|
5
5
|
Keywords: annotations,data,metadata,provenance,reproducibility
|
|
6
6
|
Author: Rodrigo C. G. Pena
|
|
@@ -102,6 +102,9 @@ Every annotation document includes provenance with:
|
|
|
102
102
|
- Git commit, branch, dirty state, canonical repository remote, exact tags, and
|
|
103
103
|
`git describe` output when available
|
|
104
104
|
- The current `SLURM_JOB_ID` when available
|
|
105
|
+
- Structured snapshots for recorded local inputs, including file checksums,
|
|
106
|
+
directory content digests, and upstream annotation sidecar references when
|
|
107
|
+
present
|
|
105
108
|
|
|
106
109
|
You can also attach your own parameters, input file paths, and function names.
|
|
107
110
|
Local filesystem paths in provenance are stored as absolute paths. URI-style inputs
|
|
@@ -380,6 +383,7 @@ File annotations store:
|
|
|
380
383
|
- `subject.kind`
|
|
381
384
|
- `subject.sha256`
|
|
382
385
|
- `provenance.*`
|
|
386
|
+
- `provenance.input_artifacts[]`
|
|
383
387
|
- `description.title`
|
|
384
388
|
- `description.summary`
|
|
385
389
|
- `description.fields`
|
|
@@ -396,6 +400,7 @@ Directory annotations store:
|
|
|
396
400
|
- `subject.child_bundles[]`
|
|
397
401
|
- `subject.content_digest`
|
|
398
402
|
- `provenance.*`
|
|
403
|
+
- `provenance.input_artifacts[]`
|
|
399
404
|
- `description.title`
|
|
400
405
|
- `description.summary`
|
|
401
406
|
- `description.artifact_groups[]`
|
|
@@ -529,13 +534,16 @@ per call.
|
|
|
529
534
|
## Recovery Helpers
|
|
530
535
|
|
|
531
536
|
Use `artifact_matches_manifest(...)` to verify whether a detached artifact still
|
|
532
|
-
matches an annotation document
|
|
533
|
-
recorded
|
|
537
|
+
matches an annotation document. Use `analyze_provenance_chain(...)` when you also
|
|
538
|
+
want to verify recorded inputs and recursively follow upstream annotation
|
|
539
|
+
sidecars. Use `checkout_manifest_source(...)` to recover the recorded code state
|
|
540
|
+
from Git metadata.
|
|
534
541
|
|
|
535
542
|
```python
|
|
536
543
|
from pathlib import Path
|
|
537
544
|
|
|
538
545
|
from data_annotations.provenance import (
|
|
546
|
+
analyze_provenance_chain,
|
|
539
547
|
artifact_matches_manifest,
|
|
540
548
|
checkout_manifest_source,
|
|
541
549
|
)
|
|
@@ -544,6 +552,8 @@ annotation_path = Path("outputs/participants.csv.annotation.json")
|
|
|
544
552
|
artifact_path = Path("downloads/participants.csv")
|
|
545
553
|
|
|
546
554
|
if artifact_matches_manifest(artifact_path, annotation_path):
|
|
555
|
+
chain = analyze_provenance_chain(artifact_path)
|
|
556
|
+
print(chain.status)
|
|
547
557
|
recovered = checkout_manifest_source(annotation_path)
|
|
548
558
|
print(recovered.checkout_path)
|
|
549
559
|
print(recovered.script_path)
|
|
@@ -602,12 +612,24 @@ For provenance inspection and source recovery:
|
|
|
602
612
|
|
|
603
613
|
```bash
|
|
604
614
|
data-annotations provenance match path/to/artifact
|
|
615
|
+
data-annotations provenance chain path/to/artifact
|
|
616
|
+
data-annotations provenance chain path/to/artifact --full-paths
|
|
605
617
|
data-annotations provenance checkout path/to/artifact
|
|
606
618
|
```
|
|
607
619
|
|
|
608
620
|
Command `match` auto-discovers `*.annotation.json` for files and `data-annotations.json` for
|
|
609
621
|
directories, prints a verification summary, and suggests the exact `checkout`
|
|
610
622
|
command to run next when Git recovery metadata is available.
|
|
623
|
+
Command `chain` uses the same sidecar discovery, then verifies the artifact,
|
|
624
|
+
recorded input snapshots, and any upstream annotation sidecars reachable from
|
|
625
|
+
those inputs. Its default output shows a compact relative-path tree and lists
|
|
626
|
+
stale, missing, or unverifiable nodes first; use `--full-paths` when you need
|
|
627
|
+
absolute paths.
|
|
628
|
+
|
|
629
|
+
If `data-annotations provenance --help` does not list `chain`, your shell is
|
|
630
|
+
resolving an older installed command. From a source checkout, use
|
|
631
|
+
`uv run data-annotations provenance chain ...`, or reinstall the CLI from the
|
|
632
|
+
updated source before using the bare `data-annotations` command.
|
|
611
633
|
|
|
612
634
|
### Run With `uvx`
|
|
613
635
|
|
|
@@ -632,6 +654,7 @@ the project environment. You can then run:
|
|
|
632
654
|
uv run data-annotations annotate file path/to/participants.csv
|
|
633
655
|
uv run data-annotations annotate directory path/to/run-001
|
|
634
656
|
uv run data-annotations provenance match path/to/participants.csv
|
|
657
|
+
uv run data-annotations provenance chain path/to/participants.csv
|
|
635
658
|
uv run data-annotations provenance checkout path/to/participants.csv
|
|
636
659
|
```
|
|
637
660
|
|
|
@@ -684,9 +707,12 @@ uv run data-annotations provenance checkout path/to/participants.csv
|
|
|
684
707
|
|
|
685
708
|
- `ProducedFile`
|
|
686
709
|
- `ChildBundle`
|
|
710
|
+
- `InputArtifact`
|
|
687
711
|
- `BaseProvenance`
|
|
688
712
|
- `FileManifest`
|
|
689
713
|
- `DirectoryManifest`
|
|
714
|
+
- `ProvenanceChainNode`
|
|
715
|
+
- `ProvenanceChainReport`
|
|
690
716
|
- `RecoveredSource`
|
|
691
717
|
|
|
692
718
|
### Provenance Functions
|
|
@@ -696,6 +722,8 @@ uv run data-annotations provenance checkout path/to/participants.csv
|
|
|
696
722
|
- `write_file_manifest(...)`
|
|
697
723
|
- `write_directory_manifest(...)`
|
|
698
724
|
- `directory_content_digest(...)`
|
|
725
|
+
- `analyze_provenance_chain(...)`
|
|
726
|
+
- `provenance_chain_is_fresh(...)`
|
|
699
727
|
- `artifact_matches_manifest(...)`
|
|
700
728
|
- `checkout_manifest_source(...)`
|
|
701
729
|
|
|
@@ -717,6 +745,8 @@ uv run python examples/write_file_manifest.py
|
|
|
717
745
|
uv run python examples/write_directory_manifest.py
|
|
718
746
|
uv run python examples/write_file_description.py
|
|
719
747
|
uv run python examples/write_directory_description.py
|
|
748
|
+
uv run python examples/provenance_chain.py
|
|
749
|
+
uv run python examples/provenance_chain_cli.py
|
|
720
750
|
uv run python examples/recover_provenance.py
|
|
721
751
|
uv run python examples/recover_provenance_cli.py
|
|
722
752
|
```
|
|
@@ -73,6 +73,9 @@ Every annotation document includes provenance with:
|
|
|
73
73
|
- Git commit, branch, dirty state, canonical repository remote, exact tags, and
|
|
74
74
|
`git describe` output when available
|
|
75
75
|
- The current `SLURM_JOB_ID` when available
|
|
76
|
+
- Structured snapshots for recorded local inputs, including file checksums,
|
|
77
|
+
directory content digests, and upstream annotation sidecar references when
|
|
78
|
+
present
|
|
76
79
|
|
|
77
80
|
You can also attach your own parameters, input file paths, and function names.
|
|
78
81
|
Local filesystem paths in provenance are stored as absolute paths. URI-style inputs
|
|
@@ -351,6 +354,7 @@ File annotations store:
|
|
|
351
354
|
- `subject.kind`
|
|
352
355
|
- `subject.sha256`
|
|
353
356
|
- `provenance.*`
|
|
357
|
+
- `provenance.input_artifacts[]`
|
|
354
358
|
- `description.title`
|
|
355
359
|
- `description.summary`
|
|
356
360
|
- `description.fields`
|
|
@@ -367,6 +371,7 @@ Directory annotations store:
|
|
|
367
371
|
- `subject.child_bundles[]`
|
|
368
372
|
- `subject.content_digest`
|
|
369
373
|
- `provenance.*`
|
|
374
|
+
- `provenance.input_artifacts[]`
|
|
370
375
|
- `description.title`
|
|
371
376
|
- `description.summary`
|
|
372
377
|
- `description.artifact_groups[]`
|
|
@@ -500,13 +505,16 @@ per call.
|
|
|
500
505
|
## Recovery Helpers
|
|
501
506
|
|
|
502
507
|
Use `artifact_matches_manifest(...)` to verify whether a detached artifact still
|
|
503
|
-
matches an annotation document
|
|
504
|
-
recorded
|
|
508
|
+
matches an annotation document. Use `analyze_provenance_chain(...)` when you also
|
|
509
|
+
want to verify recorded inputs and recursively follow upstream annotation
|
|
510
|
+
sidecars. Use `checkout_manifest_source(...)` to recover the recorded code state
|
|
511
|
+
from Git metadata.
|
|
505
512
|
|
|
506
513
|
```python
|
|
507
514
|
from pathlib import Path
|
|
508
515
|
|
|
509
516
|
from data_annotations.provenance import (
|
|
517
|
+
analyze_provenance_chain,
|
|
510
518
|
artifact_matches_manifest,
|
|
511
519
|
checkout_manifest_source,
|
|
512
520
|
)
|
|
@@ -515,6 +523,8 @@ annotation_path = Path("outputs/participants.csv.annotation.json")
|
|
|
515
523
|
artifact_path = Path("downloads/participants.csv")
|
|
516
524
|
|
|
517
525
|
if artifact_matches_manifest(artifact_path, annotation_path):
|
|
526
|
+
chain = analyze_provenance_chain(artifact_path)
|
|
527
|
+
print(chain.status)
|
|
518
528
|
recovered = checkout_manifest_source(annotation_path)
|
|
519
529
|
print(recovered.checkout_path)
|
|
520
530
|
print(recovered.script_path)
|
|
@@ -573,12 +583,24 @@ For provenance inspection and source recovery:
|
|
|
573
583
|
|
|
574
584
|
```bash
|
|
575
585
|
data-annotations provenance match path/to/artifact
|
|
586
|
+
data-annotations provenance chain path/to/artifact
|
|
587
|
+
data-annotations provenance chain path/to/artifact --full-paths
|
|
576
588
|
data-annotations provenance checkout path/to/artifact
|
|
577
589
|
```
|
|
578
590
|
|
|
579
591
|
Command `match` auto-discovers `*.annotation.json` for files and `data-annotations.json` for
|
|
580
592
|
directories, prints a verification summary, and suggests the exact `checkout`
|
|
581
593
|
command to run next when Git recovery metadata is available.
|
|
594
|
+
Command `chain` uses the same sidecar discovery, then verifies the artifact,
|
|
595
|
+
recorded input snapshots, and any upstream annotation sidecars reachable from
|
|
596
|
+
those inputs. Its default output shows a compact relative-path tree and lists
|
|
597
|
+
stale, missing, or unverifiable nodes first; use `--full-paths` when you need
|
|
598
|
+
absolute paths.
|
|
599
|
+
|
|
600
|
+
If `data-annotations provenance --help` does not list `chain`, your shell is
|
|
601
|
+
resolving an older installed command. From a source checkout, use
|
|
602
|
+
`uv run data-annotations provenance chain ...`, or reinstall the CLI from the
|
|
603
|
+
updated source before using the bare `data-annotations` command.
|
|
582
604
|
|
|
583
605
|
### Run With `uvx`
|
|
584
606
|
|
|
@@ -603,6 +625,7 @@ the project environment. You can then run:
|
|
|
603
625
|
uv run data-annotations annotate file path/to/participants.csv
|
|
604
626
|
uv run data-annotations annotate directory path/to/run-001
|
|
605
627
|
uv run data-annotations provenance match path/to/participants.csv
|
|
628
|
+
uv run data-annotations provenance chain path/to/participants.csv
|
|
606
629
|
uv run data-annotations provenance checkout path/to/participants.csv
|
|
607
630
|
```
|
|
608
631
|
|
|
@@ -655,9 +678,12 @@ uv run data-annotations provenance checkout path/to/participants.csv
|
|
|
655
678
|
|
|
656
679
|
- `ProducedFile`
|
|
657
680
|
- `ChildBundle`
|
|
681
|
+
- `InputArtifact`
|
|
658
682
|
- `BaseProvenance`
|
|
659
683
|
- `FileManifest`
|
|
660
684
|
- `DirectoryManifest`
|
|
685
|
+
- `ProvenanceChainNode`
|
|
686
|
+
- `ProvenanceChainReport`
|
|
661
687
|
- `RecoveredSource`
|
|
662
688
|
|
|
663
689
|
### Provenance Functions
|
|
@@ -667,6 +693,8 @@ uv run data-annotations provenance checkout path/to/participants.csv
|
|
|
667
693
|
- `write_file_manifest(...)`
|
|
668
694
|
- `write_directory_manifest(...)`
|
|
669
695
|
- `directory_content_digest(...)`
|
|
696
|
+
- `analyze_provenance_chain(...)`
|
|
697
|
+
- `provenance_chain_is_fresh(...)`
|
|
670
698
|
- `artifact_matches_manifest(...)`
|
|
671
699
|
- `checkout_manifest_source(...)`
|
|
672
700
|
|
|
@@ -688,6 +716,8 @@ uv run python examples/write_file_manifest.py
|
|
|
688
716
|
uv run python examples/write_directory_manifest.py
|
|
689
717
|
uv run python examples/write_file_description.py
|
|
690
718
|
uv run python examples/write_directory_description.py
|
|
719
|
+
uv run python examples/provenance_chain.py
|
|
720
|
+
uv run python examples/provenance_chain_cli.py
|
|
691
721
|
uv run python examples/recover_provenance.py
|
|
692
722
|
uv run python examples/recover_provenance_cli.py
|
|
693
723
|
```
|
{data_annotations-2.2.0 → data_annotations-2.3.0}/src/data_annotations/annotations/models.py
RENAMED
|
@@ -22,14 +22,14 @@ class DirectoryArtifactSubject(BaseModel):
|
|
|
22
22
|
|
|
23
23
|
|
|
24
24
|
class FileAnnotationDocument(BaseModel):
|
|
25
|
-
annotation_version: Literal["
|
|
25
|
+
annotation_version: Literal["5"] = "5"
|
|
26
26
|
subject: FileArtifactSubject
|
|
27
27
|
provenance: BaseProvenance
|
|
28
28
|
description: FileDescription
|
|
29
29
|
|
|
30
30
|
|
|
31
31
|
class DirectoryAnnotationDocument(BaseModel):
|
|
32
|
-
annotation_version: Literal["
|
|
32
|
+
annotation_version: Literal["5"] = "5"
|
|
33
33
|
subject: DirectoryArtifactSubject
|
|
34
34
|
provenance: BaseProvenance
|
|
35
35
|
description: DirectoryDescription
|
{data_annotations-2.2.0 → data_annotations-2.3.0}/src/data_annotations/annotations/writers.py
RENAMED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
from collections.abc import Sequence
|
|
1
2
|
from pathlib import Path
|
|
2
3
|
from typing import Any, Callable
|
|
3
4
|
|
|
@@ -154,7 +155,7 @@ def _build_file_annotation_document(
|
|
|
154
155
|
generation_context: dict[str, Any] | None = None,
|
|
155
156
|
artifact_kind: ArtifactKind = "other",
|
|
156
157
|
params: dict[str, Any] | None = None,
|
|
157
|
-
inputs:
|
|
158
|
+
inputs: Sequence[str | Path] | None = None,
|
|
158
159
|
function: Callable[..., Any] | None = None,
|
|
159
160
|
capture_mode: str = "runtime",
|
|
160
161
|
provenance_overrides: dict[str, Any] | None = None,
|
|
@@ -205,7 +206,7 @@ def _build_directory_annotation_document(
|
|
|
205
206
|
acquisition_context: dict[str, Any] | None = None,
|
|
206
207
|
generation_context: dict[str, Any] | None = None,
|
|
207
208
|
params: dict[str, Any] | None = None,
|
|
208
|
-
inputs:
|
|
209
|
+
inputs: Sequence[str | Path] | None = None,
|
|
209
210
|
function: Callable[..., Any] | None = None,
|
|
210
211
|
capture_mode: str = "runtime",
|
|
211
212
|
provenance_overrides: dict[str, Any] | None = None,
|
|
@@ -285,7 +286,7 @@ def write_file_annotation(
|
|
|
285
286
|
generation_context: dict[str, Any] | None = None,
|
|
286
287
|
artifact_kind: ArtifactKind = "other",
|
|
287
288
|
params: dict[str, Any] | None = None,
|
|
288
|
-
inputs:
|
|
289
|
+
inputs: Sequence[str | Path] | None = None,
|
|
289
290
|
function: Callable[..., Any] | None = None,
|
|
290
291
|
capture_mode: str = "runtime",
|
|
291
292
|
provenance_overrides: dict[str, Any] | None = None,
|
|
@@ -324,7 +325,7 @@ def write_directory_annotation(
|
|
|
324
325
|
acquisition_context: dict[str, Any] | None = None,
|
|
325
326
|
generation_context: dict[str, Any] | None = None,
|
|
326
327
|
params: dict[str, Any] | None = None,
|
|
327
|
-
inputs:
|
|
328
|
+
inputs: Sequence[str | Path] | None = None,
|
|
328
329
|
function: Callable[..., Any] | None = None,
|
|
329
330
|
capture_mode: str = "runtime",
|
|
330
331
|
provenance_overrides: dict[str, Any] | None = None,
|
|
@@ -363,7 +364,7 @@ def annotate_file(
|
|
|
363
364
|
generation_context: dict[str, Any] | None = None,
|
|
364
365
|
artifact_kind: ArtifactKind = "other",
|
|
365
366
|
params: dict[str, Any] | None = None,
|
|
366
|
-
inputs:
|
|
367
|
+
inputs: Sequence[str | Path] | None = None,
|
|
367
368
|
function: Callable[..., Any] | None = None,
|
|
368
369
|
write_readme: bool = True,
|
|
369
370
|
write_schema: bool | None = None,
|
|
@@ -418,7 +419,7 @@ def annotate_directory(
|
|
|
418
419
|
acquisition_context: dict[str, Any] | None = None,
|
|
419
420
|
generation_context: dict[str, Any] | None = None,
|
|
420
421
|
params: dict[str, Any] | None = None,
|
|
421
|
-
inputs:
|
|
422
|
+
inputs: Sequence[str | Path] | None = None,
|
|
422
423
|
function: Callable[..., Any] | None = None,
|
|
423
424
|
write_readme: bool = True,
|
|
424
425
|
write_schema: bool | None = None,
|
|
@@ -0,0 +1,321 @@
|
|
|
1
|
+
import subprocess
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from os.path import commonpath
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from urllib.parse import urlparse
|
|
6
|
+
|
|
7
|
+
import typer
|
|
8
|
+
|
|
9
|
+
from data_annotations.provenance import checkout_manifest_source
|
|
10
|
+
from data_annotations.provenance import recovery as provenance_recovery
|
|
11
|
+
|
|
12
|
+
from .common import (
|
|
13
|
+
_checkout_hint,
|
|
14
|
+
_echo_entries,
|
|
15
|
+
_error,
|
|
16
|
+
_match_target_path,
|
|
17
|
+
_missing_checkout_fields,
|
|
18
|
+
_resolve_manifest_path,
|
|
19
|
+
_resolved_path,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
provenance_app = typer.Typer(
|
|
23
|
+
no_args_is_help=True,
|
|
24
|
+
help="Inspect provenance recorded in annotation documents.",
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@provenance_app.command("match")
|
|
29
|
+
def match_command(
|
|
30
|
+
target: Path = typer.Argument(
|
|
31
|
+
..., help="Artifact, directory, or annotation document path."
|
|
32
|
+
),
|
|
33
|
+
manifest: Path | None = typer.Option(
|
|
34
|
+
None,
|
|
35
|
+
"--manifest",
|
|
36
|
+
help="Explicit annotation document path to use instead of auto-discovery.",
|
|
37
|
+
),
|
|
38
|
+
) -> None:
|
|
39
|
+
manifest_path = _resolve_manifest_path(target, manifest)
|
|
40
|
+
candidate_path = _match_target_path(target, manifest)
|
|
41
|
+
loaded_manifest = provenance_recovery._load_manifest(manifest_path)
|
|
42
|
+
match = provenance_recovery._analyze_artifact_match(candidate_path, loaded_manifest)
|
|
43
|
+
|
|
44
|
+
typer.echo(f"Target: {candidate_path}")
|
|
45
|
+
typer.echo(f"Manifest: {manifest_path}")
|
|
46
|
+
typer.echo(f"Result: {match.status.replace('_', ' ').upper()}")
|
|
47
|
+
|
|
48
|
+
_echo_entries("Verified entries", match.verified_entries)
|
|
49
|
+
_echo_entries("Missing tracked entries", match.missing_tracked_entries)
|
|
50
|
+
_echo_entries("Mismatched tracked entries", match.mismatched_tracked_entries)
|
|
51
|
+
_echo_entries("Extra entries", match.extra_entries)
|
|
52
|
+
_echo_entries("Unverifiable tracked entries", match.unverifiable_tracked_entries)
|
|
53
|
+
|
|
54
|
+
if match.status in {"match", "partial_match"}:
|
|
55
|
+
missing_checkout_fields = _missing_checkout_fields(loaded_manifest)
|
|
56
|
+
if missing_checkout_fields:
|
|
57
|
+
typer.echo(
|
|
58
|
+
"Checkout unavailable: manifest is missing "
|
|
59
|
+
+ ", ".join(missing_checkout_fields)
|
|
60
|
+
)
|
|
61
|
+
else:
|
|
62
|
+
typer.echo("Next step:")
|
|
63
|
+
typer.echo(
|
|
64
|
+
" "
|
|
65
|
+
+ _checkout_hint(
|
|
66
|
+
str(_resolved_path(target)),
|
|
67
|
+
str(_resolved_path(manifest)) if manifest is not None else None,
|
|
68
|
+
)
|
|
69
|
+
)
|
|
70
|
+
return
|
|
71
|
+
|
|
72
|
+
raise typer.Exit(code=1)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
@dataclass(frozen=True)
|
|
76
|
+
class _DisplayChainNode:
|
|
77
|
+
path: str
|
|
78
|
+
status: provenance_recovery.ChainStatus
|
|
79
|
+
details: tuple[str, ...]
|
|
80
|
+
inputs: tuple["_DisplayChainNode", ...]
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _is_uri_like(value: str) -> bool:
|
|
84
|
+
parsed = urlparse(value)
|
|
85
|
+
return bool(parsed.scheme and (parsed.netloc or "://" in value))
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _node_paths_equal(left: str, right: str) -> bool:
|
|
89
|
+
if _is_uri_like(left) or _is_uri_like(right):
|
|
90
|
+
return left == right
|
|
91
|
+
return Path(left).expanduser().resolve() == Path(right).expanduser().resolve()
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _should_merge_upstream_node(
|
|
95
|
+
node: provenance_recovery.ProvenanceChainNode,
|
|
96
|
+
child: provenance_recovery.ProvenanceChainNode,
|
|
97
|
+
) -> bool:
|
|
98
|
+
return _node_paths_equal(node.path, child.path)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def _display_chain_node(
|
|
102
|
+
node: provenance_recovery.ProvenanceChainNode,
|
|
103
|
+
) -> _DisplayChainNode:
|
|
104
|
+
details = node.details
|
|
105
|
+
inputs = node.inputs
|
|
106
|
+
if len(inputs) == 1 and _should_merge_upstream_node(node, inputs[0]):
|
|
107
|
+
upstream = inputs[0]
|
|
108
|
+
details = (*details, *upstream.details)
|
|
109
|
+
inputs = upstream.inputs
|
|
110
|
+
|
|
111
|
+
return _DisplayChainNode(
|
|
112
|
+
path=node.path,
|
|
113
|
+
status=node.status,
|
|
114
|
+
details=details,
|
|
115
|
+
inputs=tuple(_display_chain_node(input_node) for input_node in inputs),
|
|
116
|
+
)
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def _filesystem_display_base(root: _DisplayChainNode) -> Path | None:
|
|
120
|
+
parent_paths: list[str] = []
|
|
121
|
+
|
|
122
|
+
def collect(node: _DisplayChainNode) -> None:
|
|
123
|
+
if not _is_uri_like(node.path):
|
|
124
|
+
path = Path(node.path).expanduser().resolve()
|
|
125
|
+
parent_paths.append(str(path if path.is_dir() else path.parent))
|
|
126
|
+
for input_node in node.inputs:
|
|
127
|
+
collect(input_node)
|
|
128
|
+
|
|
129
|
+
collect(root)
|
|
130
|
+
if not parent_paths:
|
|
131
|
+
return None
|
|
132
|
+
try:
|
|
133
|
+
return Path(commonpath(parent_paths))
|
|
134
|
+
except ValueError:
|
|
135
|
+
return None
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def _format_chain_path(
|
|
139
|
+
value: str,
|
|
140
|
+
*,
|
|
141
|
+
display_base: Path | None,
|
|
142
|
+
full_paths: bool,
|
|
143
|
+
) -> str:
|
|
144
|
+
if full_paths or _is_uri_like(value):
|
|
145
|
+
return value
|
|
146
|
+
path = Path(value).expanduser().resolve()
|
|
147
|
+
if display_base is None:
|
|
148
|
+
return path.name
|
|
149
|
+
try:
|
|
150
|
+
return path.relative_to(display_base).as_posix()
|
|
151
|
+
except ValueError:
|
|
152
|
+
return str(path)
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def _problem_detail(details: tuple[str, ...]) -> str | None:
|
|
156
|
+
problem_terms = (
|
|
157
|
+
"changed",
|
|
158
|
+
"missing",
|
|
159
|
+
"no recorded",
|
|
160
|
+
"not locally verifiable",
|
|
161
|
+
"could not",
|
|
162
|
+
"unverifiable",
|
|
163
|
+
"stale:",
|
|
164
|
+
)
|
|
165
|
+
for detail in details:
|
|
166
|
+
lowered = detail.lower()
|
|
167
|
+
if any(term in lowered for term in problem_terms):
|
|
168
|
+
return detail
|
|
169
|
+
return None
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def _echo_chain_problems(
|
|
173
|
+
root: _DisplayChainNode,
|
|
174
|
+
*,
|
|
175
|
+
display_base: Path | None,
|
|
176
|
+
full_paths: bool,
|
|
177
|
+
) -> None:
|
|
178
|
+
problems: list[tuple[_DisplayChainNode, _DisplayChainNode | None, str]] = []
|
|
179
|
+
|
|
180
|
+
def collect(
|
|
181
|
+
node: _DisplayChainNode,
|
|
182
|
+
parent: _DisplayChainNode | None = None,
|
|
183
|
+
) -> None:
|
|
184
|
+
if node.status != "fresh":
|
|
185
|
+
detail = _problem_detail(node.details)
|
|
186
|
+
if detail is not None:
|
|
187
|
+
problems.append((node, parent, detail))
|
|
188
|
+
for input_node in node.inputs:
|
|
189
|
+
collect(input_node, node)
|
|
190
|
+
|
|
191
|
+
collect(root)
|
|
192
|
+
if not problems:
|
|
193
|
+
return
|
|
194
|
+
|
|
195
|
+
typer.echo("")
|
|
196
|
+
typer.echo("Problems:")
|
|
197
|
+
for node, parent, detail in problems:
|
|
198
|
+
typer.echo(
|
|
199
|
+
" "
|
|
200
|
+
+ f"{node.status.upper():<12}"
|
|
201
|
+
+ _format_chain_path(
|
|
202
|
+
node.path,
|
|
203
|
+
display_base=display_base,
|
|
204
|
+
full_paths=full_paths,
|
|
205
|
+
)
|
|
206
|
+
)
|
|
207
|
+
typer.echo(f" {'':<12}{detail}")
|
|
208
|
+
if parent is not None:
|
|
209
|
+
typer.echo(
|
|
210
|
+
" "
|
|
211
|
+
+ f"{'':<12}used by "
|
|
212
|
+
+ _format_chain_path(
|
|
213
|
+
parent.path,
|
|
214
|
+
display_base=display_base,
|
|
215
|
+
full_paths=full_paths,
|
|
216
|
+
)
|
|
217
|
+
)
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def _echo_chain_tree(
|
|
221
|
+
node: _DisplayChainNode,
|
|
222
|
+
*,
|
|
223
|
+
display_base: Path | None,
|
|
224
|
+
full_paths: bool,
|
|
225
|
+
depth: int = 0,
|
|
226
|
+
) -> None:
|
|
227
|
+
indent = " " * depth
|
|
228
|
+
path = _format_chain_path(
|
|
229
|
+
node.path,
|
|
230
|
+
display_base=display_base,
|
|
231
|
+
full_paths=full_paths,
|
|
232
|
+
)
|
|
233
|
+
typer.echo(f"{indent}[{node.status.upper()}] {path}")
|
|
234
|
+
for input_node in node.inputs:
|
|
235
|
+
_echo_chain_tree(
|
|
236
|
+
input_node,
|
|
237
|
+
display_base=display_base,
|
|
238
|
+
full_paths=full_paths,
|
|
239
|
+
depth=depth + 1,
|
|
240
|
+
)
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def _echo_chain_report(
|
|
244
|
+
report: provenance_recovery.ProvenanceChainReport,
|
|
245
|
+
*,
|
|
246
|
+
full_paths: bool,
|
|
247
|
+
) -> None:
|
|
248
|
+
root = _display_chain_node(report.root)
|
|
249
|
+
display_base = _filesystem_display_base(root)
|
|
250
|
+
typer.echo(f"Result: {report.status.upper()}")
|
|
251
|
+
_echo_chain_problems(root, display_base=display_base, full_paths=full_paths)
|
|
252
|
+
typer.echo("")
|
|
253
|
+
typer.echo("Chain:")
|
|
254
|
+
_echo_chain_tree(root, display_base=display_base, full_paths=full_paths)
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
@provenance_app.command("chain")
|
|
258
|
+
def chain_command(
|
|
259
|
+
target: Path = typer.Argument(
|
|
260
|
+
..., help="Artifact, directory, or annotation document path."
|
|
261
|
+
),
|
|
262
|
+
manifest: Path | None = typer.Option(
|
|
263
|
+
None,
|
|
264
|
+
"--manifest",
|
|
265
|
+
help="Explicit annotation document path to use instead of auto-discovery.",
|
|
266
|
+
),
|
|
267
|
+
full_paths: bool = typer.Option(
|
|
268
|
+
False,
|
|
269
|
+
"--full-paths",
|
|
270
|
+
help="Show full paths instead of compact paths relative to the chain root.",
|
|
271
|
+
),
|
|
272
|
+
) -> None:
|
|
273
|
+
manifest_path = _resolve_manifest_path(target, manifest)
|
|
274
|
+
candidate_path = _match_target_path(target, manifest)
|
|
275
|
+
report = provenance_recovery.analyze_provenance_chain(
|
|
276
|
+
candidate_path,
|
|
277
|
+
manifest_path,
|
|
278
|
+
)
|
|
279
|
+
|
|
280
|
+
_echo_chain_report(report, full_paths=full_paths)
|
|
281
|
+
|
|
282
|
+
if report.status != "fresh":
|
|
283
|
+
raise typer.Exit(code=1)
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
@provenance_app.command("checkout")
|
|
287
|
+
def checkout_command(
|
|
288
|
+
target: Path = typer.Argument(
|
|
289
|
+
..., help="Artifact, directory, or annotation document path."
|
|
290
|
+
),
|
|
291
|
+
manifest: Path | None = typer.Option(
|
|
292
|
+
None,
|
|
293
|
+
"--manifest",
|
|
294
|
+
help="Explicit annotation document path to use instead of auto-discovery.",
|
|
295
|
+
),
|
|
296
|
+
dest: Path | None = typer.Option(
|
|
297
|
+
None,
|
|
298
|
+
"--dest",
|
|
299
|
+
help="Optional checkout destination. Defaults to a stable user cache.",
|
|
300
|
+
),
|
|
301
|
+
) -> None:
|
|
302
|
+
manifest_path = _resolve_manifest_path(target, manifest)
|
|
303
|
+
|
|
304
|
+
try:
|
|
305
|
+
recovered = checkout_manifest_source(
|
|
306
|
+
manifest_path,
|
|
307
|
+
destination_dir=dest,
|
|
308
|
+
)
|
|
309
|
+
except ValueError as exc:
|
|
310
|
+
_error(str(exc), code=1)
|
|
311
|
+
except subprocess.CalledProcessError:
|
|
312
|
+
_error("failed to clone or checkout the recorded repository state", code=1)
|
|
313
|
+
|
|
314
|
+
typer.echo(f"Manifest: {manifest_path}")
|
|
315
|
+
typer.echo(f"Checkout path: {recovered.checkout_path}")
|
|
316
|
+
if recovered.script_path is not None:
|
|
317
|
+
typer.echo(f"Recovered script: {recovered.script_path}")
|
|
318
|
+
else:
|
|
319
|
+
typer.echo(
|
|
320
|
+
"Recovered repository checkout, but the generating script could not be resolved."
|
|
321
|
+
)
|
{data_annotations-2.2.0 → data_annotations-2.3.0}/src/data_annotations/provenance/__init__.py
RENAMED
|
@@ -4,12 +4,20 @@ from .models import (
|
|
|
4
4
|
ChildBundle,
|
|
5
5
|
DirectoryManifest,
|
|
6
6
|
FileManifest,
|
|
7
|
+
InputArtifact,
|
|
7
8
|
ProducedFile,
|
|
8
9
|
RecoveredSource,
|
|
9
10
|
)
|
|
10
11
|
from .decorators import record_directory_manifest, record_file_manifest
|
|
11
12
|
from .git import capture_git_info
|
|
12
|
-
from .recovery import artifact_matches_manifest, checkout_manifest_source
|
|
13
|
+
from .recovery import (
|
|
14
|
+
ProvenanceChainNode,
|
|
15
|
+
ProvenanceChainReport,
|
|
16
|
+
analyze_provenance_chain,
|
|
17
|
+
artifact_matches_manifest,
|
|
18
|
+
checkout_manifest_source,
|
|
19
|
+
provenance_chain_is_fresh,
|
|
20
|
+
)
|
|
13
21
|
from .runtime import capture_runtime_info
|
|
14
22
|
from .writers import (
|
|
15
23
|
callable_name,
|
|
@@ -25,13 +33,18 @@ __all__ = [
|
|
|
25
33
|
"ChildBundle",
|
|
26
34
|
"DirectoryManifest",
|
|
27
35
|
"FileManifest",
|
|
36
|
+
"InputArtifact",
|
|
28
37
|
"ProducedFile",
|
|
38
|
+
"ProvenanceChainNode",
|
|
39
|
+
"ProvenanceChainReport",
|
|
29
40
|
"RecoveredSource",
|
|
41
|
+
"analyze_provenance_chain",
|
|
30
42
|
"artifact_matches_manifest",
|
|
31
43
|
"callable_name",
|
|
32
44
|
"capture_git_info",
|
|
33
45
|
"capture_runtime_info",
|
|
34
46
|
"checkout_manifest_source",
|
|
47
|
+
"provenance_chain_is_fresh",
|
|
35
48
|
"directory_content_digest",
|
|
36
49
|
"record_directory_manifest",
|
|
37
50
|
"record_file_manifest",
|