data-annotations 2.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_annotations/__init__.py +2 -0
- data_annotations/_decorators.py +140 -0
- data_annotations/annotations/__init__.py +30 -0
- data_annotations/annotations/decorators.py +147 -0
- data_annotations/annotations/models.py +45 -0
- data_annotations/annotations/writers.py +368 -0
- data_annotations/cli.py +37 -0
- data_annotations/cli_app/__init__.py +1 -0
- data_annotations/cli_app/annotate.py +307 -0
- data_annotations/cli_app/common.py +276 -0
- data_annotations/cli_app/prompts.py +534 -0
- data_annotations/cli_app/provenance_commands.py +107 -0
- data_annotations/description/__init__.py +37 -0
- data_annotations/description/decorators.py +145 -0
- data_annotations/description/models.py +63 -0
- data_annotations/description/writers.py +321 -0
- data_annotations/provenance/__init__.py +37 -0
- data_annotations/provenance/decorators.py +111 -0
- data_annotations/provenance/git.py +121 -0
- data_annotations/provenance/models.py +50 -0
- data_annotations/provenance/recovery.py +473 -0
- data_annotations/provenance/runtime.py +248 -0
- data_annotations/provenance/writers.py +206 -0
- data_annotations-2.1.2.dist-info/METADATA +616 -0
- data_annotations-2.1.2.dist-info/RECORD +28 -0
- data_annotations-2.1.2.dist-info/WHEEL +4 -0
- data_annotations-2.1.2.dist-info/entry_points.txt +3 -0
- data_annotations-2.1.2.dist-info/licenses/LICENSE +28 -0
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
import subprocess
|
|
2
|
+
import re
|
|
3
|
+
from typing import Any
|
|
4
|
+
from urllib.parse import urlsplit, urlunsplit
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
# Matches scp-style git remotes written as ``user@host:path`` and captures the
# three parts as the named groups ``user``, ``host`` and ``path``.
_SCP_LIKE_REMOTE_RE = re.compile(r"^(?P<user>[^@]+)@(?P<host>[^:]+):(?P<path>.+)$")
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _git_output(*args: str, subprocess_module=subprocess) -> str | None:
|
|
11
|
+
try:
|
|
12
|
+
return subprocess_module.check_output(
|
|
13
|
+
["git", *args],
|
|
14
|
+
text=True,
|
|
15
|
+
stderr=subprocess_module.DEVNULL,
|
|
16
|
+
).strip()
|
|
17
|
+
except Exception:
|
|
18
|
+
return None
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _strip_git_suffix(path: str) -> str:
    """Return *path* with a single trailing ``.git`` removed, if present."""
    return path.removesuffix(".git")
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _sanitize_remote_url(remote_url: str | None) -> str | None:
|
|
28
|
+
if not remote_url:
|
|
29
|
+
return None
|
|
30
|
+
|
|
31
|
+
scp_match = _SCP_LIKE_REMOTE_RE.match(remote_url)
|
|
32
|
+
if scp_match is not None:
|
|
33
|
+
host = scp_match.group("host")
|
|
34
|
+
path = _strip_git_suffix(scp_match.group("path").lstrip("/"))
|
|
35
|
+
return f"https://{host}/{path}"
|
|
36
|
+
|
|
37
|
+
parsed = urlsplit(remote_url)
|
|
38
|
+
hostname = parsed.hostname
|
|
39
|
+
if not hostname:
|
|
40
|
+
return remote_url
|
|
41
|
+
|
|
42
|
+
netloc = hostname
|
|
43
|
+
if parsed.port is not None:
|
|
44
|
+
netloc = f"{netloc}:{parsed.port}"
|
|
45
|
+
|
|
46
|
+
path = parsed.path
|
|
47
|
+
if parsed.scheme in {"http", "https"}:
|
|
48
|
+
return urlunsplit((parsed.scheme, netloc, _strip_git_suffix(path), "", ""))
|
|
49
|
+
|
|
50
|
+
if parsed.scheme in {"ssh", "git"} and path:
|
|
51
|
+
return urlunsplit(("https", netloc, _strip_git_suffix(path), "", ""))
|
|
52
|
+
|
|
53
|
+
return urlunsplit((parsed.scheme, netloc, path, parsed.query, parsed.fragment))
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _remote_names(*, git_output_fn) -> list[str]:
    """List the remote names reported by ``git_output_fn("remote")``.

    Args:
        git_output_fn: Callable taking git arguments and returning the command
            output as a string, or a falsy value on failure.

    Returns:
        The non-blank output lines, stripped; an empty list when there is no
        output.
    """
    listing = git_output_fn("remote")
    if not listing:
        return []
    stripped_lines = (raw_line.strip() for raw_line in listing.splitlines())
    return [name for name in stripped_lines if name]
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _upstream_remote_name(*, git_output_fn) -> str | None:
|
|
64
|
+
upstream = git_output_fn(
|
|
65
|
+
"rev-parse",
|
|
66
|
+
"--abbrev-ref",
|
|
67
|
+
"--symbolic-full-name",
|
|
68
|
+
"@{upstream}",
|
|
69
|
+
)
|
|
70
|
+
if not upstream or "/" not in upstream:
|
|
71
|
+
return None
|
|
72
|
+
return upstream.split("/", 1)[0]
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _canonical_remote_name(*, git_output_fn) -> str | None:
    """Pick the remote that best identifies where this repository lives.

    Candidates are tried in order and the first one whose ``git remote
    get-url`` lookup yields a value wins:

    1. the remote of the current branch's upstream,
    2. ``origin``,
    3. the only configured remote, when exactly one exists.

    Args:
        git_output_fn: Callable taking git arguments and returning the command
            output as a string, or a falsy value on failure.

    Returns:
        The chosen remote name, or ``None`` when no candidate qualifies.
    """

    def has_url(remote: str) -> bool:
        return bool(git_output_fn("remote", "get-url", remote))

    tracked_remote = _upstream_remote_name(git_output_fn=git_output_fn)
    if tracked_remote and has_url(tracked_remote):
        return tracked_remote

    if has_url("origin"):
        return "origin"

    configured = _remote_names(git_output_fn=git_output_fn)
    if len(configured) == 1:
        (only_remote,) = configured
        if has_url(only_remote):
            return only_remote

    return None
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def capture_git_info(
    *,
    git_output_fn=None,
    subprocess_module=None,
) -> dict[str, Any]:
    """Collect git metadata for the repository in the current directory.

    Args:
        git_output_fn: Callable taking git arguments and returning the command
            output or ``None``; defaults to ``_git_output``.
        subprocess_module: Object providing ``call`` and ``DEVNULL``, used for
            the dirty check; defaults to the standard ``subprocess`` module.

    Returns:
        A dict with the keys ``git_sha``, ``git_branch``, ``git_dirty``,
        ``git_remote_name`` and ``git_remote_url``.  Values that could not be
        determined are ``None``.
    """
    git_output_fn = _git_output if git_output_fn is None else git_output_fn
    subprocess_module = subprocess if subprocess_module is None else subprocess_module

    sha = git_output_fn("rev-parse", "HEAD")
    branch = git_output_fn("rev-parse", "--abbrev-ref", "HEAD")
    remote_name = _canonical_remote_name(git_output_fn=git_output_fn)
    remote_url = _sanitize_remote_url(
        git_output_fn("remote", "get-url", remote_name) if remote_name else None
    )

    try:
        diff_status = subprocess_module.call(
            ["git", "diff", "--quiet"],
            stdout=subprocess_module.DEVNULL,
            stderr=subprocess_module.DEVNULL,
        )
    except Exception:
        diff_status = None

    # ``git diff --quiet`` exits 0 when there are no differences and 1 when
    # there are.  Any other status is an error (for example when not inside a
    # repository) and must not be reported as a dirty tree.
    if diff_status == 0:
        dirty = False
    elif diff_status == 1:
        dirty = True
    else:
        dirty = None

    return {
        "git_sha": sha,
        "git_branch": branch,
        "git_dirty": dirty,
        "git_remote_name": remote_name,
        "git_remote_url": remote_url,
    }
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
from datetime import datetime
|
|
2
|
+
from typing import Any, Literal
|
|
3
|
+
|
|
4
|
+
from pydantic import BaseModel, Field
|
|
5
|
+
|
|
6
|
+
# Closed set of artifact categories accepted by the manifest models.
ArtifactKind = Literal["plot", "model", "table", "dataset", "report", "other"]
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class ProducedFile(BaseModel):
    """One file entry listed in a directory manifest."""

    # Path of the produced file as recorded in the manifest.
    path: str
    # Artifact category; defaults to "other".
    kind: ArtifactKind = "other"
    # Recorded SHA-256 digest of the file, or None when none was recorded.
    sha256: str | None = None
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class BaseProvenance(BaseModel):
    """Provenance fields shared by the file and directory manifests."""

    # Timestamp of the record.
    created_at: datetime
    # How the record was captured; defaults to "runtime".
    capture_mode: Literal["runtime", "post_hoc"] = "runtime"
    # Presumably the machine and account that produced the artifact --
    # confirm against the code that builds the record.
    hostname: str
    username: str
    # Script that produced the artifact, if recorded.
    script: str | None = None
    # Path of the script inside its repository; the recovery code joins this
    # onto a checkout directory to locate the script.
    script_repo_path: str | None = None
    # Presumably the name of the producing function -- TODO confirm.
    function: str | None = None
    # Command line, parameters and inputs associated with the run; each
    # defaults to an empty container.
    command: list[str] = Field(default_factory=list)
    params: dict[str, Any] = Field(default_factory=dict)
    inputs: list[str] = Field(default_factory=list)
    # Git metadata; the names match the keys returned by capture_git_info.
    git_sha: str | None = None
    git_branch: str | None = None
    git_dirty: bool | None = None
    git_remote_name: str | None = None
    git_remote_url: str | None = None
    # Presumably the SLURM scheduler job id of the run -- TODO confirm.
    slurm_job_id: str | None = None
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class DirectoryManifest(BaseProvenance):
    """Provenance record for a directory of produced files."""

    # Directory the manifest describes.
    output_dir: str
    # Files tracked by the manifest; defaults to an empty list.
    produced_files: list[ProducedFile] = Field(default_factory=list)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class FileManifest(BaseProvenance):
    """Provenance record for a single artifact file."""

    # Path of the artifact the manifest describes.
    artifact_path: str
    # Artifact category; defaults to "other".
    artifact_kind: ArtifactKind = "other"
    # Recorded SHA-256 digest of the artifact, or None when none was recorded.
    artifact_sha256: str | None = None
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class RecoveredSource(BaseModel):
    """Result of checking out the source code referenced by a manifest."""

    # Directory containing the checkout.
    checkout_path: str
    # Location of the recorded script inside the checkout, or None when it
    # was not recorded or not found.
    script_path: str | None = None
    # Commit and remote URL taken from the manifest.
    git_sha: str
    git_remote_url: str
    # Repository-relative script path taken from the manifest, if any.
    script_repo_path: str | None = None
|
|
@@ -0,0 +1,473 @@
|
|
|
1
|
+
import hashlib
|
|
2
|
+
import json
|
|
3
|
+
import os
|
|
4
|
+
import re
|
|
5
|
+
import subprocess
|
|
6
|
+
import sys
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any, Literal
|
|
10
|
+
|
|
11
|
+
from .models import BaseProvenance, DirectoryManifest, FileManifest, RecoveredSource
|
|
12
|
+
from .writers import sha256_file
|
|
13
|
+
|
|
14
|
+
# Possible outcomes of comparing an artifact on disk with its manifest.
MatchStatus = Literal["match", "partial_match", "mismatch", "unverifiable"]
# File names that are never reported as untracked extras of a directory.
_IGNORED_DIRECTORY_EXTRAS = frozenset({"manifest.json", "README.md", "schema.json"})
# File-name suffixes that are never reported as untracked extras of a directory.
_IGNORED_DIRECTORY_SUFFIXES = (".meta.json", ".README.md", ".prov.json", ".schema.json")
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _is_ignored_directory_extra(path: Path) -> bool:
    """Tell whether *path* should be left out of the "extra files" report.

    Only the final path component is inspected: it is ignored when it equals
    one of ``_IGNORED_DIRECTORY_EXTRAS`` or ends with one of
    ``_IGNORED_DIRECTORY_SUFFIXES``.
    """
    file_name = path.name
    if file_name in _IGNORED_DIRECTORY_EXTRAS:
        return True
    return file_name.endswith(_IGNORED_DIRECTORY_SUFFIXES)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass(frozen=True)
class ManifestMatch:
    """Immutable result of comparing an artifact with its manifest."""

    # Which kind of manifest was compared.
    manifest_kind: Literal["file", "directory"]
    # Overall outcome of the comparison.
    status: MatchStatus
    # Tracked entries whose digest equals the recorded one.
    verified_entries: tuple[str, ...] = ()
    # Tracked entries that were not found as files on disk.
    missing_tracked_entries: tuple[str, ...] = ()
    # Tracked entries whose digest differs from the recorded one.
    mismatched_tracked_entries: tuple[str, ...] = ()
    # Files found on disk that the manifest does not track.
    extra_entries: tuple[str, ...] = ()
    # Tracked entries that could not be checked, for example because no
    # digest was recorded.
    unverifiable_tracked_entries: tuple[str, ...] = ()
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _file_annotation_manifest(payload: dict[str, Any]) -> FileManifest:
    """Build a ``FileManifest`` from an annotation document payload.

    The ``provenance`` mapping supplies the base fields; the artifact fields
    come from ``subject`` (``path``, and optionally ``kind`` and ``sha256``)
    and override same-named provenance keys.

    Raises:
        KeyError: If ``subject``, ``provenance`` or ``subject["path"]`` is
            missing.
    """
    subject = payload["subject"]
    provenance = payload["provenance"]
    manifest_fields = {**provenance}
    manifest_fields["artifact_path"] = subject["path"]
    manifest_fields["artifact_kind"] = subject.get("kind", "other")
    manifest_fields["artifact_sha256"] = subject.get("sha256")
    return FileManifest.model_validate(manifest_fields)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _directory_annotation_manifest(payload: dict[str, Any]) -> DirectoryManifest:
    """Build a ``DirectoryManifest`` from an annotation document payload.

    The ``provenance`` mapping supplies the base fields; ``output_dir`` and
    ``produced_files`` come from ``subject`` and override same-named
    provenance keys.

    Raises:
        KeyError: If ``subject``, ``provenance`` or ``subject["path"]`` is
            missing.
    """
    subject = payload["subject"]
    provenance = payload["provenance"]
    manifest_fields = {**provenance}
    manifest_fields["output_dir"] = subject["path"]
    manifest_fields["produced_files"] = subject.get("produced_files", [])
    return DirectoryManifest.model_validate(manifest_fields)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _normalize_allowed_value_payload(payload: dict[str, Any]) -> dict[str, Any]:
    """Return a copy of *payload* using ``summary`` instead of ``description``.

    The rename happens only when ``description`` is present and ``summary``
    is not; the input mapping is never modified.
    """
    renamed = dict(payload)
    if "summary" in renamed or "description" not in renamed:
        return renamed
    renamed["summary"] = renamed.pop("description")
    return renamed
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _normalize_field_definition_payload(payload: dict[str, Any]) -> dict[str, Any]:
    """Return a normalized copy of a field-definition payload.

    ``description`` is renamed to ``summary`` when ``summary`` is absent, and
    ``allowed_values`` is always present in the result, with each entry
    normalized by ``_normalize_allowed_value_payload``.
    """
    field_copy = dict(payload)
    if "summary" not in field_copy and "description" in field_copy:
        field_copy["summary"] = field_copy.pop("description")
    raw_values = field_copy.get("allowed_values", [])
    field_copy["allowed_values"] = [
        _normalize_allowed_value_payload(entry) for entry in raw_values
    ]
    return field_copy
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _normalize_artifact_description_payload(payload: dict[str, Any]) -> dict[str, Any]:
    """Return a normalized copy of an artifact-description payload.

    ``description`` is renamed to ``summary`` when ``summary`` is absent, and
    ``fields`` is always present in the result, with each entry normalized by
    ``_normalize_field_definition_payload``.
    """
    artifact_copy = dict(payload)
    if "summary" not in artifact_copy and "description" in artifact_copy:
        artifact_copy["summary"] = artifact_copy.pop("description")
    raw_fields = artifact_copy.get("fields", [])
    artifact_copy["fields"] = [
        _normalize_field_definition_payload(entry) for entry in raw_fields
    ]
    return artifact_copy
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _normalize_description_payload(payload: dict[str, Any]) -> dict[str, Any]:
    """Return a normalized copy of a description payload.

    ``description`` is renamed to ``summary`` when ``summary`` is absent.
    ``fields`` and ``artifacts`` are always present in the result, with each
    entry normalized by the matching helper.  The input mapping is not
    modified.

    Args:
        payload: The ``description`` section of an annotation document.

    Returns:
        The normalized copy.
    """
    normalized = dict(payload)
    if "description" in normalized and "summary" not in normalized:
        normalized["summary"] = normalized.pop("description")
    # NOTE(review): a former branch here tested ``"description_updated_at" in
    # normalized and "description_updated_at" not in normalized``, which can
    # never be true, and would only have re-assigned the key to itself.  It
    # was removed as dead code; if a legacy timestamp key was meant to be
    # renamed, that key name needs to be confirmed and handled here.
    normalized["fields"] = [
        _normalize_field_definition_payload(field)
        for field in normalized.get("fields", [])
    ]
    normalized["artifacts"] = [
        _normalize_artifact_description_payload(artifact)
        for artifact in normalized.get("artifacts", [])
    ]
    return normalized
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def _normalize_annotation_payload(payload: dict[str, Any]) -> dict[str, Any]:
    """Upgrade an annotation payload to ``annotation_version`` ``"3"``.

    A payload already at version ``"3"`` is returned as-is (the same object).
    Otherwise a shallow copy is returned with the version set to ``"3"`` and,
    when ``description`` is a dict, that section normalized by
    ``_normalize_description_payload``.
    """
    if payload.get("annotation_version") == "3":
        return payload

    upgraded = {**payload, "annotation_version": "3"}
    description_section = upgraded.get("description")
    if isinstance(description_section, dict):
        upgraded["description"] = _normalize_description_payload(description_section)
    return upgraded
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def _load_annotation_document(manifest: str | Path):
    """Read an annotation JSON file and validate it as a document model.

    The file must contain ``annotation_version`` ``"2"`` or ``"3"`` together
    with dict-valued ``subject`` and ``provenance`` sections.  The payload is
    upgraded with ``_normalize_annotation_payload`` before validation.

    Returns:
        A ``DirectoryAnnotationDocument`` when the subject lists
        ``produced_files``, otherwise a ``FileAnnotationDocument``.

    Raises:
        ValueError: If the payload is not a supported annotation document.
    """
    # Imported here rather than at module level, as in the original code.
    from data_annotations.annotations import (
        DirectoryAnnotationDocument,
        FileAnnotationDocument,
    )

    payload = json.loads(Path(manifest).read_text(encoding="utf-8"))
    is_supported = (
        payload.get("annotation_version") in {"2", "3"}
        and isinstance(payload.get("subject"), dict)
        and isinstance(payload.get("provenance"), dict)
    )
    if not is_supported:
        raise ValueError("annotation payload is not a supported annotation document")

    payload = _normalize_annotation_payload(payload)
    if "produced_files" in payload["subject"]:
        document_model = DirectoryAnnotationDocument
    else:
        document_model = FileAnnotationDocument
    return document_model.model_validate(payload)
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def _parse_manifest_payload(payload: dict) -> FileManifest | DirectoryManifest:
    """Turn a decoded JSON payload into a file or directory manifest.

    Two layouts are recognised:

    * annotation documents (version ``"2"``/``"3"`` with dict-valued
      ``subject`` and ``provenance``), classified by whether the subject has
      ``produced_files`` or ``path``;
    * flat provenance payloads, classified by a top-level ``artifact_path``
      or ``output_dir`` key.

    Raises:
        ValueError: If the payload fits neither layout.
    """
    looks_like_annotation = (
        payload.get("annotation_version") in {"2", "3"}
        and isinstance(payload.get("subject"), dict)
        and isinstance(payload.get("provenance"), dict)
    )
    if looks_like_annotation:
        subject = payload["subject"]
        if "produced_files" in subject:
            return _directory_annotation_manifest(payload)
        if "path" in subject:
            return _file_annotation_manifest(payload)

    # Fall back to the flat layout; the file marker is checked first.
    flat_layouts = (
        ("artifact_path", FileManifest),
        ("output_dir", DirectoryManifest),
    )
    for marker_key, manifest_model in flat_layouts:
        if marker_key in payload:
            return manifest_model.model_validate(payload)

    raise ValueError("annotation payload is not a supported provenance document")
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def _load_manifest(
    manifest: str | Path | BaseProvenance,
) -> FileManifest | DirectoryManifest:
    """Return *manifest* as a ``FileManifest`` or ``DirectoryManifest``.

    Manifest instances are passed through unchanged; anything else is treated
    as a path to a UTF-8 JSON file and parsed with
    ``_parse_manifest_payload``.

    Raises:
        ValueError: If given a ``BaseProvenance`` that is neither a file nor
            a directory manifest, or if the file content is unsupported.
    """
    if isinstance(manifest, (FileManifest, DirectoryManifest)):
        return manifest
    if isinstance(manifest, BaseProvenance):
        raise ValueError("manifest must be a file or directory provenance manifest")

    raw_text = Path(manifest).read_text(encoding="utf-8")
    return _parse_manifest_payload(json.loads(raw_text))
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def _path_label(path: Path) -> str:
    """Render *path* as text using forward slashes as separators."""
    posix_text = path.as_posix()
    return posix_text
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def _tracked_relative_path(
|
|
178
|
+
*,
|
|
179
|
+
produced_path: str | Path,
|
|
180
|
+
output_dir: str | Path,
|
|
181
|
+
) -> Path | None:
|
|
182
|
+
produced_path = Path(produced_path)
|
|
183
|
+
output_dir = Path(output_dir)
|
|
184
|
+
|
|
185
|
+
if produced_path.is_absolute():
|
|
186
|
+
try:
|
|
187
|
+
return produced_path.relative_to(output_dir)
|
|
188
|
+
except ValueError:
|
|
189
|
+
return None
|
|
190
|
+
|
|
191
|
+
try:
|
|
192
|
+
return produced_path.relative_to(output_dir)
|
|
193
|
+
except ValueError:
|
|
194
|
+
return produced_path
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def _analyze_file_match(
    artifact_path: Path,
    manifest: FileManifest,
) -> ManifestMatch:
    """Compare a single file with the digest recorded in its manifest.

    Returns:
        A ``ManifestMatch`` of kind ``"file"`` whose status is
        ``"unverifiable"`` when no digest was recorded, ``"match"`` when the
        file's SHA-256 equals it, and ``"mismatch"`` otherwise.  The file
        name is listed under the corresponding entries field.
    """
    tracked = (artifact_path.name,)
    recorded_digest = manifest.artifact_sha256

    if recorded_digest is None:
        return ManifestMatch(
            manifest_kind="file",
            status="unverifiable",
            unverifiable_tracked_entries=tracked,
        )

    digests_agree = sha256_file(artifact_path) == recorded_digest
    if digests_agree:
        return ManifestMatch(
            manifest_kind="file",
            status="match",
            verified_entries=tracked,
        )

    return ManifestMatch(
        manifest_kind="file",
        status="mismatch",
        mismatched_tracked_entries=tracked,
    )
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def _analyze_directory_match(
    artifact_path: Path,
    manifest: DirectoryManifest,
) -> ManifestMatch:
    """Compare the files under a directory with a directory manifest.

    Each tracked file is classified as verified, missing, mismatched or
    unverifiable.  Files on disk that the manifest does not track are
    reported as extras unless ``_is_ignored_directory_extra`` excludes them.

    Status rules:

    * ``"mismatch"`` when any tracked file has a different digest;
    * otherwise ``"unverifiable"`` when nothing could be verified;
    * otherwise ``"partial_match"`` when something is missing, extra or
      unverifiable;
    * otherwise ``"match"``.

    Returns:
        A ``ManifestMatch`` of kind ``"directory"`` with sorted entry tuples.
    """
    verified: list[str] = []
    missing: list[str] = []
    mismatched: list[str] = []
    unverifiable: list[str] = []
    tracked: set[Path] = set()

    for entry in manifest.produced_files:
        relative = _tracked_relative_path(
            produced_path=entry.path,
            output_dir=manifest.output_dir,
        )
        if relative is None:
            # The recorded path cannot be mapped into the directory.
            unverifiable.append(str(entry.path))
            continue

        tracked.add(relative)
        on_disk = artifact_path / relative
        label = _path_label(relative)

        if not (on_disk.exists() and on_disk.is_file()):
            missing.append(label)
        elif entry.sha256 is None:
            unverifiable.append(label)
        elif sha256_file(on_disk) == entry.sha256:
            verified.append(label)
        else:
            mismatched.append(label)

    present = {
        found.relative_to(artifact_path)
        for found in artifact_path.rglob("*")
        if found.is_file()
    }
    extras = sorted(
        _path_label(untracked)
        for untracked in present - tracked
        if not _is_ignored_directory_extra(untracked)
    )

    verified.sort()
    missing.sort()
    mismatched.sort()
    unverifiable.sort()

    status: MatchStatus
    if mismatched:
        status = "mismatch"
    elif not verified:
        status = "unverifiable"
    elif missing or extras or unverifiable:
        status = "partial_match"
    else:
        status = "match"

    return ManifestMatch(
        manifest_kind="directory",
        status=status,
        verified_entries=tuple(verified),
        missing_tracked_entries=tuple(missing),
        mismatched_tracked_entries=tuple(mismatched),
        extra_entries=tuple(extras),
        unverifiable_tracked_entries=tuple(unverifiable),
    )
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
def _analyze_artifact_match(
    artifact_path: str | Path,
    manifest: str | Path | BaseProvenance,
) -> ManifestMatch:
    """Load *manifest* and compare it with the artifact at *artifact_path*.

    The artifact path is expanded and resolved first.  A file manifest is
    compared with ``_analyze_file_match`` and a directory manifest with
    ``_analyze_directory_match``.

    Raises:
        ValueError: If a file manifest is paired with a directory target, or
            a directory manifest with a non-directory target.
    """
    loaded = _load_manifest(manifest)
    target = Path(artifact_path).expanduser().resolve()
    target_is_dir = target.is_dir()

    if isinstance(loaded, FileManifest):
        if target_is_dir:
            raise ValueError("file manifest matching requires a file target")
        return _analyze_file_match(target, loaded)

    if target_is_dir:
        return _analyze_directory_match(target, loaded)
    raise ValueError("directory manifest matching requires a directory target")
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
def artifact_matches_manifest(
    artifact_path: str | Path,
    manifest: str | Path | FileManifest | DirectoryManifest,
) -> bool:
    """Report whether an artifact agrees with its manifest.

    Returns:
        ``True`` when the comparison status is ``"match"`` or
        ``"partial_match"``, ``False`` otherwise.
    """
    outcome = _analyze_artifact_match(artifact_path, manifest)
    accepted_statuses = ("match", "partial_match")
    return outcome.status in accepted_statuses
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
def _safe_remote_slug(remote_url: str) -> str:
    """Derive a directory-name-friendly slug from a remote URL.

    The URL loses a trailing ``.git`` and trailing slashes, backslashes become
    forward slashes, and every run of characters outside ``A-Za-z0-9._-`` is
    collapsed to one ``-``.  The last 64 characters of that text (``repo``
    when nothing is left) are joined with the first 8 hex digits of the SHA-1
    of the original URL.
    """
    cleaned = remote_url.removesuffix(".git").rstrip("/").replace("\\", "/")
    collapsed = re.sub(r"[^A-Za-z0-9._-]+", "-", cleaned).strip("-")
    slug = collapsed if collapsed else "repo"
    fingerprint = hashlib.sha1(remote_url.encode("utf-8")).hexdigest()
    return f"{slug[-64:]}-{fingerprint[:8]}"
|
|
326
|
+
|
|
327
|
+
|
|
328
|
+
def _checkout_cache_root(
|
|
329
|
+
*,
|
|
330
|
+
env: dict[str, str] | os._Environ[str] | None = None,
|
|
331
|
+
home: Path | None = None,
|
|
332
|
+
platform: str | None = None,
|
|
333
|
+
) -> Path:
|
|
334
|
+
env = os.environ if env is None else env
|
|
335
|
+
home = Path.home() if home is None else home
|
|
336
|
+
platform = sys.platform if platform is None else platform
|
|
337
|
+
|
|
338
|
+
if platform == "win32":
|
|
339
|
+
base_dir = Path(env.get("LOCALAPPDATA", home / "AppData" / "Local"))
|
|
340
|
+
elif platform == "darwin":
|
|
341
|
+
base_dir = home / "Library" / "Caches"
|
|
342
|
+
else:
|
|
343
|
+
base_dir = Path(env.get("XDG_CACHE_HOME", home / ".cache"))
|
|
344
|
+
|
|
345
|
+
return base_dir.expanduser().resolve() / "data-annotations" / "checkouts"
|
|
346
|
+
|
|
347
|
+
|
|
348
|
+
def _default_checkout_path(
    *,
    git_remote_url: str,
    git_sha: str,
    env: dict[str, str] | os._Environ[str] | None = None,
    home: Path | None = None,
    platform: str | None = None,
) -> Path:
    """Return the default cache location for a checkout of a given commit.

    The result is ``<cache root>/<remote slug>/<git_sha>``, where the cache
    root comes from ``_checkout_cache_root`` and the slug from
    ``_safe_remote_slug``.
    """
    cache_root = _checkout_cache_root(env=env, home=home, platform=platform)
    repository_dir = cache_root / _safe_remote_slug(git_remote_url)
    return repository_dir / git_sha
|
|
361
|
+
|
|
362
|
+
|
|
363
|
+
def _git_output(*args: str) -> str | None:
|
|
364
|
+
try:
|
|
365
|
+
return subprocess.check_output(
|
|
366
|
+
["git", *args],
|
|
367
|
+
text=True,
|
|
368
|
+
stderr=subprocess.DEVNULL,
|
|
369
|
+
).strip()
|
|
370
|
+
except Exception:
|
|
371
|
+
return None
|
|
372
|
+
|
|
373
|
+
|
|
374
|
+
def _checkout_matches_recorded_state(
    checkout_path: Path,
    *,
    git_sha: str,
    git_remote_url: str,
) -> bool:
    """Check whether an existing checkout is at the recorded commit and remote.

    Returns:
        ``False`` when *checkout_path* is not a directory.  Otherwise ``True``
        only if the checkout's ``HEAD`` equals *git_sha* and its
        ``remote.origin.url`` equals *git_remote_url*.
    """
    if not checkout_path.is_dir():
        return False

    repository = str(checkout_path)
    head_sha = _git_output("-C", repository, "rev-parse", "HEAD")
    origin_url = _git_output(
        "-C", repository, "config", "--get", "remote.origin.url"
    )
    return head_sha == git_sha and origin_url == git_remote_url
|
|
388
|
+
|
|
389
|
+
|
|
390
|
+
def _prepare_checkout_path(
    checkout_path: Path,
    *,
    git_sha: str,
    git_remote_url: str,
) -> tuple[Path, bool]:
    """Decide whether a checkout destination can be reused or cloned into.

    Returns:
        ``(resolved_path, reused)``.  ``reused`` is ``True`` when the
        destination already holds a checkout at the recorded commit and
        remote.  It is ``False`` when the destination is missing (its parent
        directories are then created) or is an empty directory.

    Raises:
        ValueError: If the destination exists, does not match the recorded
            state and is not an empty directory.
    """
    target = checkout_path.expanduser().resolve()

    if not target.exists():
        target.parent.mkdir(parents=True, exist_ok=True)
        return target, False

    already_matches = _checkout_matches_recorded_state(
        target,
        git_sha=git_sha,
        git_remote_url=git_remote_url,
    )
    if already_matches:
        return target, True

    if target.is_dir() and not any(target.iterdir()):
        return target, False

    raise ValueError(
        f"checkout destination already exists and does not match the recorded repository state: {target}"
    )
|
|
415
|
+
|
|
416
|
+
|
|
417
|
+
def checkout_manifest_source(
    manifest: str | Path | BaseProvenance,
    *,
    destination_dir: str | Path | None = None,
) -> RecoveredSource:
    """Check out the source revision recorded in a manifest.

    The repository at the manifest's ``git_remote_url`` is cloned into the
    destination and ``git_sha`` is checked out with a detached HEAD.  An
    existing checkout that already matches the recorded state is reused.

    Args:
        manifest: A manifest instance or a path to a manifest JSON file.
        destination_dir: Where to place the checkout; defaults to a cache
            location derived from the remote URL and commit.

    Returns:
        A ``RecoveredSource`` describing the checkout.  ``script_path`` is set
        only when the manifest records ``script_repo_path`` and that path
        exists inside the checkout.

    Raises:
        ValueError: If the manifest lacks ``git_remote_url`` or ``git_sha``,
            if ``git_sha`` looks like a command-line option, or if the
            destination exists in a non-matching state.
        subprocess.CalledProcessError: If ``git clone`` or ``git checkout``
            fails.
    """
    loaded_manifest = _load_manifest(manifest)
    git_remote_url = loaded_manifest.git_remote_url
    git_sha = loaded_manifest.git_sha
    if git_remote_url is None:
        raise ValueError("manifest does not record git_remote_url")
    if git_sha is None:
        raise ValueError("manifest does not record git_sha")
    # Manifest content ends up on git's command line; a value starting with
    # "-" would be parsed as an option rather than as a revision.
    if git_sha.startswith("-"):
        raise ValueError(f"manifest records an invalid git_sha: {git_sha!r}")

    checkout_path = (
        _default_checkout_path(
            git_remote_url=git_remote_url,
            git_sha=git_sha,
        )
        if destination_dir is None
        else Path(destination_dir).expanduser().resolve()
    )
    checkout_path, reused = _prepare_checkout_path(
        checkout_path,
        git_sha=git_sha,
        git_remote_url=git_remote_url,
    )

    if not reused:
        subprocess.check_call(
            # "--" ends option parsing so the URL is always taken as the
            # repository argument, even if it begins with "-".
            ["git", "clone", "--", git_remote_url, str(checkout_path)],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        subprocess.check_call(
            [
                "git",
                "-C",
                str(checkout_path),
                "checkout",
                "--detach",
                git_sha,
            ],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )

    script_path: str | None = None
    if loaded_manifest.script_repo_path is not None:
        candidate = checkout_path / loaded_manifest.script_repo_path
        if candidate.exists():
            script_path = str(candidate.resolve())

    return RecoveredSource(
        checkout_path=str(checkout_path.resolve()),
        script_path=script_path,
        git_sha=git_sha,
        git_remote_url=git_remote_url,
        script_repo_path=loaded_manifest.script_repo_path,
    )
|