data-annotations 2.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,534 @@
1
+ import re
2
+ import shlex
3
+ from collections.abc import Sequence
4
+ from pathlib import Path
5
+ from typing import Any, Callable
6
+
7
+ import questionary
8
+ import typer
9
+ from questionary import Choice
10
+
11
+ from data_annotations.provenance.models import ArtifactKind
12
+ from data_annotations.description import FieldDefinition
13
+ from data_annotations.provenance import git as provenance_git
14
+ from data_annotations.provenance import runtime as provenance_runtime
15
+
16
+ from .common import (
17
+ _ARTIFACT_KINDS,
18
+ _UNKNOWN_SELECTION,
19
+ _collect_params,
20
+ _is_current_annotation_command,
21
+ _normalize_remote_url,
22
+ _parse_command_string,
23
+ _parse_param_entry,
24
+ _validate_artifact_kind,
25
+ )
26
+
27
+
28
+ def _ask_question(result: Any) -> Any:
29
+ if result is None:
30
+ raise typer.Abort()
31
+ return result
32
+
33
+
34
def _questionary_text(
    message: str,
    *,
    default: str = "",
    instruction: str | None = None,
) -> str:
    """Ask a free-text question and return the answer.

    Args:
        message: Prompt text shown to the user.
        default: Text pre-filled in the prompt.
        instruction: Optional hint forwarded to the prompt.

    Raises:
        typer.Abort: Via ``_ask_question`` when the prompt yields ``None``.
    """
    prompt = questionary.text(message, default=default, instruction=instruction)
    answer = prompt.ask()
    return _ask_question(answer)
47
+
48
+
49
def _questionary_select(
    message: str,
    choices: Sequence[str | Choice],
    *,
    default: str | Choice | None = None,
    instruction: str | None = None,
) -> Any:
    """Ask a single-choice question and return the selected value.

    The search filter of the select prompt is explicitly disabled.

    Args:
        message: Prompt text shown to the user.
        choices: Options to pick from.
        default: Option passed to the prompt as its default.
        instruction: Optional hint forwarded to the prompt.

    Raises:
        typer.Abort: Via ``_ask_question`` when the prompt yields ``None``.
    """
    prompt = questionary.select(
        message,
        choices=choices,
        default=default,
        instruction=instruction,
        use_search_filter=False,
    )
    answer = prompt.ask()
    return _ask_question(answer)
65
+
66
+
67
def _questionary_confirm(
    message: str,
    *,
    default: bool = True,
    instruction: str | None = None,
) -> bool:
    """Ask a yes/no question and return the answer as a ``bool``.

    Args:
        message: Prompt text shown to the user.
        default: Answer passed to the prompt as its default.
        instruction: Optional hint forwarded to the prompt.

    Raises:
        typer.Abort: Via ``_ask_question`` when the prompt yields ``None``.
    """
    prompt = questionary.confirm(message, default=default, instruction=instruction)
    answer = _ask_question(prompt.ask())
    return bool(answer)
82
+
83
+
84
def _questionary_checkbox(
    message: str,
    choices: list[str | Choice],
    *,
    instruction: str | None = None,
) -> list[Any]:
    """Ask a multi-select question and return the selected values as a list.

    Args:
        message: Prompt text shown to the user.
        choices: Options that can be toggled.
        instruction: Optional hint forwarded to the prompt.

    Raises:
        typer.Abort: Via ``_ask_question`` when the prompt yields ``None``.
    """
    prompt = questionary.checkbox(message, choices=choices, instruction=instruction)
    selected = _ask_question(prompt.ask())
    return list(selected)
99
+
100
+
101
def _prompt_required_text(label: str, *, default: str | None = None) -> str:
    """Prompt repeatedly until the user enters non-blank text.

    Args:
        label: Prompt text shown to the user.
        default: Text pre-filled on the first attempt only; after a blank
            answer the prompt is shown again without a pre-filled value.

    Returns:
        The answer with surrounding whitespace stripped.
    """
    prefill = default or ""
    while not (answer := _questionary_text(label, default=prefill).strip()):
        typer.secho("Please enter a value.", err=True, fg=typer.colors.RED)
        # Do not offer the rejected default again.
        prefill = ""
    return answer
108
+
109
+
110
def _prompt_optional_text(label: str, *, default: str | None = None) -> str | None:
    """Prompt for optional text, returning ``None`` for a blank answer.

    Args:
        label: Prompt text shown to the user.
        default: Text pre-filled in the prompt, if any.
    """
    prefill = default if default else ""
    answer = _questionary_text(label, default=prefill)
    cleaned = answer.strip()
    if cleaned:
        return cleaned
    return None
113
+
114
+
115
def _prompt_optional_text_no_default(label: str) -> str | None:
    """Prompt for optional text with no pre-filled value.

    Returns the stripped answer, or ``None`` when the answer is blank.
    """
    cleaned = _questionary_text(label).strip()
    if not cleaned:
        return None
    return cleaned
118
+
119
+
120
def _prompt_artifact_kind(
    label: str,
    *,
    default: ArtifactKind = "other",
) -> ArtifactKind:
    """Let the user pick an artifact kind from ``_ARTIFACT_KINDS``.

    The selection is converted to ``str`` and passed through
    ``_validate_artifact_kind`` before being returned.
    """
    selection = _questionary_select(
        label,
        _ARTIFACT_KINDS,
        default=default,
        instruction="Choose the artifact type.",
    )
    kind_text = str(selection)
    return _validate_artifact_kind(kind_text)
135
+
136
+
137
def _prompt_optional_bool(label: str) -> bool | None:
    """Ask a three-way Unknown/Yes/No question.

    Returns:
        ``True`` or ``False`` for an explicit answer, ``None`` when the user
        keeps the "Unknown" option (which is also the default).
    """
    options = [
        Choice("Unknown", value=_UNKNOWN_SELECTION),
        Choice("Yes", value=True),
        Choice("No", value=False),
    ]
    answer = _questionary_select(
        label,
        options,
        default=_UNKNOWN_SELECTION,
        instruction="Choose a value or leave it unknown.",
    )
    if answer == _UNKNOWN_SELECTION:
        return None
    return answer
149
+
150
+
151
def _echo_provenance_prompt_intro() -> None:
    """Print the short introduction shown before the provenance questions."""
    intro_lines = (
        "",  # leading blank line separates the intro from earlier output
        "Provenance questions: optional fields may be skipped.",
        "Where a detected value is available, you can reuse it or enter a custom one.",
    )
    typer.echo("\n".join(intro_lines))
157
+
158
+
159
def _prompt_override_text(label: str, *, current: str | None) -> str | None:
    """Prompt for a text field that may have a detected value.

    Without a detected value the user is asked for optional free text. With
    one, the user chooses between leaving the field unknown (the default),
    reusing the detected value, or typing a custom value.

    Args:
        label: Prompt text shown to the user.
        current: Detected value, or ``None``/empty when nothing was detected.

    Returns:
        The chosen text, or ``None`` when the field is left unknown or blank.
    """
    if current:
        options = [
            Choice("Leave unknown", value="unknown"),
            Choice(f"Use detected value: {current}", value="detected"),
            Choice("Enter custom value", value="custom"),
        ]
        action = _questionary_select(
            label,
            options,
            default="unknown",
            instruction="Select how to fill this field.",
        )
        if action == "unknown":
            return None
        if action == "detected":
            return current
        return _prompt_optional_text_no_default(f"{label} (custom value)")

    return _prompt_optional_text_no_default(label)
177
+
178
+
179
def _prompt_override_bool(label: str, *, current: bool | None) -> bool | None:
    """Prompt for a boolean field that may have a detected value.

    The options are "Leave unknown" (default), an optional "Use detected
    value" entry that is only offered when ``current`` is not ``None``, and
    explicit "Yes"/"No".

    Returns:
        ``current`` when the detected option is picked, ``None`` when the
        field is left unknown, otherwise the explicit ``True``/``False``.
    """
    detected_options: list[Choice] = []
    if current is not None:
        detected_text = "yes" if current else "no"
        detected_options.append(
            Choice(f"Use detected value: {detected_text}", value="detected")
        )

    options: list[Choice] = [
        Choice("Leave unknown", value=_UNKNOWN_SELECTION),
        *detected_options,
        Choice("Yes", value=True),
        Choice("No", value=False),
    ]
    answer = _questionary_select(
        label,
        options,
        default=_UNKNOWN_SELECTION,
        instruction="Select how to fill this field.",
    )

    if answer == "detected":
        return current
    return None if answer == _UNKNOWN_SELECTION else answer
203
+
204
+
205
+ def _looks_like_script_path(token: str) -> bool:
206
+ if not token:
207
+ return False
208
+ if "/" in token or "\\" in token:
209
+ return True
210
+ if token.startswith("."):
211
+ return True
212
+ return Path(token).suffix.lower() in {
213
+ ".py",
214
+ ".ipynb",
215
+ ".sh",
216
+ ".bash",
217
+ ".zsh",
218
+ ".r",
219
+ ".rmd",
220
+ }
221
+
222
+
223
+ def _unwrap_invocation_script_token(command: list[str]) -> str | None:
224
+ tokens = list(command)
225
+ if not tokens:
226
+ return None
227
+
228
+ if len(tokens) == 1:
229
+ return tokens[0]
230
+
231
+ if tokens[:2] == ["uv", "run"]:
232
+ tokens = tokens[2:]
233
+ if not tokens:
234
+ return None
235
+
236
+ runner = Path(tokens[0]).name
237
+ if re.fullmatch(r"python(\d+(\.\d+)?)?", runner):
238
+ index = 1
239
+ while index < len(tokens):
240
+ token = tokens[index]
241
+ if token == "-m":
242
+ return None
243
+ if token == "-c":
244
+ return None
245
+ if token.startswith("-"):
246
+ index += 1
247
+ continue
248
+ return token if _looks_like_script_path(token) else None
249
+ return None
250
+
251
+ if runner in {"bash", "sh", "zsh", "Rscript"}:
252
+ index = 1
253
+ while index < len(tokens):
254
+ token = tokens[index]
255
+ if token.startswith("-"):
256
+ index += 1
257
+ continue
258
+ return token if _looks_like_script_path(token) else None
259
+ return None
260
+
261
+ return tokens[0] if _looks_like_script_path(tokens[0]) else None
262
+
263
+
264
+ def _infer_invocation_provenance(
265
+ value: str | None,
266
+ *,
267
+ infer_script_repo_path_fn: Callable[[str | Path | None], str | None] | None = None,
268
+ ) -> tuple[list[str], str | None, str | None]:
269
+ if value is None:
270
+ return [], None, None
271
+
272
+ stripped = value.strip()
273
+ if not stripped:
274
+ return [], None, None
275
+
276
+ command = _parse_command_string(stripped)
277
+ if not command:
278
+ return [], None, None
279
+
280
+ script = _unwrap_invocation_script_token(command)
281
+ infer_script_repo_path_fn = (
282
+ provenance_runtime.infer_script_repo_path
283
+ if infer_script_repo_path_fn is None
284
+ else infer_script_repo_path_fn
285
+ )
286
+ script_repo_path = infer_script_repo_path_fn(script) if script is not None else None
287
+ return command, script, script_repo_path
288
+
289
+
290
def _prompt_invocation_override(
    *,
    current_command: list[str],
    current_script: str | None,
    current_script_repo_path: str | None,
) -> tuple[list[str], str | None, str | None]:
    """Ask how the generating command/script should be recorded.

    With no detected command the user is asked for free text, which is then
    run through ``_infer_invocation_provenance``. Otherwise the user chooses
    between leaving it unknown (default), reusing the detected invocation,
    or entering a custom value.

    Returns:
        ``(command, script, script_repo_path)``.
    """
    if not current_command:
        typed = _prompt_optional_text_no_default(
            "Original generating command or script path"
        )
        return _infer_invocation_provenance(typed)

    # Render the detected command as a single shell-quoted string.
    detected_text = shlex.join(current_command)
    options = [
        Choice("Leave unknown", value="unknown"),
        Choice(f"Use detected value: {detected_text}", value="detected"),
        Choice("Enter custom value", value="custom"),
    ]
    action = _questionary_select(
        "Original generating command or script path",
        options,
        default="unknown",
        instruction="Select how to fill this field.",
    )

    if action == "unknown":
        return [], None, None
    if action == "detected":
        return current_command, current_script, current_script_repo_path

    typed = _prompt_optional_text_no_default(
        "Original generating command or script path (custom value)"
    )
    return _infer_invocation_provenance(typed)
323
+
324
+
325
+ def _prompt_directory_file_selection(candidates: list[Path]) -> list[Path]:
326
+ if not candidates:
327
+ return []
328
+ selected = _questionary_checkbox(
329
+ "Select files to include in this directory annotation",
330
+ [
331
+ Choice(candidate.name, value=str(candidate), checked=True)
332
+ for candidate in candidates
333
+ ],
334
+ instruction="Use space to toggle files, then press Enter to continue.",
335
+ )
336
+ selected_paths = {str(Path(value).resolve()) for value in selected}
337
+ return [
338
+ candidate
339
+ for candidate in candidates
340
+ if str(candidate.resolve()) in selected_paths
341
+ ]
342
+
343
+
344
def _prompt_inputs() -> list[str]:
    """Collect input paths/URIs one at a time until the user declines.

    Each round first asks a yes/no question (default "no") and, on "yes",
    requires a non-blank path or URI.
    """
    collected: list[str] = []
    while True:
        wants_another = _questionary_confirm(
            "Were there input files or directories used to generate this artifact?",
            default=False,
            instruction="(y/N) Inputs can be local paths or URIs. ",
        )
        if not wants_another:
            return collected
        collected.append(_prompt_required_text("Input path or URI"))
353
+
354
+
355
def _prompt_params() -> dict[str, Any]:
    """Collect generation parameters one at a time until the user declines.

    Each entry is read as text and split into key and value by
    ``_parse_param_entry``; a repeated key overwrites the earlier value.
    """
    collected: dict[str, Any] = {}
    while True:
        wants_another = _questionary_confirm(
            "Were there function parameters used to generate this artifact? ",
            default=False,
            instruction="(y/N) Parameters should be entered as KEY=VALUE. ",
        )
        if not wants_another:
            return collected
        entry = _prompt_required_text("Generation parameter (KEY=VALUE)")
        name, parsed = _parse_param_entry(entry)
        collected[name] = parsed
367
+
368
+
369
+ def _prompt_primary_key(field_names: list[str]) -> list[str]:
370
+ if not field_names:
371
+ return []
372
+ return [
373
+ str(value)
374
+ for value in _questionary_checkbox(
375
+ "Select primary key fields",
376
+ [Choice(name, value=name) for name in field_names],
377
+ instruction="Leave everything unchecked if there is no primary key.",
378
+ )
379
+ ]
380
+
381
+
382
def _prompt_missing_value_codes() -> dict[str, str]:
    """Collect ``CODE=MEANING`` pairs until the user declines to add more.

    An entry without ``=`` or with a blank code/meaning is rejected with an
    error message, after which the user is asked again whether to add one.
    The entry is split on its first ``=`` only.
    """
    collected: dict[str, str] = {}
    while _questionary_confirm("Add a missing value code?", default=False):
        entry = _prompt_required_text("Missing value code (CODE=MEANING)")
        raw_code, separator, raw_meaning = entry.partition("=")
        if not separator:
            typer.secho(
                "Please enter missing value codes as CODE=MEANING.",
                err=True,
                fg=typer.colors.RED,
            )
            continue

        code, meaning = raw_code.strip(), raw_meaning.strip()
        if not (code and meaning):
            typer.secho(
                "Both the missing value code and its meaning are required.",
                err=True,
                fg=typer.colors.RED,
            )
            continue

        collected[code] = meaning
    return collected
408
+
409
+
410
def _prompt_field_definitions() -> list[FieldDefinition]:
    """Collect field definitions until the user declines to add more.

    For each field the prompts run in this order: name, summary (both
    required), data type (optional), then the tri-state "required" and
    "nullable" questions.
    """
    collected: list[FieldDefinition] = []
    while _questionary_confirm("Add a field definition?", default=False):
        field_name = _prompt_required_text("Field name")
        summary = _prompt_required_text(f"Summary for {field_name}")
        data_type = _prompt_optional_text(f"Data type for {field_name}")
        required = _prompt_optional_bool(f"Required for {field_name}")
        nullable = _prompt_optional_bool(f"Nullable for {field_name}")
        definition = FieldDefinition(
            name=field_name,
            summary=summary,
            data_type=data_type,
            required=required,
            nullable=nullable,
        )
        collected.append(definition)
    return collected
427
+
428
+
429
def _prompt_schema_details() -> tuple[list[FieldDefinition], list[str], dict[str, str]]:
    """Optionally collect field definitions, primary key and missing-value codes.

    Returns:
        ``(fields, primary_key, missing_value_codes)``; all empty when the
        user declines to add schema details (the default answer).
    """
    wants_schema = _questionary_confirm(
        "Add field-level schema details?", default=False
    )
    if wants_schema:
        fields = _prompt_field_definitions()
        field_names = [definition.name for definition in fields]
        primary_key = _prompt_primary_key(field_names)
        missing_value_codes = _prompt_missing_value_codes()
        return fields, primary_key, missing_value_codes
    return [], [], {}
436
+
437
+
438
def _collect_post_hoc_provenance(
    *,
    input_values: list[str] | None,
    param_values: list[str] | None,
    script: str | None,
    script_repo_path: str | None,
    command: str | None,
    function: str | None,
    git_sha: str | None,
    git_branch: str | None,
    git_remote_name: str | None,
    git_remote_url: str | None,
    git_dirty: bool | None,
) -> tuple[list[str], dict[str, Any], dict[str, Any]]:
    """Gather provenance after the fact, prompting for whatever was not supplied.

    Explicit arguments always win. For every value that is ``None`` (or, for
    inputs and parameters, empty) the user is prompted, with detected
    runtime/git values offered where available.

    Prompt order: invocation (command/script), function, inputs, parameters,
    git SHA, git branch, git remote name, git remote URL, git dirty state.

    Returns:
        ``(inputs, params, overrides)`` where ``overrides`` holds the keys
        ``command``, ``script``, ``script_repo_path``, ``function``,
        ``git_sha``, ``git_branch``, ``git_remote_name``, ``git_remote_url``
        and ``git_dirty``.
    """
    runtime_info = provenance_runtime.capture_runtime_info()
    git_info = provenance_git.capture_git_info()
    _echo_provenance_prompt_intro()

    def _git_text(explicit: str | None, label: str, info_key: str) -> str | None:
        # Explicit values win; otherwise prompt, offering the detected value.
        if explicit is not None:
            return explicit
        return _prompt_override_text(label, current=git_info.get(info_key))

    overrides: dict[str, Any] = {}

    # The detected command is dropped when it is the annotation command itself.
    detected_command = runtime_info.get("command") or []
    if _is_current_annotation_command(detected_command):
        detected_command = []

    nothing_supplied = (
        script is None and script_repo_path is None and command is None
    )
    if nothing_supplied:
        prompted_command, prompted_script, prompted_repo_path = (
            _prompt_invocation_override(
                current_command=detected_command,
                current_script=runtime_info.get("script"),
                current_script_repo_path=runtime_info.get("script_repo_path"),
            )
        )
        overrides["command"] = prompted_command
        overrides["script"] = prompted_script
        overrides["script_repo_path"] = prompted_repo_path
    else:
        # Any explicit invocation detail suppresses the invocation prompt.
        overrides["script"] = script
        overrides["script_repo_path"] = script_repo_path
        overrides["command"] = (
            [] if command is None else _parse_command_string(command)
        )

    function_value: str | None = function
    if function_value is None:
        function_value = _prompt_override_text(
            "Generating function within the script", current=None
        )
    overrides["function"] = function_value

    if input_values:
        inputs = list(input_values)
    else:
        inputs = _prompt_inputs()

    if param_values:
        params = _collect_params(param_values)
    else:
        params = _prompt_params()

    overrides["git_sha"] = _git_text(git_sha, "Git commit SHA", "git_sha")
    overrides["git_branch"] = _git_text(git_branch, "Git branch", "git_branch")
    overrides["git_remote_name"] = _git_text(
        git_remote_name, "Git remote name", "git_remote_name"
    )

    # The remote URL is normalised whether it was passed in or prompted for.
    git_remote_url_value: str | None
    if git_remote_url is None:
        git_remote_url_value = _prompt_override_text(
            "Git remote URL",
            current=git_info.get("git_remote_url"),
        )
        if git_remote_url_value is not None:
            git_remote_url_value = _normalize_remote_url(str(git_remote_url_value))
    else:
        git_remote_url_value = _normalize_remote_url(git_remote_url)
    overrides["git_remote_url"] = git_remote_url_value

    git_dirty_value: bool | None = git_dirty
    if git_dirty_value is None:
        git_dirty_value = _prompt_override_bool(
            "Git dirty state", current=git_info.get("git_dirty")
        )
    overrides["git_dirty"] = git_dirty_value

    return inputs, params, overrides
@@ -0,0 +1,107 @@
1
+ import subprocess
2
+ from pathlib import Path
3
+
4
+ import typer
5
+
6
+ from data_annotations.provenance import checkout_manifest_source
7
+ from data_annotations.provenance import recovery as provenance_recovery
8
+
9
+ from .common import (
10
+ _checkout_hint,
11
+ _echo_entries,
12
+ _error,
13
+ _match_target_path,
14
+ _missing_checkout_fields,
15
+ _resolve_manifest_path,
16
+ _resolved_path,
17
+ )
18
+
19
# Typer application that groups the provenance inspection commands defined
# below; invoking it without arguments prints the help text.
provenance_app = typer.Typer(
    help="Inspect provenance recorded in annotation documents.",
    no_args_is_help=True,
)
23
+
24
+
25
@provenance_app.command("match")
def match_command(
    target: Path = typer.Argument(
        ..., help="Artifact, directory, or annotation document path."
    ),
    manifest: Path | None = typer.Option(
        None,
        "--manifest",
        help="Explicit annotation document path to use instead of auto-discovery.",
    ),
) -> None:
    # Compare the target against its annotation document, print a report, and
    # exit with status 1 unless the result is a match or partial match.
    # (Deliberately a comment, not a docstring, so no CLI help text is added.)
    manifest_path = _resolve_manifest_path(target, manifest)
    candidate_path = _match_target_path(target, manifest)
    loaded_manifest = provenance_recovery._load_manifest(manifest_path)
    match = provenance_recovery._analyze_artifact_match(candidate_path, loaded_manifest)

    status_label = match.status.replace("_", " ").upper()
    typer.echo(f"Target: {candidate_path}")
    typer.echo(f"Manifest: {manifest_path}")
    typer.echo(f"Result: {status_label}")

    _echo_entries("Verified entries", match.verified_entries)
    _echo_entries("Missing tracked entries", match.missing_tracked_entries)
    _echo_entries("Mismatched tracked entries", match.mismatched_tracked_entries)
    _echo_entries("Extra entries", match.extra_entries)
    _echo_entries("Unverifiable tracked entries", match.unverifiable_tracked_entries)

    if match.status not in {"match", "partial_match"}:
        raise typer.Exit(code=1)

    missing_checkout_fields = _missing_checkout_fields(loaded_manifest)
    if missing_checkout_fields:
        typer.echo(
            "Checkout unavailable: manifest is missing "
            + ", ".join(missing_checkout_fields)
        )
        return

    # Suggest the follow-up command, passing the manifest through only when
    # it was given explicitly.
    typer.echo("Next step:")
    target_arg = str(_resolved_path(target))
    manifest_arg = None if manifest is None else str(_resolved_path(manifest))
    typer.echo(" " + _checkout_hint(target_arg, manifest_arg))
70
+
71
+
72
@provenance_app.command("checkout")
def checkout_command(
    target: Path = typer.Argument(
        ..., help="Artifact, directory, or annotation document path."
    ),
    manifest: Path | None = typer.Option(
        None,
        "--manifest",
        help="Explicit annotation document path to use instead of auto-discovery.",
    ),
    dest: Path | None = typer.Option(
        None,
        "--dest",
        help="Optional checkout destination. Defaults to a stable user cache.",
    ),
) -> None:
    # Check out the source recorded in the annotation document and report
    # where it landed.
    # (Deliberately a comment, not a docstring, so no CLI help text is added.)
    manifest_path = _resolve_manifest_path(target, manifest)

    try:
        recovered = checkout_manifest_source(manifest_path, destination_dir=dest)
    except ValueError as exc:
        # NOTE(review): the code below relies on _error not returning
        # (presumably it exits the CLI) -- confirm in .common.
        _error(str(exc), code=1)
    except subprocess.CalledProcessError:
        _error("failed to clone or checkout the recorded repository state", code=1)

    typer.echo(f"Manifest: {manifest_path}")
    typer.echo(f"Checkout path: {recovered.checkout_path}")

    if recovered.script_path is None:
        typer.echo(
            "Recovered repository checkout, but the generating script could not be resolved."
        )
    else:
        typer.echo(f"Recovered script: {recovered.script_path}")
@@ -0,0 +1,37 @@
1
+ from data_annotations.provenance.models import ArtifactKind
2
+
3
+ from .models import (
4
+ AllowedValue,
5
+ ArtifactDescription,
6
+ DirectoryDescription,
7
+ DocumentedArtifact,
8
+ FieldDefinition,
9
+ FileDescription,
10
+ )
11
+ from .decorators import record_directory_description, record_file_description
12
+ from .writers import (
13
+ render_directory_readme,
14
+ render_file_readme,
15
+ write_directory_description,
16
+ write_directory_readme,
17
+ write_file_description,
18
+ write_file_readme,
19
+ )
20
+
21
# Public names exported by this package, grouped by the import they come from.
__all__ = [
    # from data_annotations.provenance.models
    "ArtifactKind",
    # from .models
    "AllowedValue",
    "ArtifactDescription",
    "DirectoryDescription",
    "DocumentedArtifact",
    "FieldDefinition",
    "FileDescription",
    # from .decorators
    "record_directory_description",
    "record_file_description",
    # from .writers
    "render_directory_readme",
    "render_file_readme",
    "write_directory_description",
    "write_directory_readme",
    "write_file_description",
    "write_file_readme",
]