data-annotations 2.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,145 @@
1
+ from datetime import datetime, timezone
2
+ from functools import wraps
3
+ from typing import Any, Callable
4
+
5
+ from data_annotations._decorators import (
6
+ argument_path,
7
+ bind_arguments,
8
+ coerce_documented_artifacts,
9
+ coerce_produced_files,
10
+ )
11
+ from data_annotations.provenance.models import ArtifactKind
12
+
13
+ from .models import (
14
+ ArtifactDescription,
15
+ DirectoryDescription,
16
+ DocumentedArtifact,
17
+ FieldDefinition,
18
+ FileDescription,
19
+ )
20
+ from .writers import write_directory_description, write_file_description
21
+
22
+
23
def _coerce_fields(fields: list[FieldDefinition] | None) -> list[FieldDefinition]:
    """Validate every entry of ``fields`` into a ``FieldDefinition``.

    ``None`` or an empty list yields a new empty list.
    """
    if not fields:
        return []
    return [FieldDefinition.model_validate(entry) for entry in fields]
25
+
26
+
27
def _artifact_description(artifact: DocumentedArtifact) -> ArtifactDescription:
    """Build the ``ArtifactDescription`` for one documented artifact.

    The artifact's ``kind`` is not carried over. The primary-key list and the
    missing-value mapping are copied, and the fields are re-validated through
    ``_coerce_fields``.
    """
    validated_fields = _coerce_fields(artifact.fields)
    key_columns = list(artifact.primary_key)
    missing_codes = dict(artifact.missing_value_codes)
    return ArtifactDescription(
        path=artifact.path,
        title=artifact.title,
        summary=artifact.summary,
        fields=validated_fields,
        primary_key=key_columns,
        missing_value_codes=missing_codes,
    )
36
+
37
+
38
+ def _timestamp() -> datetime:
39
+ return datetime.now(timezone.utc)
40
+
41
+
42
def record_file_description(
    *,
    artifact_path_arg: str = "artifact_path",
    artifact_kind: ArtifactKind = "other",
    title: str | None = None,
    summary: str | None = None,
    fields: list[FieldDefinition] | None = None,
    primary_key: list[str] | None = None,
    missing_value_codes: dict[str, str] | None = None,
    acquisition_context: dict[str, Any] | None = None,
    generation_context: dict[str, Any] | None = None,
    readme_suffix: str = ".README.md",
):
    """
    Build a decorator that writes a README sidecar for a single artifact.

    Wrapped function contract:
    - It takes a local output path argument; the argument's name is given by
      ``artifact_path_arg`` (``artifact_path`` by default).
    - After the wrapped function returns, the sidecar is written to the
      artifact path with ``readme_suffix`` appended.
    - The wrapped function's return value is never inspected and is handed
      back to the caller as-is.
    """

    def deco(fn: Callable[..., Any]):
        @wraps(fn)
        def wrapper(*args, **kwargs):
            # Arguments are bound before the wrapped function is invoked.
            bound = bind_arguments(fn, args, kwargs)
            result = fn(*args, **kwargs)

            target = str(argument_path(bound, argument_name=artifact_path_arg))
            # The description is assembled (and timestamped) only after the
            # wrapped function has finished.
            description_model = FileDescription(
                title=title,
                summary=summary,
                fields=_coerce_fields(fields),
                primary_key=primary_key or [],
                missing_value_codes=missing_value_codes or {},
                acquisition_context=acquisition_context or {},
                generation_context=generation_context or {},
                description_updated_at=_timestamp(),
            )
            sidecar_path = target + readme_suffix
            write_file_description(
                sidecar_path,
                artifact_path=target,
                artifact_kind=artifact_kind,
                description=description_model,
            )
            return result

        return wrapper

    return deco
92
+
93
+
94
def record_directory_description(
    *,
    output_dir_arg: str = "output_dir",
    title: str | None = None,
    summary: str | None = None,
    acquisition_context: dict[str, Any] | None = None,
    generation_context: dict[str, Any] | None = None,
    readme_filename: str = "README.md",
):
    """
    Build a decorator that writes one README for a directory of outputs.

    Wrapped function contract:
    - It takes a local output directory argument; the argument's name is given
      by ``output_dir_arg`` (``output_dir`` by default).
    - It returns a materialized iterable, usually a ``list`` or ``tuple``.
    - Supported return items are:
      - DocumentedArtifact
      - ProducedFile
      - (path, kind)
      - path-like objects (kind defaults to ``"other"``)
    - The original return value is passed through unchanged.
    """

    def deco(fn: Callable[..., Any]):
        @wraps(fn)
        def wrapper(*args, **kwargs):
            bound = bind_arguments(fn, args, kwargs)
            result = fn(*args, **kwargs)

            # Snapshot the returned items once; both coercion helpers read
            # from this same list.
            returned_items = list(result)
            output_dir = argument_path(bound, argument_name=output_dir_arg)
            documented = coerce_documented_artifacts(returned_items)
            produced_files = coerce_produced_files(returned_items)

            artifact_descriptions = []
            for documented_artifact in documented:
                artifact_descriptions.append(_artifact_description(documented_artifact))

            description_model = DirectoryDescription(
                title=title,
                summary=summary,
                artifacts=artifact_descriptions,
                acquisition_context=acquisition_context or {},
                generation_context=generation_context or {},
                description_updated_at=_timestamp(),
            )
            readme_path = output_dir / readme_filename
            write_directory_description(
                readme_path,
                output_dir=str(output_dir),
                produced_files=produced_files,
                description=description_model,
            )
            return result

        return wrapper

    return deco
@@ -0,0 +1,63 @@
1
+ from datetime import datetime
2
+ from typing import Any
3
+
4
+ from pydantic import BaseModel, Field
5
+
6
+ from data_annotations.provenance.models import ArtifactKind
7
+
8
+
9
# One permitted value of a field, with an optional display label and summary.
# (Kept as a comment rather than a docstring so the model's generated schema
# description is unchanged.)
class AllowedValue(BaseModel):
    value: Any  # required; no type restriction
    label: str | None = Field(default=None)
    summary: str | None = Field(default=None)
13
+
14
+
15
# Description of a single field (column) of an artifact. Only ``name`` and
# ``summary`` are required; every other attribute is optional metadata.
class FieldDefinition(BaseModel):
    name: str
    data_type: str | None = Field(default=None)
    summary: str
    required: bool | None = Field(default=None)
    nullable: bool | None = Field(default=None)
    unit: str | None = Field(default=None)
    example: Any | None = Field(default=None)
    notes: str | None = Field(default=None)
    # Each instance gets its own fresh list.
    allowed_values: list[AllowedValue] = Field(default_factory=list)
25
+
26
+
27
# Per-artifact entry inside a directory description: the artifact's path plus
# its optional title, summary, field definitions, primary key and
# missing-value codes.
class ArtifactDescription(BaseModel):
    path: str
    title: str | None = Field(default=None)
    summary: str | None = Field(default=None)
    # Container defaults use factories so instances never share state.
    fields: list[FieldDefinition] = Field(default_factory=list)
    primary_key: list[str] = Field(default_factory=list)
    missing_value_codes: dict[str, str] = Field(default_factory=dict)
34
+
35
+
36
# An artifact together with its documentation. It has the same attributes as
# ``ArtifactDescription`` plus a ``kind`` that defaults to "other".
class DocumentedArtifact(BaseModel):
    path: str
    kind: ArtifactKind = Field(default="other")
    title: str | None = Field(default=None)
    summary: str | None = Field(default=None)
    # Container defaults use factories so instances never share state.
    fields: list[FieldDefinition] = Field(default_factory=list)
    primary_key: list[str] = Field(default_factory=list)
    missing_value_codes: dict[str, str] = Field(default_factory=dict)
44
+
45
+
46
# Everything rendered into the README sidecar of a single artifact.
# ``description_updated_at`` is the only required attribute.
class FileDescription(BaseModel):
    title: str | None = Field(default=None)
    summary: str | None = Field(default=None)
    # Container defaults use factories so instances never share state.
    fields: list[FieldDefinition] = Field(default_factory=list)
    primary_key: list[str] = Field(default_factory=list)
    missing_value_codes: dict[str, str] = Field(default_factory=dict)
    acquisition_context: dict[str, Any] = Field(default_factory=dict)
    generation_context: dict[str, Any] = Field(default_factory=dict)
    description_updated_at: datetime
55
+
56
+
57
# Everything rendered into the README of an output directory: optional title
# and summary, one entry per described artifact, and two free-form context
# mappings. ``description_updated_at`` is the only required attribute.
class DirectoryDescription(BaseModel):
    title: str | None = Field(default=None)
    summary: str | None = Field(default=None)
    # Container defaults use factories so instances never share state.
    artifacts: list[ArtifactDescription] = Field(default_factory=list)
    acquisition_context: dict[str, Any] = Field(default_factory=dict)
    generation_context: dict[str, Any] = Field(default_factory=dict)
    description_updated_at: datetime
@@ -0,0 +1,321 @@
1
+ from pathlib import Path
2
+ from typing import Any
3
+
4
+ from data_annotations.provenance.models import ArtifactKind, ProducedFile
5
+
6
+ from .models import (
7
+ AllowedValue,
8
+ ArtifactDescription,
9
+ DirectoryDescription,
10
+ FieldDefinition,
11
+ FileDescription,
12
+ )
13
+
14
+
15
+ def _stringify(value: Any) -> str:
16
+ if value is None:
17
+ return ""
18
+ if isinstance(value, str):
19
+ return value
20
+ if isinstance(value, bool):
21
+ return "yes" if value else "no"
22
+ if isinstance(value, (list, tuple, set)):
23
+ return ", ".join(_stringify(item) for item in value)
24
+ if isinstance(value, dict):
25
+ return ", ".join(f"{key}={_stringify(item)}" for key, item in value.items())
26
+ return str(value)
27
+
28
+
29
def _escape_table_cell(value: Any) -> str:
    """Render ``value`` so it is safe inside a single Markdown table cell.

    The value is first flattened with ``_stringify``. Pipes are then
    backslash-escaped so they are not read as column separators, and every
    line break becomes ``<br>`` so the row stays on one physical line.
    """
    text = _stringify(value)
    # Normalize CRLF and bare CR first. Markdown treats a lone carriage
    # return as a line ending too, so leaving one in would split the row.
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    return text.replace("|", "\\|").replace("\n", "<br>")
31
+
32
+
33
def _summarize_allowed_value(value: AllowedValue) -> str:
    """Return the compact one-line form of an allowed value.

    The result is the rendered value, followed by `` (label)`` when the
    entry has a non-empty label. The entry's summary is not included.
    """
    rendered = _stringify(value.value)
    if value.label:
        return f"{rendered} ({value.label})"
    return rendered
36
+
37
+
38
def _allowed_values_have_descriptions(values: list[AllowedValue]) -> bool:
    """Tell whether at least one allowed value carries a non-empty summary."""
    for entry in values:
        if entry.summary:
            return True
    return False
40
+
41
+
42
def _render_allowed_values(values: list[AllowedValue]) -> list[str]:
    """Render each allowed value as one Markdown bullet line.

    A bullet holds the back-ticked value, then `` (label)`` and
    ``: summary`` when those attributes are non-empty.
    """

    def bullet(entry: AllowedValue) -> str:
        pieces = [f"- `{_stringify(entry.value)}`"]
        if entry.label:
            pieces.append(f" ({entry.label})")
        if entry.summary:
            pieces.append(f": {entry.summary}")
        return "".join(pieces)

    return [bullet(entry) for entry in values]
52
+
53
+
54
def _render_context(context: dict[str, Any]) -> list[str]:
    """Render a context mapping as ``- key: value`` bullets, one per entry.

    Entries appear in the mapping's iteration order; values are flattened
    with ``_stringify``.
    """
    bullets: list[str] = []
    for key, value in context.items():
        bullets.append(f"- {key}: {_stringify(value)}")
    return bullets
56
+
57
+
58
def _use_field_table(fields: list[FieldDefinition]) -> bool:
    """Decide whether ``fields`` can be shown as a compact table.

    The table has no column for notes or for per-value summaries, so it is
    used only when no field has notes and no allowed value has a summary.
    """
    for field in fields:
        if field.notes:
            return False
        if _allowed_values_have_descriptions(field.allowed_values):
            return False
    return True
63
+
64
+
65
def _render_field_table(fields: list[FieldDefinition]) -> list[str]:
    """Render ``fields`` as a Markdown table, one row per field.

    Returns the header line, the separator line, and then the rows. Every
    cell is passed through ``_escape_table_cell``.
    """
    rows = [
        "| Name | Type | Summary | Required | Nullable | Unit | Example | Allowed Values |",
        "| ---- | ---- | ------- | -------- | -------- | ---- | ------- | -------------- |",
    ]
    for field in fields:
        # Allowed values collapse into one comma-separated cell.
        allowed = ", ".join(
            _summarize_allowed_value(entry) for entry in field.allowed_values
        )
        raw_cells = (
            field.name,
            field.data_type,
            field.summary,
            field.required,
            field.nullable,
            field.unit,
            field.example,
            allowed,
        )
        escaped = [_escape_table_cell(cell) for cell in raw_cells]
        rows.append("| " + " | ".join(escaped) + " |")
    return rows
93
+
94
+
95
def _render_field_sections(
    fields: list[FieldDefinition],
    *,
    heading_level: int,
) -> list[str]:
    """Render ``fields`` as one Markdown sub-section per field.

    Each section is a heading with the field name followed by a bullet list.
    The summary bullet is always present; type, required, nullable, unit,
    example, notes and allowed values are added only when set. Sections are
    separated by a single blank line.

    Args:
        fields: Field definitions to render, in output order.
        heading_level: Number of ``#`` characters used for each field heading.

    Returns:
        The Markdown lines, without a trailing blank line.
    """
    lines: list[str] = []
    heading_prefix = "#" * heading_level

    for index, field in enumerate(fields):
        if index > 0:
            lines.append("")
        lines.append(f"{heading_prefix} {field.name}")
        lines.append("")
        lines.append(f"- Summary: {field.summary}")
        if field.data_type:
            lines.append(f"- Type: `{field.data_type}`")
        # ``False`` is meaningful for these two, so test against None.
        if field.required is not None:
            lines.append(f"- Required: {_stringify(field.required)}")
        if field.nullable is not None:
            lines.append(f"- Nullable: {_stringify(field.nullable)}")
        if field.unit:
            lines.append(f"- Unit: `{field.unit}`")
        if field.example is not None:
            lines.append(f"- Example: `{_stringify(field.example)}`")
        if field.notes:
            lines.append(f"- Notes: {field.notes}")
        if field.allowed_values:
            lines.append("- Allowed values:")
            # Indent the value bullets so Markdown nests them under
            # "Allowed values:"; unindented they would render as sibling
            # bullets of the field's other attributes.
            lines.extend(
                f"  {bullet}"
                for bullet in _render_allowed_values(field.allowed_values)
            )

    return lines
126
+
127
+
128
def _render_fields(fields: list[FieldDefinition], *, heading_level: int) -> list[str]:
    """Render ``fields`` as a table when possible, otherwise as sections.

    Returns an empty list when there are no fields. ``heading_level`` is
    only used by the per-field section layout.
    """
    if not fields:
        return []
    if not _use_field_table(fields):
        return _render_field_sections(fields, heading_level=heading_level)
    return _render_field_table(fields)
134
+
135
+
136
def render_file_readme(
    *,
    artifact_path: str,
    artifact_kind: ArtifactKind,
    description: FileDescription,
) -> str:
    """Render the Markdown README text for a single artifact.

    The heading is the description's title, falling back to the final
    component of ``artifact_path``. The summary falls back to a fixed
    placeholder sentence. The "Artifact" section is always present; the
    fields, keys, missing-value and context sections appear only when the
    description has data for them. The result ends with exactly one newline.
    """
    heading = description.title or Path(artifact_path).name
    intro = description.summary or "No summary provided."
    out: list[str] = [
        f"# {heading}",
        "",
        intro,
        "",
        "## Artifact",
        "",
        f"- Path: `{artifact_path}`",
        f"- Kind: `{artifact_kind}`",
    ]

    if description.fields:
        out += ["", "## Fields", ""]
        out += _render_fields(description.fields, heading_level=3)

    if description.primary_key:
        out += [
            "",
            "## Keys",
            "",
            f"- Primary key: `{', '.join(description.primary_key)}`",
        ]

    if description.missing_value_codes:
        pairs = [
            f"`{code}` = {meaning}"
            for code, meaning in description.missing_value_codes.items()
        ]
        codes = "; ".join(pairs)
        out += ["", "## Missing Value Codes", "", f"- {codes}"]

    # The two context sections share a layout and differ only in heading.
    context_sections = (
        ("## Acquisition Context", description.acquisition_context),
        ("## Generation Context", description.generation_context),
    )
    for section_heading, context in context_sections:
        if context:
            out += ["", section_heading, ""]
            out += _render_context(context)

    return "\n".join(out).rstrip() + "\n"
185
+
186
+
187
def _artifact_kind_map(
    produced_files: list[ProducedFile],
) -> dict[str, ArtifactKind]:
    """Index produced files by path, mapping each path to its kind.

    If a path occurs more than once, the last occurrence wins.
    """
    kinds: dict[str, ArtifactKind] = {}
    for produced in produced_files:
        kinds[produced.path] = produced.kind
    return kinds
191
+
192
+
193
def _render_artifact_summary(
    artifact: ArtifactDescription,
    *,
    artifact_kind: ArtifactKind | None,
) -> list[str]:
    """Render the bullet list that introduces one artifact in a directory README.

    The path bullet is always present. The kind bullet is skipped when
    ``artifact_kind`` is ``None``; summary, primary key and missing-value
    codes are skipped when empty.
    """
    bullets = [f"- Path: `{artifact.path}`"]
    if artifact_kind is not None:
        bullets.append(f"- Kind: `{artifact_kind}`")
    if artifact.summary:
        bullets.append(f"- Summary: {artifact.summary}")
    if artifact.primary_key:
        bullets.append(f"- Primary key: `{', '.join(artifact.primary_key)}`")
    if artifact.missing_value_codes:
        pairs = [
            f"`{code}` = {meaning}"
            for code, meaning in artifact.missing_value_codes.items()
        ]
        codes = "; ".join(pairs)
        bullets.append(f"- Missing value codes: {codes}")
    return bullets
212
+
213
+
214
def render_directory_readme(
    *,
    output_dir: str,
    produced_files: list[ProducedFile],
    description: DirectoryDescription,
) -> str:
    """Render the Markdown README text for an output directory.

    The heading is the description's title, falling back to the final
    component of ``output_dir``. The summary falls back to a fixed
    placeholder sentence. Each described artifact gets its own sub-section;
    its kind is looked up by path in ``produced_files`` and left out when no
    produced file has that exact path. Context sections appear only when
    non-empty. The result ends with exactly one newline.
    """
    heading = description.title or Path(output_dir).name
    intro = description.summary or "No summary provided."
    out: list[str] = [f"# {heading}", "", intro]
    kind_by_path = _artifact_kind_map(produced_files)

    if description.artifacts:
        out += ["", "## Artifacts"]
        for artifact in description.artifacts:
            artifact_heading = artifact.title or Path(artifact.path).name
            out += ["", f"### {artifact_heading}", ""]
            out += _render_artifact_summary(
                artifact,
                artifact_kind=kind_by_path.get(artifact.path),
            )
            if artifact.fields:
                out += ["", "#### Fields", ""]
                out += _render_fields(artifact.fields, heading_level=5)

    # The two context sections share a layout and differ only in heading.
    context_sections = (
        ("## Acquisition Context", description.acquisition_context),
        ("## Generation Context", description.generation_context),
    )
    for section_heading, context in context_sections:
        if context:
            out += ["", section_heading, ""]
            out += _render_context(context)

    return "\n".join(out).rstrip() + "\n"
253
+
254
+
255
def write_file_readme(
    output_path: str | Path,
    *,
    artifact_path: str,
    artifact_kind: ArtifactKind,
    description: FileDescription,
) -> Path:
    """Render a single-artifact README and write it to ``output_path`` as UTF-8.

    The parent directory is not created here, so it must already exist.

    Returns:
        ``output_path`` as a ``Path``.
    """
    destination = Path(output_path)
    markdown = render_file_readme(
        artifact_path=artifact_path,
        artifact_kind=artifact_kind,
        description=description,
    )
    destination.write_text(markdown, encoding="utf-8")
    return destination
272
+
273
+
274
def write_directory_readme(
    output_path: str | Path,
    *,
    output_dir: str,
    produced_files: list[ProducedFile],
    description: DirectoryDescription,
) -> Path:
    """Render a directory README and write it to ``output_path`` as UTF-8.

    Missing parent directories of ``output_path`` are created first.

    Returns:
        ``output_path`` as a ``Path``.
    """
    destination = Path(output_path)
    # Create the directory before rendering, matching the established order.
    destination.parent.mkdir(parents=True, exist_ok=True)
    markdown = render_directory_readme(
        output_dir=output_dir,
        produced_files=produced_files,
        description=description,
    )
    destination.write_text(markdown, encoding="utf-8")
    return destination
292
+
293
+
294
def write_file_description(
    output_path: str | Path,
    *,
    artifact_path: str,
    artifact_kind: ArtifactKind,
    description: FileDescription,
) -> Path:
    """Write the description of one artifact to ``output_path``.

    This delegates entirely to ``write_file_readme`` and returns its result.
    """
    written_path = write_file_readme(
        output_path,
        artifact_path=artifact_path,
        artifact_kind=artifact_kind,
        description=description,
    )
    return written_path
307
+
308
+
309
def write_directory_description(
    output_path: str | Path,
    *,
    output_dir: str,
    produced_files: list[ProducedFile],
    description: DirectoryDescription,
) -> Path:
    """Write the description of an output directory to ``output_path``.

    This delegates entirely to ``write_directory_readme`` and returns its
    result.
    """
    written_path = write_directory_readme(
        output_path,
        output_dir=output_dir,
        produced_files=produced_files,
        description=description,
    )
    return written_path
@@ -0,0 +1,37 @@
1
+ from .models import (
2
+ ArtifactKind,
3
+ BaseProvenance,
4
+ DirectoryManifest,
5
+ FileManifest,
6
+ ProducedFile,
7
+ RecoveredSource,
8
+ )
9
+ from .decorators import record_directory_manifest, record_file_manifest
10
+ from .git import capture_git_info
11
+ from .recovery import artifact_matches_manifest, checkout_manifest_source
12
+ from .runtime import capture_runtime_info
13
+ from .writers import (
14
+ callable_name,
15
+ sha256_file,
16
+ write_directory_manifest,
17
+ write_file_manifest,
18
+ )
19
+
20
# Public names of this package, kept in ASCII order so additions are easy to
# place and diffs stay small.
__all__ = [
    # Imported from .models
    "ArtifactKind",
    "BaseProvenance",
    "DirectoryManifest",
    "FileManifest",
    "ProducedFile",
    "RecoveredSource",
    # Imported from .decorators, .git, .recovery, .runtime and .writers
    "artifact_matches_manifest",
    "callable_name",
    "capture_git_info",
    "capture_runtime_info",
    "checkout_manifest_source",
    "record_directory_manifest",
    "record_file_manifest",
    "sha256_file",
    "write_directory_manifest",
    "write_file_manifest",
]
@@ -0,0 +1,111 @@
1
+ from functools import wraps
2
+ from typing import Any, Callable
3
+
4
+ from data_annotations._decorators import (
5
+ DEFAULT_INPUT_ARGS,
6
+ argument_path,
7
+ bind_arguments,
8
+ coerce_produced_files,
9
+ extract_inputs,
10
+ extract_params,
11
+ )
12
+
13
+ from . import writers
14
+ from .models import ArtifactKind
15
+
16
+
17
def record_file_manifest(
    *,
    artifact_path_arg: str = "artifact_path",
    input_args: tuple[str, ...] = DEFAULT_INPUT_ARGS,
    artifact_kind: ArtifactKind = "other",
    suffix: str = ".meta.json",
):
    """
    Build a decorator that records a manifest for a single written artifact.

    Wrapped function contract:
    - It takes a local output path argument; the argument's name is given by
      ``artifact_path_arg`` (``artifact_path`` by default).
    - Bound arguments whose names appear in ``input_args`` are recorded as
      provenance inputs.
    - The remaining bound arguments become provenance params.
    - The wrapped function's return value is never inspected and is handed
      back to the caller as-is.
    """

    def deco(fn: Callable[..., Any]):
        @wraps(fn)
        def wrapper(*args, **kwargs):
            bound = bind_arguments(fn, args, kwargs)
            result = fn(*args, **kwargs)

            # The manifest is gathered and written only after the wrapped
            # function has returned.
            artifact_path = argument_path(bound, argument_name=artifact_path_arg)
            recorded_params = extract_params(
                bound,
                target_args=(artifact_path_arg,),
                input_args=input_args,
            )
            recorded_inputs = extract_inputs(bound, input_args=input_args)

            writers.write_file_manifest(
                artifact_path,
                artifact_kind=artifact_kind,
                params=recorded_params,
                inputs=recorded_inputs,
                function=fn,
                suffix=suffix,
            )
            return result

        return wrapper

    return deco
61
+
62
+
63
def record_directory_manifest(
    *,
    output_dir_arg: str = "output_dir",
    input_args: tuple[str, ...] = DEFAULT_INPUT_ARGS,
    manifest_name: str = "manifest.json",
):
    """
    Build a decorator that records one manifest for a directory of outputs.

    Wrapped function contract:
    - It takes a local output directory argument; the argument's name is given
      by ``output_dir_arg`` (``output_dir`` by default).
    - It returns a materialized iterable, usually a ``list`` or ``tuple``.
    - Supported return items are:
      - DocumentedArtifact
      - ProducedFile
      - (path, kind)
      - path-like objects (kind defaults to "other")
    - Bound arguments whose names appear in ``input_args`` are recorded as
      provenance inputs.
    - The remaining bound arguments become provenance params.
    - The original return value is passed through unchanged.
    """

    def deco(fn: Callable[..., Any]):
        @wraps(fn)
        def wrapper(*args, **kwargs):
            bound = bind_arguments(fn, args, kwargs)
            result = fn(*args, **kwargs)

            # The manifest is gathered and written only after the wrapped
            # function has returned.
            output_dir = argument_path(bound, argument_name=output_dir_arg)
            produced_files = coerce_produced_files(result)
            recorded_params = extract_params(
                bound,
                target_args=(output_dir_arg,),
                input_args=input_args,
            )
            recorded_inputs = extract_inputs(bound, input_args=input_args)

            writers.write_directory_manifest(
                output_dir,
                produced_files=produced_files,
                params=recorded_params,
                inputs=recorded_inputs,
                function=fn,
                filename=manifest_name,
            )
            return result

        return wrapper

    return deco