data-annotations 2.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,248 @@
1
+ import json
2
+ import os
3
+ import socket
4
+ import subprocess
5
+ import sys
6
+ from datetime import datetime, timezone
7
+ from pathlib import Path
8
+ from typing import Any, Callable
9
+ from urllib.parse import urlencode
10
+ from urllib.request import Request, urlopen
11
+
12
+
13
def utcnow() -> datetime:
    """Return the current moment as a timezone-aware ``datetime`` in UTC."""
    return datetime.now(tz=timezone.utc)
15
+
16
+
17
+ def _connection_file_from_argv(argv: list[str]) -> str | None:
18
+ for index, arg in enumerate(argv[1:], start=1):
19
+ if arg.startswith("--f="):
20
+ return arg.split("=", 1)[1]
21
+ if arg == "-f" and index + 1 < len(argv):
22
+ return argv[index + 1]
23
+ try:
24
+ from ipykernel.connect import get_connection_file
25
+
26
+ return get_connection_file()
27
+ except Exception:
28
+ return None
29
+
30
+
31
+ def _kernel_id_from_connection_file(connection_file: str | Path | None) -> str | None:
32
+ if connection_file is None:
33
+ return None
34
+ stem = Path(connection_file).stem
35
+ if not stem.startswith("kernel-"):
36
+ return None
37
+ return stem.removeprefix("kernel-")
38
+
39
+
40
def _iter_jupyter_server_info_files(runtime_dir: Path) -> list[Path]:
    """List the server info files found directly inside ``runtime_dir``.

    Files matching ``jpserver-*.json`` or ``nbserver-*.json`` are collected and
    returned as a single sorted list of paths.
    """
    matches: list[Path] = []
    for pattern in ("jpserver-*.json", "nbserver-*.json"):
        matches.extend(runtime_dir.glob(pattern))
    matches.sort()
    return matches
47
+
48
+
49
def _current_ipython():
    """Return the active IPython shell, or ``None`` when IPython is unavailable.

    IPython is imported lazily; any failure while importing it yields ``None``.
    Otherwise the result of ``IPython.get_ipython()`` is returned unchanged.
    """
    try:
        from IPython import get_ipython as lookup_shell
    except Exception:
        return None
    else:
        return lookup_shell()
55
+
56
+
57
+ def _normalize_session_path(
58
+ path: str | Path | None,
59
+ *,
60
+ cwd: Path | None = None,
61
+ ) -> Path | None:
62
+ if path is None:
63
+ return None
64
+ path = Path(path).expanduser()
65
+ if path.is_absolute():
66
+ return path.resolve()
67
+ base_dir = Path.cwd() if cwd is None else cwd
68
+ return (base_dir / path).resolve()
69
+
70
+
71
+ def _notebook_path_from_session_name(
72
+ *,
73
+ env: dict[str, str] | os._Environ[str] | None = None,
74
+ current_ipython_fn: Callable[[], Any] | None = None,
75
+ cwd: Path | None = None,
76
+ ) -> Path | None:
77
+ env = os.environ if env is None else env
78
+ current_ipython_fn = (
79
+ _current_ipython if current_ipython_fn is None else current_ipython_fn
80
+ )
81
+ session_name = env.get("JPY_SESSION_NAME")
82
+ if session_name:
83
+ return _normalize_session_path(session_name, cwd=cwd)
84
+
85
+ shell = current_ipython_fn()
86
+ if shell is None:
87
+ return None
88
+
89
+ session_name = shell.user_ns.get("__session__")
90
+ if not isinstance(session_name, str) or not session_name:
91
+ return None
92
+
93
+ return _normalize_session_path(session_name, cwd=cwd)
94
+
95
+
96
+ def _resolve_notebook_path(
97
+ argv: list[str],
98
+ *,
99
+ env: dict[str, str] | os._Environ[str] | None = None,
100
+ current_ipython_fn: Callable[[], Any] | None = None,
101
+ request_cls=None,
102
+ urlopen_fn=None,
103
+ cwd: Path | None = None,
104
+ ) -> Path | None:
105
+ env = os.environ if env is None else env
106
+ current_ipython_fn = (
107
+ _current_ipython if current_ipython_fn is None else current_ipython_fn
108
+ )
109
+ request_cls = Request if request_cls is None else request_cls
110
+ urlopen_fn = urlopen if urlopen_fn is None else urlopen_fn
111
+ if not argv or Path(argv[0]).name != "ipykernel_launcher.py":
112
+ return None
113
+
114
+ notebook_path = _notebook_path_from_session_name(
115
+ env=env,
116
+ current_ipython_fn=current_ipython_fn,
117
+ cwd=cwd,
118
+ )
119
+ if notebook_path is not None:
120
+ return notebook_path
121
+
122
+ connection_file = _connection_file_from_argv(argv)
123
+ kernel_id = _kernel_id_from_connection_file(connection_file)
124
+ if connection_file is None or kernel_id is None:
125
+ return None
126
+
127
+ runtime_dir = Path(connection_file).expanduser().resolve().parent
128
+ for server_info_path in _iter_jupyter_server_info_files(runtime_dir):
129
+ try:
130
+ server_info = json.loads(server_info_path.read_text(encoding="utf-8"))
131
+ except Exception:
132
+ continue
133
+
134
+ server_url = server_info.get("url")
135
+ if not server_url:
136
+ continue
137
+
138
+ base_url = server_info.get("base_url", "/").rstrip("/")
139
+ sessions_url = f"{server_url.rstrip('/')}{base_url}/api/sessions"
140
+ token = server_info.get("token")
141
+ if token:
142
+ sessions_url = f"{sessions_url}?{urlencode({'token': token})}"
143
+
144
+ try:
145
+ request = request_cls(sessions_url, headers={"Accept": "application/json"})
146
+ with urlopen_fn(request, timeout=0.2) as response:
147
+ sessions = json.loads(response.read().decode("utf-8"))
148
+ except Exception:
149
+ continue
150
+
151
+ for session in sessions:
152
+ session_kernel_id = (session.get("kernel") or {}).get("id")
153
+ if session_kernel_id != kernel_id:
154
+ continue
155
+
156
+ notebook_path = (
157
+ (session.get("notebook") or {}).get("path")
158
+ ) or session.get("path")
159
+ if not notebook_path:
160
+ return None
161
+
162
+ root_dir = server_info.get("root_dir") or server_info.get("notebook_dir")
163
+ if root_dir:
164
+ return (Path(root_dir) / notebook_path).resolve()
165
+ return Path(notebook_path).expanduser().resolve()
166
+
167
+ return None
168
+
169
+
170
+ def _git_repo_root(
171
+ path: str | Path,
172
+ *,
173
+ subprocess_module=subprocess,
174
+ ) -> Path | None:
175
+ candidate = Path(path).expanduser().resolve()
176
+ cwd = candidate if candidate.is_dir() else candidate.parent
177
+ try:
178
+ repo_root = subprocess_module.check_output(
179
+ ["git", "-C", str(cwd), "rev-parse", "--show-toplevel"],
180
+ text=True,
181
+ stderr=subprocess_module.DEVNULL,
182
+ ).strip()
183
+ except Exception:
184
+ return None
185
+
186
+ if not repo_root:
187
+ return None
188
+ return Path(repo_root).resolve()
189
+
190
+
191
+ def infer_script_repo_path(script: str | Path | None) -> str | None:
192
+ if script is None:
193
+ return None
194
+ script_path = Path(script).expanduser().resolve()
195
+ repo_root = _git_repo_root(script_path)
196
+ if repo_root is None:
197
+ return None
198
+ try:
199
+ return str(script_path.relative_to(repo_root))
200
+ except ValueError:
201
+ return None
202
+
203
+
204
def capture_runtime_info(
    *,
    argv: list[str] | None = None,
    utcnow_fn: Callable[[], datetime] | None = None,
    gethostname: Callable[[], str] | None = None,
    env: dict[str, str] | os._Environ[str] | None = None,
    current_ipython_fn: Callable[[], Any] | None = None,
    request_cls=None,
    urlopen_fn=None,
    cwd: Path | None = None,
) -> dict[str, Any]:
    """Collect a description of the running process.

    Every collaborator can be injected; each ``None`` argument falls back to
    the real implementation (``sys.argv``, ``utcnow``, ``socket.gethostname``,
    ``os.environ``, ``_current_ipython``, ``Request`` and ``urlopen``).

    When ``_resolve_notebook_path`` finds a notebook, that notebook is
    reported as both the script and the single-element command. Otherwise the
    script is the absolute form of ``argv[0]`` (``None`` for an empty argv)
    and the command is the argv copy.

    Returns:
        A dict with the keys ``created_at``, ``hostname``, ``username``
        (``USER`` from the environment, ``"unknown"`` if unset), ``script``,
        ``script_repo_path``, ``command`` and ``slurm_job_id``
        (``SLURM_JOB_ID`` from the environment, possibly ``None``).
    """
    clock = utcnow_fn if utcnow_fn is not None else utcnow
    hostname_lookup = gethostname if gethostname is not None else socket.gethostname
    environment = env if env is not None else os.environ
    shell_lookup = (
        current_ipython_fn if current_ipython_fn is not None else _current_ipython
    )
    request_factory = request_cls if request_cls is not None else Request
    opener = urlopen_fn if urlopen_fn is not None else urlopen
    arguments = list(sys.argv if argv is None else argv)

    resolved_notebook = _resolve_notebook_path(
        arguments,
        env=environment,
        current_ipython_fn=shell_lookup,
        request_cls=request_factory,
        urlopen_fn=opener,
        cwd=cwd,
    )
    if resolved_notebook is None:
        script = os.path.abspath(arguments[0]) if arguments else None
        command = arguments
    else:
        script = str(resolved_notebook)
        command = [str(resolved_notebook)]
    repo_relative_script = infer_script_repo_path(script)

    return {
        "created_at": clock(),
        "hostname": hostname_lookup(),
        "username": environment.get("USER", "unknown"),
        "script": script,
        "script_repo_path": repo_relative_script,
        "command": command,
        "slurm_job_id": environment.get("SLURM_JOB_ID"),
    }
@@ -0,0 +1,206 @@
1
+ import hashlib
2
+ from pathlib import Path
3
+ from typing import Any, Callable
4
+ from urllib.parse import urlparse
5
+
6
+ from . import git, runtime
7
+ from .models import (
8
+ ArtifactKind,
9
+ BaseProvenance,
10
+ DirectoryManifest,
11
+ FileManifest,
12
+ ProducedFile,
13
+ )
14
+
15
+
16
+ def sha256_file(path: str | Path, chunk_size: int = 1024 * 1024) -> str:
17
+ path = Path(path)
18
+ digest = hashlib.sha256()
19
+ with path.open("rb") as handle:
20
+ for chunk in iter(lambda: handle.read(chunk_size), b""):
21
+ digest.update(chunk)
22
+ return digest.hexdigest()
23
+
24
+
25
+ def callable_name(fn: Callable[..., Any] | None) -> str | None:
26
+ if fn is None:
27
+ return None
28
+ module = getattr(fn, "__module__", fn.__class__.__module__)
29
+ qualname = getattr(fn, "__qualname__", fn.__class__.__qualname__)
30
+ return f"{module}.{qualname}"
31
+
32
+
33
+ def _normalize_local_path(path: str | Path) -> str:
34
+ return str(Path(path).expanduser().resolve())
35
+
36
+
37
def _is_uri_like(value: str) -> bool:
    """Tell whether ``value`` looks like a URI rather than a local path.

    A value counts as URI-like when it has a scheme and either a network
    location or a literal ``://`` (so ``s3://bucket/key`` and ``file:///x``
    qualify, while ``mailto:a@b`` and Windows drive paths do not).

    This predicate never raises: when ``urlparse`` rejects the value (it
    raises ``ValueError`` for unbalanced ``[``/``]`` in the host part), the
    answer falls back to a lexical check for a well-formed ``scheme://``
    prefix.
    """
    try:
        parsed = urlparse(value)
    except ValueError:
        scheme, separator, _ = value.partition("://")
        # RFC 3986 scheme: an ASCII letter followed by letters, digits, "+", "-" or ".".
        return (
            bool(separator)
            and scheme.isascii()
            and scheme[:1].isalpha()
            and all(char.isalnum() or char in "+-." for char in scheme)
        )
    return bool(parsed.scheme and (parsed.netloc or "://" in value))
40
+
41
+
42
def _normalize_input(value: str | Path) -> str:
    """Normalise one declared input.

    Strings that ``_is_uri_like`` recognises are returned untouched; every
    other value is treated as a local path and passed through
    ``_normalize_local_path``.
    """
    keep_verbatim = isinstance(value, str) and _is_uri_like(value)
    return value if keep_verbatim else _normalize_local_path(value)
46
+
47
+
48
def _normalize_produced_file(produced_file: ProducedFile) -> ProducedFile:
    """Return a copy of ``produced_file`` whose ``path`` has been normalised.

    The copy is made with ``model_copy``; the path is rewritten by
    ``_normalize_local_path``.
    """
    absolute_path = _normalize_local_path(produced_file.path)
    return produced_file.model_copy(update={"path": absolute_path})
52
+
53
+
54
def _manifest_context(
    *,
    params: dict[str, Any] | None = None,
    inputs: list[str] | None = None,
    function: Callable[..., Any] | None = None,
    capture_mode: str = "runtime",
    overrides: dict[str, Any] | None = None,
    normalize_inputs: bool = True,
) -> dict[str, Any]:
    """Assemble the provenance fields shared by every manifest.

    The result starts from ``runtime.capture_runtime_info()``, is layered
    with ``git.capture_git_info()``, and then receives ``capture_mode``,
    ``params`` (``{}`` when falsy) and ``inputs``. Inputs go through
    ``_normalize_input`` unless ``normalize_inputs`` is false, in which case
    they are only copied into a new list. ``function`` is recorded by its
    ``callable_name`` when given, and ``overrides`` is applied last so it
    wins over every other key.
    """
    context: dict[str, Any] = {}
    # Later layers override earlier ones on key collisions.
    context.update(runtime.capture_runtime_info())
    context.update(git.capture_git_info())
    context["capture_mode"] = capture_mode
    context["params"] = params if params else {}

    declared_inputs = inputs if inputs else []
    if normalize_inputs:
        context["inputs"] = [_normalize_input(item) for item in declared_inputs]
    else:
        context["inputs"] = list(declared_inputs)

    if function is not None:
        context["function"] = callable_name(function)
    if overrides:
        context.update(overrides)
    return context
79
+
80
+
81
def _build_directory_manifest(
    output_dir: str | Path,
    *,
    produced_files: list[ProducedFile],
    params: dict[str, Any] | None = None,
    inputs: list[str] | None = None,
    function: Callable[..., Any] | None = None,
    capture_mode: str = "runtime",
    overrides: dict[str, Any] | None = None,
    normalize_inputs: bool = True,
) -> DirectoryManifest:
    """Build the in-memory manifest for an output directory.

    Combines the shared context from ``_manifest_context`` with the
    normalised ``output_dir`` and a normalised copy of every entry in
    ``produced_files``. Nothing is written to disk.
    """
    resolved_dir = _normalize_local_path(output_dir)
    context = _manifest_context(
        params=params,
        inputs=inputs,
        function=function,
        capture_mode=capture_mode,
        overrides=overrides,
        normalize_inputs=normalize_inputs,
    )
    normalized_files = [_normalize_produced_file(entry) for entry in produced_files]
    return DirectoryManifest(
        **context,
        output_dir=resolved_dir,
        produced_files=normalized_files,
    )
105
+
106
+
107
def _build_file_manifest(
    artifact_path: str | Path,
    *,
    artifact_kind: ArtifactKind = "other",
    params: dict[str, Any] | None = None,
    inputs: list[str] | None = None,
    function: Callable[..., Any] | None = None,
    capture_mode: str = "runtime",
    overrides: dict[str, Any] | None = None,
    normalize_inputs: bool = True,
) -> FileManifest:
    """Build the in-memory manifest for a single artifact file.

    Combines the shared context from ``_manifest_context`` with the
    normalised ``artifact_path``, the ``artifact_kind`` and the artifact's
    SHA-256 digest. The digest is ``None`` when the path is not an existing
    regular file (missing, or a directory). Nothing is written to disk.
    """
    normalized_artifact_path = Path(_normalize_local_path(artifact_path))
    context = _manifest_context(
        params=params,
        inputs=inputs,
        function=function,
        capture_mode=capture_mode,
        overrides=overrides,
        normalize_inputs=normalize_inputs,
    )
    # is_file() rather than exists(): a directory "exists" but cannot be
    # opened for hashing, which would raise instead of leaving the digest unset.
    artifact_sha256 = (
        sha256_file(normalized_artifact_path)
        if normalized_artifact_path.is_file()
        else None
    )
    return FileManifest(
        **context,
        artifact_path=str(normalized_artifact_path),
        artifact_kind=artifact_kind,
        artifact_sha256=artifact_sha256,
    )
136
+
137
+
138
def _write_manifest(manifest: FileManifest | DirectoryManifest, path: Path) -> Path:
    """Write ``manifest`` to ``path`` as indented UTF-8 JSON and return ``path``."""
    serialized = manifest.model_dump_json(indent=2)
    path.write_text(serialized, encoding="utf-8")
    return path
141
+
142
+
143
def write_directory_manifest(
    output_dir: str | Path,
    *,
    produced_files: list[ProducedFile],
    params: dict[str, Any] | None = None,
    inputs: list[str] | None = None,
    function: Callable[..., Any] | None = None,
    filename: str = "manifest.json",
) -> Path:
    """Write an annotation document describing ``output_dir`` and its files.

    The directory is created if needed. A directory manifest is built, then
    split into the document's three parts: the subject (directory path and
    produced files), the provenance (every manifest field except
    ``output_dir`` and ``produced_files``) and a description holding one
    ``ArtifactDescription`` per produced file, stamped with the manifest's
    ``created_at``. The document is saved as indented UTF-8 JSON.

    Returns:
        The path of the written file, ``<output_dir>/<filename>``.
    """
    # Imported at call time, as in the rest of this module's public writers.
    from data_annotations.annotations import (
        DirectoryAnnotationDocument,
        DirectoryArtifactSubject,
    )
    from data_annotations.description import ArtifactDescription, DirectoryDescription

    target_dir = Path(_normalize_local_path(output_dir))
    target_dir.mkdir(parents=True, exist_ok=True)

    manifest = _build_directory_manifest(
        target_dir,
        produced_files=produced_files,
        params=params,
        inputs=inputs,
        function=function,
    )

    subject = DirectoryArtifactSubject(
        path=manifest.output_dir,
        produced_files=list(manifest.produced_files),
    )
    provenance = BaseProvenance.model_validate(
        manifest.model_dump(exclude={"output_dir", "produced_files"})
    )
    description = DirectoryDescription(
        artifacts=[
            ArtifactDescription(path=entry.path) for entry in manifest.produced_files
        ],
        description_updated_at=manifest.created_at,
    )
    document = DirectoryAnnotationDocument(
        subject=subject,
        provenance=provenance,
        description=description,
    )

    destination = target_dir / filename
    destination.write_text(document.model_dump_json(indent=2), encoding="utf-8")
    return destination
186
+
187
+
188
def write_file_manifest(
    artifact_path: str | Path,
    *,
    artifact_kind: ArtifactKind = "other",
    params: dict[str, Any] | None = None,
    inputs: list[str] | None = None,
    function: Callable[..., Any] | None = None,
    suffix: str = ".meta.json",
) -> Path:
    """Write the annotation file for a single artifact.

    Thin wrapper that forwards every argument unchanged to
    ``data_annotations.annotations.write_file_annotation`` and returns its
    result.
    """
    from data_annotations.annotations import write_file_annotation

    annotation_options = {
        "artifact_kind": artifact_kind,
        "params": params,
        "inputs": inputs,
        "function": function,
        "suffix": suffix,
    }
    return write_file_annotation(artifact_path, **annotation_options)