lamindb 0.76.7__py3-none-any.whl → 0.76.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. lamindb/__init__.py +113 -113
  2. lamindb/_artifact.py +1205 -1178
  3. lamindb/_can_validate.py +579 -579
  4. lamindb/_collection.py +387 -387
  5. lamindb/_curate.py +1601 -1601
  6. lamindb/_feature.py +155 -155
  7. lamindb/_feature_set.py +242 -242
  8. lamindb/_filter.py +23 -23
  9. lamindb/_finish.py +256 -256
  10. lamindb/_from_values.py +382 -382
  11. lamindb/_is_versioned.py +40 -40
  12. lamindb/_parents.py +476 -476
  13. lamindb/_query_manager.py +125 -125
  14. lamindb/_query_set.py +362 -362
  15. lamindb/_record.py +649 -649
  16. lamindb/_run.py +57 -57
  17. lamindb/_save.py +308 -295
  18. lamindb/_storage.py +14 -14
  19. lamindb/_transform.py +127 -127
  20. lamindb/_ulabel.py +56 -56
  21. lamindb/_utils.py +9 -9
  22. lamindb/_view.py +72 -72
  23. lamindb/core/__init__.py +94 -94
  24. lamindb/core/_context.py +574 -574
  25. lamindb/core/_data.py +438 -438
  26. lamindb/core/_feature_manager.py +867 -867
  27. lamindb/core/_label_manager.py +253 -253
  28. lamindb/core/_mapped_collection.py +597 -597
  29. lamindb/core/_settings.py +187 -187
  30. lamindb/core/_sync_git.py +138 -138
  31. lamindb/core/_track_environment.py +27 -27
  32. lamindb/core/datasets/__init__.py +59 -59
  33. lamindb/core/datasets/_core.py +571 -571
  34. lamindb/core/datasets/_fake.py +36 -36
  35. lamindb/core/exceptions.py +90 -77
  36. lamindb/core/fields.py +12 -12
  37. lamindb/core/loaders.py +164 -164
  38. lamindb/core/schema.py +56 -56
  39. lamindb/core/storage/__init__.py +25 -25
  40. lamindb/core/storage/_anndata_accessor.py +740 -740
  41. lamindb/core/storage/_anndata_sizes.py +41 -41
  42. lamindb/core/storage/_backed_access.py +98 -98
  43. lamindb/core/storage/_tiledbsoma.py +204 -204
  44. lamindb/core/storage/_valid_suffixes.py +21 -21
  45. lamindb/core/storage/_zarr.py +110 -110
  46. lamindb/core/storage/objects.py +62 -62
  47. lamindb/core/storage/paths.py +172 -141
  48. lamindb/core/subsettings/__init__.py +12 -12
  49. lamindb/core/subsettings/_creation_settings.py +38 -38
  50. lamindb/core/subsettings/_transform_settings.py +21 -21
  51. lamindb/core/types.py +19 -19
  52. lamindb/core/versioning.py +158 -158
  53. lamindb/integrations/__init__.py +12 -12
  54. lamindb/integrations/_vitessce.py +107 -107
  55. lamindb/setup/__init__.py +14 -14
  56. lamindb/setup/core/__init__.py +4 -4
  57. {lamindb-0.76.7.dist-info → lamindb-0.76.8.dist-info}/LICENSE +201 -201
  58. {lamindb-0.76.7.dist-info → lamindb-0.76.8.dist-info}/METADATA +3 -3
  59. lamindb-0.76.8.dist-info/RECORD +60 -0
  60. {lamindb-0.76.7.dist-info → lamindb-0.76.8.dist-info}/WHEEL +1 -1
  61. lamindb-0.76.7.dist-info/RECORD +0 -60
lamindb/core/_context.py CHANGED
@@ -1,574 +1,574 @@
1
- from __future__ import annotations
2
-
3
- import builtins
4
- import hashlib
5
- from datetime import datetime, timezone
6
- from pathlib import Path, PurePath
7
- from typing import TYPE_CHECKING
8
-
9
- import lamindb_setup as ln_setup
10
- from lamin_utils import logger
11
- from lamindb_setup.core.hashing import hash_file
12
- from lnschema_core import Run, Transform, ids
13
- from lnschema_core.ids import base62_12
14
- from lnschema_core.models import format_field_value
15
- from lnschema_core.users import current_user_id
16
-
17
- from ._settings import settings
18
- from ._sync_git import get_transform_reference_from_git_repo
19
- from ._track_environment import track_environment
20
- from .exceptions import (
21
- MissingContextUID,
22
- NotebookNotSaved,
23
- NotebookNotSavedError,
24
- NoTitleError,
25
- TrackNotCalled,
26
- UpdateContext,
27
- )
28
- from .subsettings._transform_settings import transform_settings
29
- from .versioning import bump_version as bump_version_function
30
- from .versioning import increment_base62, message_update_key_in_version_family
31
-
32
- if TYPE_CHECKING:
33
- from lamindb_setup.core.types import UPathStr
34
- from lnschema_core.types import TransformType
35
-
36
- is_run_from_ipython = getattr(builtins, "__IPYTHON__", False)
37
-
38
- msg_path_failed = (
39
- "failed to infer notebook path.\nfix: pass `path` to ln.context.track()"
40
- )
41
-
42
-
43
- def get_uid_ext(version: str) -> str:
44
- from lamin_utils._base62 import encodebytes
45
-
46
- # merely zero-padding the nbproject version such that the base62 encoding is
47
- # at least 4 characters long doesn't yields sufficiently diverse hashes and
48
- # leads to collisions; it'd be nice because the uid_ext would be ordered
49
- return encodebytes(hashlib.md5(version.encode()).digest())[:4] # noqa: S324
50
-
51
-
52
- def get_notebook_path():
53
- from nbproject.dev._jupyter_communicate import (
54
- notebook_path as get_notebook_path,
55
- )
56
-
57
- path = None
58
- try:
59
- path = get_notebook_path()
60
- except Exception:
61
- raise RuntimeError(msg_path_failed) from None
62
- if path is None:
63
- raise RuntimeError(msg_path_failed) from None
64
- return path
65
-
66
-
67
- # from https://stackoverflow.com/questions/61901628
68
- def get_notebook_name_colab() -> str:
69
- from socket import gethostbyname, gethostname # type: ignore
70
-
71
- from requests import get # type: ignore
72
-
73
- ip = gethostbyname(gethostname()) # 172.28.0.12
74
- try:
75
- name = get(f"http://{ip}:9000/api/sessions").json()[0]["name"] # noqa: S113
76
- except Exception:
77
- logger.warning(
78
- "could not get notebook name from Google Colab, using: notebook.ipynb"
79
- )
80
- name = "notebook.ipynb"
81
- return name.rstrip(".ipynb")
82
-
83
-
84
- def raise_missing_context(transform_type: str, key: str) -> bool:
85
- transform = Transform.filter(key=key).latest_version().first()
86
- if transform is None:
87
- new_uid = f"{base62_12()}0000"
88
- message = f"To track this {transform_type}, copy & paste the below into the current cell and re-run it\n\n"
89
- message += f'ln.context.uid = "{new_uid}"\nln.context.track()'
90
- else:
91
- uid = transform.uid
92
- suid, vuid = uid[: Transform._len_stem_uid], uid[Transform._len_stem_uid :]
93
- new_vuid = increment_base62(vuid)
94
- new_uid = f"{suid}{new_vuid}"
95
- message = f"You already have a version family with key '{key}' (stem_uid='{transform.stem_uid}').\n\n- to make a revision, set `ln.context.uid = '{new_uid}'`\n- to start a new version family, rename your file and rerun: `ln.context.track()`"
96
- if transform_type == "notebook":
97
- print(f"→ {message}\n")
98
- response = input("→ Ready to re-run? (y/n)")
99
- if response == "y":
100
- logger.important(
101
- "Note: Restart your notebook if you want consecutive cell execution"
102
- )
103
- return True
104
- raise MissingContextUID("Please follow the instructions.")
105
- else:
106
- raise MissingContextUID(message)
107
- return False
108
-
109
-
110
- def pretty_pypackages(dependencies: dict) -> str:
111
- deps_list = []
112
- for pkg, ver in dependencies.items():
113
- if ver != "":
114
- deps_list.append(pkg + f"=={ver}")
115
- else:
116
- deps_list.append(pkg)
117
- deps_list.sort()
118
- return " ".join(deps_list)
119
-
120
-
121
- class Context:
122
- """Run context.
123
-
124
- Enables convenient data lineage tracking by managing a transform & run
125
- upon :meth:`~lamindb.core.Context.track` & :meth:`~lamindb.core.Context.finish`.
126
-
127
- Examples:
128
-
129
- Is typically used via :class:`~lamindb.context`:
130
-
131
- >>> import lamindb as ln
132
- >>> ln.context.track()
133
- >>> # do things while tracking data lineage
134
- >>> ln.context.finish()
135
-
136
- """
137
-
138
- def __init__(self):
139
- self._uid: str | None = None
140
- self._name: str | None = None
141
- self._version: str | None = None
142
- self._transform: Transform | None = None
143
- self._run: Run | None = None
144
- self._path: Path | None = None
145
- """A local path to the script that's running."""
146
- self._logging_message: str = ""
147
-
148
- @property
149
- def transform(self) -> Transform | None:
150
- """Transform of context."""
151
- return self._transform
152
-
153
- @property
154
- def uid(self) -> str | None:
155
- """`uid` to create transform."""
156
- return self._uid
157
-
158
- @uid.setter
159
- def uid(self, value: str | None):
160
- self._uid = value
161
-
162
- @property
163
- def name(self) -> str | None:
164
- """`name` to create transform."""
165
- return self._name
166
-
167
- @name.setter
168
- def name(self, value: str | None):
169
- self._name = value
170
-
171
- @property
172
- def version(self) -> str | None:
173
- """`version` to create transform."""
174
- return self._version
175
-
176
- @version.setter
177
- def version(self, value: str | None):
178
- self._version = value
179
-
180
- @property
181
- def run(self) -> Run | None:
182
- """Run of context."""
183
- return self._run
184
-
185
- def track(
186
- self,
187
- *,
188
- params: dict | None = None,
189
- new_run: bool | None = None,
190
- path: str | None = None,
191
- transform: Transform | None = None,
192
- ) -> None:
193
- """Starts data lineage tracking for a run.
194
-
195
- - sets :attr:`~lamindb.core.Context.transform` &
196
- :attr:`~lamindb.core.Context.run` by creating or loading `Transform` &
197
- `Run` records
198
- - saves compute environment as a `requirements.txt` file: `run.environment`
199
-
200
- If :attr:`~lamindb.core.Settings.sync_git_repo` is set, checks whether a
201
- script-like transform exists in a git repository and links it.
202
-
203
- Args:
204
- params: A dictionary of parameters to track for the run.
205
- new_run: If `False`, loads latest run of transform
206
- (default notebook), if `True`, creates new run (default pipeline).
207
- path: Filepath of notebook or script. Only needed if it can't be
208
- automatically detected.
209
- transform: Useful to track an abstract pipeline.
210
-
211
- Examples:
212
-
213
- To track the run of a notebook or script, call:
214
-
215
- >>> import lamindb as ln
216
- >>> ln.context.track()
217
-
218
- """
219
- self._path = None
220
- if transform is None:
221
- is_tracked = False
222
- transform_settings_are_set = (
223
- transform_settings.stem_uid is not None
224
- and transform_settings.version is not None
225
- )
226
- transform = None
227
- stem_uid = None
228
- if self.uid is not None:
229
- transform = Transform.filter(uid=self.uid).one_or_none()
230
- if self.version is not None:
231
- # test inconsistent version passed
232
- if (
233
- transform is not None
234
- and transform.version is not None
235
- and self.version != transform.version
236
- ):
237
- raise SystemExit(
238
- f"Please pass consistent version: ln.context.version = '{transform.version}'"
239
- )
240
- # test whether version was already used for another member of the family
241
- suid, vuid = (
242
- self.uid[: Transform._len_stem_uid],
243
- self.uid[Transform._len_stem_uid :],
244
- )
245
- transform = Transform.filter(
246
- uid__startswith=suid, version=self.version
247
- ).one_or_none()
248
- if (
249
- transform is not None
250
- and vuid != transform.uid[Transform._len_stem_uid :]
251
- ):
252
- better_version = bump_version_function(self.version)
253
- raise SystemExit(
254
- f"Version '{self.version}' is already taken by Transform(uid='{transform.uid}'); please set another version, e.g., ln.context.version = '{better_version}'"
255
- )
256
- elif transform_settings_are_set:
257
- stem_uid, self.version = (
258
- transform_settings.stem_uid,
259
- transform_settings.version,
260
- )
261
- transform = Transform.filter(
262
- uid__startswith=stem_uid, version=self.version
263
- ).one_or_none()
264
- if is_run_from_ipython:
265
- key, name = self._track_notebook(path=path)
266
- transform_type = "notebook"
267
- transform_ref = None
268
- transform_ref_type = None
269
- else:
270
- (name, key, transform_ref, transform_ref_type) = self._track_script(
271
- path=path
272
- )
273
- transform_type = "script"
274
- if self.uid is not None or transform_settings_are_set:
275
- # overwrite whatever is auto-detected in the notebook or script
276
- if self.name is not None:
277
- name = self.name
278
- self._create_or_load_transform(
279
- uid=self.uid,
280
- stem_uid=stem_uid,
281
- version=self.version,
282
- name=name,
283
- transform_ref=transform_ref,
284
- transform_ref_type=transform_ref_type,
285
- transform_type=transform_type,
286
- key=key,
287
- transform=transform,
288
- )
289
- # if no error is raised, the transform is tracked
290
- is_tracked = True
291
- if not is_tracked:
292
- early_return = raise_missing_context(transform_type, key)
293
- if early_return:
294
- return None
295
- else:
296
- if transform.type in {"notebook", "script"}:
297
- raise ValueError(
298
- "Use ln.context.track() without passing transform in a notebook or script"
299
- " - metadata is automatically parsed"
300
- )
301
- transform_exists = None
302
- if transform.id is not None:
303
- # transform has an id but unclear whether already saved
304
- transform_exists = Transform.filter(id=transform.id).first()
305
- if transform_exists is None:
306
- transform.save()
307
- self._logging_message += f"created Transform(uid='{transform.uid}')"
308
- transform_exists = transform
309
- else:
310
- self._logging_message += f"loaded Transform(uid='{transform.uid}')"
311
- self._transform = transform_exists
312
-
313
- if new_run is None: # for notebooks, default to loading latest runs
314
- new_run = False if self._transform.type == "notebook" else True # type: ignore
315
-
316
- run = None
317
- if not new_run: # try loading latest run by same user
318
- run = (
319
- Run.filter(transform=self._transform, created_by_id=current_user_id())
320
- .order_by("-created_at")
321
- .first()
322
- )
323
- if run is not None: # loaded latest run
324
- run.started_at = datetime.now(timezone.utc) # update run time
325
- self._logging_message += (
326
- f" & loaded Run(started_at={format_field_value(run.started_at)})"
327
- )
328
-
329
- if run is None: # create new run
330
- run = Run(
331
- transform=self._transform,
332
- params=params,
333
- )
334
- run.started_at = datetime.now(timezone.utc)
335
- self._logging_message += (
336
- f" & created Run(started_at={format_field_value(run.started_at)})"
337
- )
338
- # can only determine at ln.finish() if run was consecutive in
339
- # interactive session, otherwise, is consecutive
340
- run.is_consecutive = True if is_run_from_ipython else None
341
- # need to save in all cases
342
- run.save()
343
- if params is not None:
344
- run.params.add_values(params)
345
- self._run = run
346
- track_environment(run)
347
- logger.important(self._logging_message)
348
- self._logging_message = ""
349
-
350
- def _track_script(
351
- self,
352
- *,
353
- path: UPathStr | None,
354
- ) -> tuple[str, str, str, str]:
355
- if path is None:
356
- import inspect
357
-
358
- frame = inspect.stack()[2]
359
- module = inspect.getmodule(frame[0])
360
- self._path = Path(module.__file__)
361
- else:
362
- self._path = Path(path)
363
- name = self._path.name
364
- key = name
365
- reference = None
366
- reference_type = None
367
- if settings.sync_git_repo is not None:
368
- reference = get_transform_reference_from_git_repo(self._path)
369
- reference_type = "url"
370
- return name, key, reference, reference_type
371
-
372
- def _track_notebook(
373
- self,
374
- *,
375
- path: str | None,
376
- ):
377
- if path is None:
378
- path = get_notebook_path()
379
- key = Path(path).name
380
- if isinstance(path, (Path, PurePath)):
381
- path_str = path.as_posix() # type: ignore
382
- else:
383
- path_str = str(path)
384
- if path_str.endswith("Untitled.ipynb"):
385
- raise RuntimeError("Please rename your notebook before tracking it")
386
- if path_str.startswith("/fileId="):
387
- name = get_notebook_name_colab()
388
- key = f"{name}.ipynb"
389
- else:
390
- import nbproject
391
-
392
- try:
393
- nbproject_title = nbproject.meta.live.title
394
- except IndexError:
395
- raise NotebookNotSavedError(
396
- "The notebook is not saved, please save the notebook and"
397
- " rerun `ln.context.track()`"
398
- ) from None
399
- if nbproject_title is None:
400
- raise NoTitleError(
401
- "Please add a title to your notebook in a markdown cell: # Title"
402
- ) from None
403
- name = nbproject_title
404
- # log imported python packages
405
- if not path_str.startswith("/fileId="):
406
- try:
407
- from nbproject.dev._pypackage import infer_pypackages
408
-
409
- nb = nbproject.dev.read_notebook(path_str)
410
- logger.important(
411
- "notebook imports:"
412
- f" {pretty_pypackages(infer_pypackages(nb, pin_versions=True))}"
413
- )
414
- except Exception:
415
- logger.debug("inferring imported packages failed")
416
- pass
417
- self._path = Path(path_str)
418
- return key, name
419
-
420
- def _create_or_load_transform(
421
- self,
422
- *,
423
- uid: str | None,
424
- stem_uid: str | None,
425
- version: str | None,
426
- name: str,
427
- transform_ref: str | None = None,
428
- transform_ref_type: str | None = None,
429
- key: str | None = None,
430
- transform_type: TransformType = None,
431
- transform: Transform | None = None,
432
- ):
433
- # make a new transform record
434
- if transform is None:
435
- if uid is None:
436
- uid = f"{stem_uid}{get_uid_ext(version)}"
437
- # note that here we're not passing revises because we're not querying it
438
- # hence, we need to do a revision family lookup based on key
439
- # hence, we need key to be not None
440
- assert key is not None # noqa: S101
441
- transform = Transform(
442
- uid=uid,
443
- version=version,
444
- name=name,
445
- key=key,
446
- reference=transform_ref,
447
- reference_type=transform_ref_type,
448
- type=transform_type,
449
- ).save()
450
- self._logging_message += f"created Transform(uid='{transform.uid}')"
451
- else:
452
- uid = transform.uid
453
- # check whether the transform file has been renamed
454
- if transform.key != key:
455
- suid = transform.stem_uid
456
- new_suid = ids.base62_12()
457
- transform_type = "Notebook" if is_run_from_ipython else "Script"
458
- note = message_update_key_in_version_family(
459
- suid=suid,
460
- existing_key=transform.key,
461
- new_key=key,
462
- registry="Transform",
463
- )
464
- raise UpdateContext(
465
- f"{transform_type} filename changed.\n\nEither init a new transform family by setting:\n\n"
466
- f'ln.context.uid = "{new_suid}0000"\n\n{note}'
467
- )
468
- elif transform.name != name:
469
- transform.name = name
470
- transform.save()
471
- self._logging_message += (
472
- "updated transform name, " # white space on purpose
473
- )
474
- # check whether transform source code was already saved
475
- if (
476
- transform._source_code_artifact_id is not None
477
- or transform.source_code is not None
478
- ):
479
- bump_revision = False
480
- if is_run_from_ipython:
481
- bump_revision = True
482
- else:
483
- hash, _ = hash_file(self._path) # ignore hash_type for now
484
- if transform.hash is not None:
485
- condition = hash != transform.hash
486
- else:
487
- condition = hash != transform._source_code_artifact.hash
488
- if condition:
489
- bump_revision = True
490
- else:
491
- self._logging_message += (
492
- f"loaded Transform(uid='{transform.uid}')"
493
- )
494
- if bump_revision:
495
- change_type = (
496
- "Re-running saved notebook"
497
- if is_run_from_ipython
498
- else "Source code changed"
499
- )
500
- suid, vuid = (
501
- uid[:-4],
502
- uid[-4:],
503
- )
504
- new_vuid = increment_base62(vuid)
505
- raise UpdateContext(
506
- f"{change_type}, bump revision by setting:\n\n"
507
- f'ln.context.uid = "{suid}{new_vuid}"'
508
- )
509
- else:
510
- self._logging_message += f"loaded Transform(uid='{transform.uid}')"
511
- self._transform = transform
512
-
513
- def finish(self, ignore_non_consecutive: None | bool = None) -> None:
514
- """Mark the run context as finished.
515
-
516
- - writes a timestamp: `run.finished_at`
517
- - saves the source code: `transform.source_code`
518
-
519
- When called in the last cell of a notebook:
520
-
521
- - prompts for user input if not consecutively executed
522
- - requires to save the notebook in your editor right before
523
- - saves a run report: `run.report`
524
-
525
- Args:
526
- ignore_non_consecutive: Whether to ignore if a notebook was non-consecutively executed.
527
-
528
- Examples:
529
-
530
- >>> import lamindb as ln
531
- >>> ln.context.track()
532
- >>> # do things while tracking data lineage
533
- >>> ln.context.finish()
534
-
535
- See Also:
536
- `lamin save script.py` or `lamin save notebook.ipynb` → `docs </cli#lamin-save>`__
537
-
538
- """
539
- from lamindb._finish import save_context_core
540
-
541
- def get_seconds_since_modified(filepath) -> float:
542
- return datetime.now().timestamp() - filepath.stat().st_mtime
543
-
544
- def get_shortcut() -> str:
545
- import platform
546
-
547
- return "CMD + s" if platform.system() == "Darwin" else "CTRL + s"
548
-
549
- if context.run is None:
550
- raise TrackNotCalled("Please run `ln.context.track()` before `ln.finish()`")
551
- if context._path is None:
552
- if context.run.transform.type in {"script", "notebook"}:
553
- raise ValueError(
554
- f"Transform type is not allowed to be 'script' or 'notebook' but is {context.run.transform.type}."
555
- )
556
- context.run.finished_at = datetime.now(timezone.utc)
557
- context.run.save()
558
- # nothing else to do
559
- return None
560
- if is_run_from_ipython: # notebooks
561
- if get_seconds_since_modified(context._path) > 2 and not ln_setup._TESTING:
562
- raise NotebookNotSaved(
563
- f"Please save the notebook in your editor (shortcut `{get_shortcut()}`) right before calling `ln.context.finish()`"
564
- )
565
- save_context_core(
566
- run=context.run,
567
- transform=context.run.transform,
568
- filepath=context._path,
569
- finished_at=True,
570
- ignore_non_consecutive=ignore_non_consecutive,
571
- )
572
-
573
-
574
- context = Context()
1
+ from __future__ import annotations
2
+
3
+ import builtins
4
+ import hashlib
5
+ from datetime import datetime, timezone
6
+ from pathlib import Path, PurePath
7
+ from typing import TYPE_CHECKING
8
+
9
+ import lamindb_setup as ln_setup
10
+ from lamin_utils import logger
11
+ from lamindb_setup.core.hashing import hash_file
12
+ from lnschema_core import Run, Transform, ids
13
+ from lnschema_core.ids import base62_12
14
+ from lnschema_core.models import format_field_value
15
+ from lnschema_core.users import current_user_id
16
+
17
+ from ._settings import settings
18
+ from ._sync_git import get_transform_reference_from_git_repo
19
+ from ._track_environment import track_environment
20
+ from .exceptions import (
21
+ MissingContextUID,
22
+ NotebookNotSaved,
23
+ NotebookNotSavedError,
24
+ NoTitleError,
25
+ TrackNotCalled,
26
+ UpdateContext,
27
+ )
28
+ from .subsettings._transform_settings import transform_settings
29
+ from .versioning import bump_version as bump_version_function
30
+ from .versioning import increment_base62, message_update_key_in_version_family
31
+
32
+ if TYPE_CHECKING:
33
+ from lamindb_setup.core.types import UPathStr
34
+ from lnschema_core.types import TransformType
35
+
36
+ is_run_from_ipython = getattr(builtins, "__IPYTHON__", False)
37
+
38
+ msg_path_failed = (
39
+ "failed to infer notebook path.\nfix: pass `path` to ln.context.track()"
40
+ )
41
+
42
+
43
+ def get_uid_ext(version: str) -> str:
44
+ from lamin_utils._base62 import encodebytes
45
+
46
+ # merely zero-padding the nbproject version such that the base62 encoding is
47
+ # at least 4 characters long doesn't yields sufficiently diverse hashes and
48
+ # leads to collisions; it'd be nice because the uid_ext would be ordered
49
+ return encodebytes(hashlib.md5(version.encode()).digest())[:4] # noqa: S324
50
+
51
+
52
+ def get_notebook_path():
53
+ from nbproject.dev._jupyter_communicate import (
54
+ notebook_path as get_notebook_path,
55
+ )
56
+
57
+ path = None
58
+ try:
59
+ path = get_notebook_path()
60
+ except Exception:
61
+ raise RuntimeError(msg_path_failed) from None
62
+ if path is None:
63
+ raise RuntimeError(msg_path_failed) from None
64
+ return path
65
+
66
+
67
+ # from https://stackoverflow.com/questions/61901628
68
+ def get_notebook_name_colab() -> str:
69
+ from socket import gethostbyname, gethostname # type: ignore
70
+
71
+ from requests import get # type: ignore
72
+
73
+ ip = gethostbyname(gethostname()) # 172.28.0.12
74
+ try:
75
+ name = get(f"http://{ip}:9000/api/sessions").json()[0]["name"] # noqa: S113
76
+ except Exception:
77
+ logger.warning(
78
+ "could not get notebook name from Google Colab, using: notebook.ipynb"
79
+ )
80
+ name = "notebook.ipynb"
81
+ return name.rstrip(".ipynb")
82
+
83
+
84
+ def raise_missing_context(transform_type: str, key: str) -> bool:
85
+ transform = Transform.filter(key=key).latest_version().first()
86
+ if transform is None:
87
+ new_uid = f"{base62_12()}0000"
88
+ message = f"To track this {transform_type}, copy & paste the below into the current cell and re-run it\n\n"
89
+ message += f'ln.context.uid = "{new_uid}"\nln.context.track()'
90
+ else:
91
+ uid = transform.uid
92
+ suid, vuid = uid[: Transform._len_stem_uid], uid[Transform._len_stem_uid :]
93
+ new_vuid = increment_base62(vuid)
94
+ new_uid = f"{suid}{new_vuid}"
95
+ message = f"You already have a version family with key '{key}' (stem_uid='{transform.stem_uid}').\n\n- to make a revision, set `ln.context.uid = '{new_uid}'`\n- to start a new version family, rename your file and rerun: `ln.context.track()`"
96
+ if transform_type == "notebook":
97
+ print(f"→ {message}\n")
98
+ response = input("→ Ready to re-run? (y/n)")
99
+ if response == "y":
100
+ logger.important(
101
+ "Note: Restart your notebook if you want consecutive cell execution"
102
+ )
103
+ return True
104
+ raise MissingContextUID("Please follow the instructions.")
105
+ else:
106
+ raise MissingContextUID(message)
107
+ return False
108
+
109
+
110
+ def pretty_pypackages(dependencies: dict) -> str:
111
+ deps_list = []
112
+ for pkg, ver in dependencies.items():
113
+ if ver != "":
114
+ deps_list.append(pkg + f"=={ver}")
115
+ else:
116
+ deps_list.append(pkg)
117
+ deps_list.sort()
118
+ return " ".join(deps_list)
119
+
120
+
121
+ class Context:
122
+ """Run context.
123
+
124
+ Enables convenient data lineage tracking by managing a transform & run
125
+ upon :meth:`~lamindb.core.Context.track` & :meth:`~lamindb.core.Context.finish`.
126
+
127
+ Examples:
128
+
129
+ Is typically used via :class:`~lamindb.context`:
130
+
131
+ >>> import lamindb as ln
132
+ >>> ln.context.track()
133
+ >>> # do things while tracking data lineage
134
+ >>> ln.context.finish()
135
+
136
+ """
137
+
138
+ def __init__(self):
139
+ self._uid: str | None = None
140
+ self._name: str | None = None
141
+ self._version: str | None = None
142
+ self._transform: Transform | None = None
143
+ self._run: Run | None = None
144
+ self._path: Path | None = None
145
+ """A local path to the script that's running."""
146
+ self._logging_message: str = ""
147
+
148
+ @property
149
+ def transform(self) -> Transform | None:
150
+ """Transform of context."""
151
+ return self._transform
152
+
153
+ @property
154
+ def uid(self) -> str | None:
155
+ """`uid` to create transform."""
156
+ return self._uid
157
+
158
+ @uid.setter
159
+ def uid(self, value: str | None):
160
+ self._uid = value
161
+
162
+ @property
163
+ def name(self) -> str | None:
164
+ """`name` to create transform."""
165
+ return self._name
166
+
167
+ @name.setter
168
+ def name(self, value: str | None):
169
+ self._name = value
170
+
171
+ @property
172
+ def version(self) -> str | None:
173
+ """`version` to create transform."""
174
+ return self._version
175
+
176
+ @version.setter
177
+ def version(self, value: str | None):
178
+ self._version = value
179
+
180
+ @property
181
+ def run(self) -> Run | None:
182
+ """Run of context."""
183
+ return self._run
184
+
185
+ def track(
186
+ self,
187
+ *,
188
+ params: dict | None = None,
189
+ new_run: bool | None = None,
190
+ path: str | None = None,
191
+ transform: Transform | None = None,
192
+ ) -> None:
193
+ """Starts data lineage tracking for a run.
194
+
195
+ - sets :attr:`~lamindb.core.Context.transform` &
196
+ :attr:`~lamindb.core.Context.run` by creating or loading `Transform` &
197
+ `Run` records
198
+ - saves compute environment as a `requirements.txt` file: `run.environment`
199
+
200
+ If :attr:`~lamindb.core.Settings.sync_git_repo` is set, checks whether a
201
+ script-like transform exists in a git repository and links it.
202
+
203
+ Args:
204
+ params: A dictionary of parameters to track for the run.
205
+ new_run: If `False`, loads latest run of transform
206
+ (default notebook), if `True`, creates new run (default pipeline).
207
+ path: Filepath of notebook or script. Only needed if it can't be
208
+ automatically detected.
209
+ transform: Useful to track an abstract pipeline.
210
+
211
+ Examples:
212
+
213
+ To track the run of a notebook or script, call:
214
+
215
+ >>> import lamindb as ln
216
+ >>> ln.context.track()
217
+
218
+ """
219
+ self._path = None
220
+ if transform is None:
221
+ is_tracked = False
222
+ transform_settings_are_set = (
223
+ transform_settings.stem_uid is not None
224
+ and transform_settings.version is not None
225
+ )
226
+ transform = None
227
+ stem_uid = None
228
+ if self.uid is not None:
229
+ transform = Transform.filter(uid=self.uid).one_or_none()
230
+ if self.version is not None:
231
+ # test inconsistent version passed
232
+ if (
233
+ transform is not None
234
+ and transform.version is not None
235
+ and self.version != transform.version
236
+ ):
237
+ raise SystemExit(
238
+ f"Please pass consistent version: ln.context.version = '{transform.version}'"
239
+ )
240
+ # test whether version was already used for another member of the family
241
+ suid, vuid = (
242
+ self.uid[: Transform._len_stem_uid],
243
+ self.uid[Transform._len_stem_uid :],
244
+ )
245
+ transform = Transform.filter(
246
+ uid__startswith=suid, version=self.version
247
+ ).one_or_none()
248
+ if (
249
+ transform is not None
250
+ and vuid != transform.uid[Transform._len_stem_uid :]
251
+ ):
252
+ better_version = bump_version_function(self.version)
253
+ raise SystemExit(
254
+ f"Version '{self.version}' is already taken by Transform(uid='{transform.uid}'); please set another version, e.g., ln.context.version = '{better_version}'"
255
+ )
256
+ elif transform_settings_are_set:
257
+ stem_uid, self.version = (
258
+ transform_settings.stem_uid,
259
+ transform_settings.version,
260
+ )
261
+ transform = Transform.filter(
262
+ uid__startswith=stem_uid, version=self.version
263
+ ).one_or_none()
264
+ if is_run_from_ipython:
265
+ key, name = self._track_notebook(path=path)
266
+ transform_type = "notebook"
267
+ transform_ref = None
268
+ transform_ref_type = None
269
+ else:
270
+ (name, key, transform_ref, transform_ref_type) = self._track_script(
271
+ path=path
272
+ )
273
+ transform_type = "script"
274
+ if self.uid is not None or transform_settings_are_set:
275
+ # overwrite whatever is auto-detected in the notebook or script
276
+ if self.name is not None:
277
+ name = self.name
278
+ self._create_or_load_transform(
279
+ uid=self.uid,
280
+ stem_uid=stem_uid,
281
+ version=self.version,
282
+ name=name,
283
+ transform_ref=transform_ref,
284
+ transform_ref_type=transform_ref_type,
285
+ transform_type=transform_type,
286
+ key=key,
287
+ transform=transform,
288
+ )
289
+ # if no error is raised, the transform is tracked
290
+ is_tracked = True
291
+ if not is_tracked:
292
+ early_return = raise_missing_context(transform_type, key)
293
+ if early_return:
294
+ return None
295
+ else:
296
+ if transform.type in {"notebook", "script"}:
297
+ raise ValueError(
298
+ "Use ln.context.track() without passing transform in a notebook or script"
299
+ " - metadata is automatically parsed"
300
+ )
301
+ transform_exists = None
302
+ if transform.id is not None:
303
+ # transform has an id but unclear whether already saved
304
+ transform_exists = Transform.filter(id=transform.id).first()
305
+ if transform_exists is None:
306
+ transform.save()
307
+ self._logging_message += f"created Transform(uid='{transform.uid}')"
308
+ transform_exists = transform
309
+ else:
310
+ self._logging_message += f"loaded Transform(uid='{transform.uid}')"
311
+ self._transform = transform_exists
312
+
313
+ if new_run is None: # for notebooks, default to loading latest runs
314
+ new_run = False if self._transform.type == "notebook" else True # type: ignore
315
+
316
+ run = None
317
+ if not new_run: # try loading latest run by same user
318
+ run = (
319
+ Run.filter(transform=self._transform, created_by_id=current_user_id())
320
+ .order_by("-created_at")
321
+ .first()
322
+ )
323
+ if run is not None: # loaded latest run
324
+ run.started_at = datetime.now(timezone.utc) # update run time
325
+ self._logging_message += (
326
+ f" & loaded Run(started_at={format_field_value(run.started_at)})"
327
+ )
328
+
329
+ if run is None: # create new run
330
+ run = Run(
331
+ transform=self._transform,
332
+ params=params,
333
+ )
334
+ run.started_at = datetime.now(timezone.utc)
335
+ self._logging_message += (
336
+ f" & created Run(started_at={format_field_value(run.started_at)})"
337
+ )
338
+ # can only determine at ln.finish() if run was consecutive in
339
+ # interactive session, otherwise, is consecutive
340
+ run.is_consecutive = True if is_run_from_ipython else None
341
+ # need to save in all cases
342
+ run.save()
343
+ if params is not None:
344
+ run.params.add_values(params)
345
+ self._run = run
346
+ track_environment(run)
347
+ logger.important(self._logging_message)
348
+ self._logging_message = ""
349
+
350
def _track_script(
    self,
    *,
    path: UPathStr | None,
) -> tuple[str, str, str | None, str | None]:
    """Derive transform metadata for a script and remember its path.

    The resolved path is stored on `self._path`.

    Args:
        path: Location of the script. When `None`, the path is inferred from
            the call stack.

    Returns:
        A tuple `(name, key, reference, reference_type)`. `name` and `key`
        are both the file name of the script. `reference` and
        `reference_type` are `None` unless `settings.sync_git_repo` is set,
        in which case `reference` is the result of
        `get_transform_reference_from_git_repo` and `reference_type` is
        `"url"`.
    """
    if path is None:
        import inspect

        # stack()[0] is this method, stack()[1] its caller; stack()[2] is
        # the code that invoked that caller
        frame = inspect.stack()[2]
        module = inspect.getmodule(frame[0])
        # inspect.getmodule() returns None when the module cannot be
        # determined; fall back to the file name recorded on the frame
        # instead of failing on `None.__file__`
        module_file = getattr(module, "__file__", None)
        if module_file is None:
            module_file = frame.filename
        self._path = Path(module_file)
    else:
        self._path = Path(path)
    name = self._path.name
    key = name
    reference = None
    reference_type = None
    if settings.sync_git_repo is not None:
        reference = get_transform_reference_from_git_repo(self._path)
        reference_type = "url"
    return name, key, reference, reference_type
371
+
372
def _track_notebook(
    self,
    *,
    path: str | None,
):
    """Resolve key and name of the running notebook and remember its path.

    The notebook path (as a string) is stored on `self._path`.

    Args:
        path: Path of the notebook. When `None`, it is obtained from
            `get_notebook_path()`.

    Returns:
        A tuple `(key, name)`. For paths starting with `/fileId=`, the name
        comes from `get_notebook_name_colab()` and the key is that name with
        an `.ipynb` suffix. Otherwise the key is the file name and the name
        is the notebook title reported by `nbproject`.

    Raises:
        RuntimeError: If the path ends with `Untitled.ipynb`.
        NotebookNotSavedError: If reading the title raises `IndexError`.
        NoTitleError: If the notebook has no title.
    """
    if path is None:
        path = get_notebook_path()
    key = Path(path).name
    path_str = path.as_posix() if isinstance(path, (Path, PurePath)) else str(path)  # type: ignore
    if path_str.endswith("Untitled.ipynb"):
        raise RuntimeError("Please rename your notebook before tracking it")
    if path_str.startswith("/fileId="):
        name = get_notebook_name_colab()
        key = f"{name}.ipynb"
    else:
        import nbproject

        try:
            name = nbproject.meta.live.title
        except IndexError:
            raise NotebookNotSavedError(
                "The notebook is not saved, please save the notebook and"
                " rerun `ln.context.track()`"
            ) from None
        if name is None:
            raise NoTitleError(
                "Please add a title to your notebook in a markdown cell: # Title"
            ) from None
        # log imported python packages; failures here are only reported at
        # debug level and never abort tracking
        try:
            from nbproject.dev._pypackage import infer_pypackages

            notebook = nbproject.dev.read_notebook(path_str)
            logger.important(
                "notebook imports:"
                f" {pretty_pypackages(infer_pypackages(notebook, pin_versions=True))}"
            )
        except Exception:
            logger.debug("inferring imported packages failed")
    self._path = Path(path_str)
    return key, name
419
+
420
def _create_or_load_transform(
    self,
    *,
    uid: str | None,
    stem_uid: str | None,
    version: str | None,
    name: str,
    transform_ref: str | None = None,
    transform_ref_type: str | None = None,
    key: str | None = None,
    transform_type: TransformType = None,
    transform: Transform | None = None,
):
    """Create a new transform record or validate and load an existing one.

    On success the record is stored on `self._transform` and a
    "created"/"loaded" note is appended to `self._logging_message`.

    Args:
        uid: Full uid for a new record; built from `stem_uid` and `version`
            when `None`.
        stem_uid: Stem used to build `uid` when `uid` is `None`.
        version: Version passed to a newly created record.
        name: Name of the transform; an existing record is renamed and
            saved if its name differs.
        transform_ref: Stored as `reference` on a new record.
        transform_ref_type: Stored as `reference_type` on a new record.
        key: Key of the transform; must not be `None` when a new record is
            created.
        transform_type: Stored as `type` on a new record.
        transform: Existing record, or `None` to create a new one.

    Raises:
        UpdateContext: If the key of the existing record differs from
            `key`, or if the existing record already has saved source code
            and is either re-run from IPython or its file hash changed.
    """
    if transform is None:
        # make a new transform record
        if uid is None:
            uid = f"{stem_uid}{get_uid_ext(version)}"
        # note that here we're not passing revises because we're not querying it
        # hence, we need to do a revision family lookup based on key
        # hence, we need key to be not None
        assert key is not None  # noqa: S101
        new_record = Transform(
            uid=uid,
            version=version,
            name=name,
            key=key,
            reference=transform_ref,
            reference_type=transform_ref_type,
            type=transform_type,
        )
        transform = new_record.save()
        self._logging_message += f"created Transform(uid='{transform.uid}')"
        self._transform = transform
        return

    uid = transform.uid
    # check whether the transform file has been renamed
    if transform.key != key:
        old_stem = transform.stem_uid
        fresh_stem = ids.base62_12()
        kind_label = "Notebook" if is_run_from_ipython else "Script"
        note = message_update_key_in_version_family(
            suid=old_stem,
            existing_key=transform.key,
            new_key=key,
            registry="Transform",
        )
        raise UpdateContext(
            f"{kind_label} filename changed.\n\nEither init a new transform family by setting:\n\n"
            f'ln.context.uid = "{fresh_stem}0000"\n\n{note}'
        )
    if transform.name != name:
        transform.name = name
        transform.save()
        self._logging_message += (
            "updated transform name, "  # white space on purpose
        )

    # check whether transform source code was already saved
    source_already_saved = (
        transform._source_code_artifact_id is not None
        or transform.source_code is not None
    )
    if not source_already_saved:
        self._logging_message += f"loaded Transform(uid='{transform.uid}')"
        self._transform = transform
        return

    if is_run_from_ipython:
        needs_bump = True
    else:
        file_hash, _ = hash_file(self._path)  # ignore hash_type for now
        if transform.hash is not None:
            needs_bump = file_hash != transform.hash
        else:
            needs_bump = file_hash != transform._source_code_artifact.hash
    if not needs_bump:
        self._logging_message += f"loaded Transform(uid='{transform.uid}')"
        self._transform = transform
        return

    reason = (
        "Re-running saved notebook" if is_run_from_ipython else "Source code changed"
    )
    # the last four characters of the uid are incremented to suggest the
    # next revision
    stem_part, revision_part = uid[:-4], uid[-4:]
    next_revision = increment_base62(revision_part)
    raise UpdateContext(
        f"{reason}, bump revision by setting:\n\n"
        f'ln.context.uid = "{stem_part}{next_revision}"'
    )
512
+
513
def finish(self, ignore_non_consecutive: None | bool = None) -> None:
    """Finish the tracked run.

    - sets the timestamp `run.finished_at`
    - stores the source code in `transform.source_code`

    If called in the last cell of a notebook, it additionally:

    - asks for user input if the notebook was not executed consecutively
    - requires that the notebook was saved in your editor right before
    - stores a run report in `run.report`

    Args:
        ignore_non_consecutive: Whether to ignore if a notebook was non-consecutively executed.

    Raises:
        TrackNotCalled: If no run is being tracked.
        ValueError: If no path is tracked although the transform is a
            script or notebook.
        NotebookNotSaved: If the notebook file was modified more than two
            seconds ago (skipped while testing).

    Examples:

        >>> import lamindb as ln
        >>> ln.context.track()
        >>> # do things while tracking data lineage
        >>> ln.context.finish()

    See Also:
        `lamin save script.py` or `lamin save notebook.ipynb` → `docs </cli#lamin-save>`__

    """
    from lamindb._finish import save_context_core

    if context.run is None:
        raise TrackNotCalled("Please run `ln.context.track()` before `ln.finish()`")

    if context._path is None:
        # no source file is tracked: only the finish timestamp is recorded
        if context.run.transform.type in {"script", "notebook"}:
            raise ValueError(
                f"Transform type is not allowed to be 'script' or 'notebook' but is {context.run.transform.type}."
            )
        context.run.finished_at = datetime.now(timezone.utc)
        context.run.save()
        # nothing else to do
        return None

    if is_run_from_ipython:  # notebooks
        seconds_since_modified = (
            datetime.now().timestamp() - context._path.stat().st_mtime
        )
        if seconds_since_modified > 2 and not ln_setup._TESTING:
            import platform

            shortcut = "CMD + s" if platform.system() == "Darwin" else "CTRL + s"
            raise NotebookNotSaved(
                f"Please save the notebook in your editor (shortcut `{shortcut}`) right before calling `ln.context.finish()`"
            )

    save_context_core(
        run=context.run,
        transform=context.run.transform,
        filepath=context._path,
        finished_at=True,
        ignore_non_consecutive=ignore_non_consecutive,
    )
572
+
573
+
574
+ context = Context()  # module-level Context instance; finish() reads its run and path state through this name