lamindb 0.76.6__py3-none-any.whl → 0.76.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. lamindb/__init__.py +113 -113
  2. lamindb/_artifact.py +1205 -1174
  3. lamindb/_can_validate.py +579 -579
  4. lamindb/_collection.py +387 -382
  5. lamindb/_curate.py +1601 -1601
  6. lamindb/_feature.py +155 -155
  7. lamindb/_feature_set.py +242 -242
  8. lamindb/_filter.py +23 -23
  9. lamindb/_finish.py +256 -256
  10. lamindb/_from_values.py +382 -382
  11. lamindb/_is_versioned.py +40 -40
  12. lamindb/_parents.py +476 -476
  13. lamindb/_query_manager.py +125 -125
  14. lamindb/_query_set.py +362 -362
  15. lamindb/_record.py +649 -649
  16. lamindb/_run.py +57 -57
  17. lamindb/_save.py +308 -295
  18. lamindb/_storage.py +14 -14
  19. lamindb/_transform.py +127 -127
  20. lamindb/_ulabel.py +56 -56
  21. lamindb/_utils.py +9 -9
  22. lamindb/_view.py +72 -72
  23. lamindb/core/__init__.py +94 -93
  24. lamindb/core/_context.py +574 -558
  25. lamindb/core/_data.py +438 -438
  26. lamindb/core/_feature_manager.py +867 -866
  27. lamindb/core/_label_manager.py +253 -252
  28. lamindb/core/_mapped_collection.py +597 -597
  29. lamindb/core/_settings.py +187 -187
  30. lamindb/core/_sync_git.py +138 -138
  31. lamindb/core/_track_environment.py +27 -27
  32. lamindb/core/datasets/__init__.py +59 -59
  33. lamindb/core/datasets/_core.py +571 -571
  34. lamindb/core/datasets/_fake.py +36 -36
  35. lamindb/core/exceptions.py +90 -77
  36. lamindb/core/fields.py +12 -12
  37. lamindb/core/loaders.py +164 -0
  38. lamindb/core/schema.py +56 -56
  39. lamindb/core/storage/__init__.py +25 -25
  40. lamindb/core/storage/_anndata_accessor.py +740 -740
  41. lamindb/core/storage/_anndata_sizes.py +41 -41
  42. lamindb/core/storage/_backed_access.py +98 -98
  43. lamindb/core/storage/_tiledbsoma.py +204 -196
  44. lamindb/core/storage/_valid_suffixes.py +21 -21
  45. lamindb/core/storage/_zarr.py +110 -110
  46. lamindb/core/storage/objects.py +62 -62
  47. lamindb/core/storage/paths.py +172 -245
  48. lamindb/core/subsettings/__init__.py +12 -12
  49. lamindb/core/subsettings/_creation_settings.py +38 -38
  50. lamindb/core/subsettings/_transform_settings.py +21 -21
  51. lamindb/core/types.py +19 -19
  52. lamindb/core/versioning.py +158 -158
  53. lamindb/integrations/__init__.py +12 -12
  54. lamindb/integrations/_vitessce.py +107 -107
  55. lamindb/setup/__init__.py +14 -14
  56. lamindb/setup/core/__init__.py +4 -4
  57. {lamindb-0.76.6.dist-info → lamindb-0.76.8.dist-info}/LICENSE +201 -201
  58. {lamindb-0.76.6.dist-info → lamindb-0.76.8.dist-info}/METADATA +5 -5
  59. lamindb-0.76.8.dist-info/RECORD +60 -0
  60. {lamindb-0.76.6.dist-info → lamindb-0.76.8.dist-info}/WHEEL +1 -1
  61. lamindb-0.76.6.dist-info/RECORD +0 -59
lamindb/core/_context.py CHANGED
@@ -1,558 +1,574 @@
1
- from __future__ import annotations
2
-
3
- import builtins
4
- import hashlib
5
- from datetime import datetime, timezone
6
- from pathlib import Path, PurePath
7
- from typing import TYPE_CHECKING
8
-
9
- import lamindb_setup as ln_setup
10
- from lamin_utils import logger
11
- from lamindb_setup.core.hashing import hash_file
12
- from lnschema_core import Run, Transform, ids
13
- from lnschema_core.ids import base62_12
14
- from lnschema_core.models import format_field_value
15
- from lnschema_core.users import current_user_id
16
-
17
- from ._settings import settings
18
- from ._sync_git import get_transform_reference_from_git_repo
19
- from ._track_environment import track_environment
20
- from .exceptions import (
21
- MissingContext,
22
- NotebookFileNotSavedToDisk,
23
- NotebookNotSavedError,
24
- NoTitleError,
25
- TrackNotCalled,
26
- UpdateContext,
27
- )
28
- from .subsettings._transform_settings import transform_settings
29
- from .versioning import bump_version as bump_version_function
30
- from .versioning import increment_base62, message_update_key_in_version_family
31
-
32
- if TYPE_CHECKING:
33
- from lamindb_setup.core.types import UPathStr
34
- from lnschema_core.types import TransformType
35
-
36
- is_run_from_ipython = getattr(builtins, "__IPYTHON__", False)
37
-
38
- msg_path_failed = (
39
- "failed to infer notebook path.\nfix: pass `path` to ln.context.track()"
40
- )
41
-
42
-
43
- def get_uid_ext(version: str) -> str:
44
- from lamin_utils._base62 import encodebytes
45
-
46
- # merely zero-padding the nbproject version such that the base62 encoding is
47
- # at least 4 characters long doesn't yield sufficiently diverse hashes and
48
- # leads to collisions; it'd be nice because the uid_ext would be ordered
49
- return encodebytes(hashlib.md5(version.encode()).digest())[:4] # noqa: S324
50
-
51
-
52
- def get_notebook_path():
53
- from nbproject.dev._jupyter_communicate import (
54
- notebook_path as get_notebook_path,
55
- )
56
-
57
- path = None
58
- try:
59
- path = get_notebook_path()
60
- except Exception:
61
- raise RuntimeError(msg_path_failed) from None
62
- if path is None:
63
- raise RuntimeError(msg_path_failed) from None
64
- return path
65
-
66
-
67
- # from https://stackoverflow.com/questions/61901628
68
- def get_notebook_name_colab() -> str:
69
- from socket import gethostbyname, gethostname # type: ignore
70
-
71
- from requests import get # type: ignore
72
-
73
- ip = gethostbyname(gethostname()) # 172.28.0.12
74
- try:
75
- name = get(f"http://{ip}:9000/api/sessions").json()[0]["name"] # noqa: S113
76
- except Exception:
77
- logger.warning(
78
- "could not get notebook name from Google Colab, using: notebook.ipynb"
79
- )
80
- name = "notebook.ipynb"
81
- return name.rstrip(".ipynb")
82
-
83
-
84
- def raise_missing_context(transform_type: str, key: str) -> None:
85
- transform = Transform.filter(key=key).latest_version().first()
86
- if transform is None:
87
- new_uid = f"{base62_12()}0000"
88
- message = f"To track this {transform_type}, set\n\n"
89
- else:
90
- uid = transform.uid
91
- suid, vuid = uid[: Transform._len_stem_uid], uid[Transform._len_stem_uid :]
92
- new_vuid = increment_base62(vuid)
93
- new_uid = f"{suid}{new_vuid}"
94
- message = f"You already have a {transform_type} version family with key '{key}', suid '{transform.stem_uid}' & name '{transform.name}'.\n\n- to create a new {transform_type} version family, rename your file and rerun: ln.context.track()\n- to bump the version, set: "
95
- message += f'ln.context.uid = "{new_uid}"'
96
- if transform_type == "notebook":
97
- message += "\n\nRestart your notebook if you want consecutive cell execution."
98
- raise MissingContext(message)
99
-
100
-
101
- def pretty_pypackages(dependencies: dict) -> str:
102
- deps_list = []
103
- for pkg, ver in dependencies.items():
104
- if ver != "":
105
- deps_list.append(pkg + f"=={ver}")
106
- else:
107
- deps_list.append(pkg)
108
- deps_list.sort()
109
- return " ".join(deps_list)
110
-
111
-
112
- class Context:
113
- """Run context.
114
-
115
- Enables convenient data lineage tracking by managing a transform & run
116
- upon :meth:`~lamindb.core.Context.track` & :meth:`~lamindb.core.Context.finish`.
117
-
118
- Examples:
119
-
120
- Is typically used via :class:`~lamindb.context`:
121
-
122
- >>> import lamindb as ln
123
- >>> ln.context.track()
124
- >>> # do things while tracking data lineage
125
- >>> ln.context.finish()
126
-
127
- """
128
-
129
- def __init__(self):
130
- self._uid: str | None = None
131
- self._name: str | None = None
132
- self._version: str | None = None
133
- self._transform: Transform | None = None
134
- self._run: Run | None = None
135
- self._path: Path | None = None
136
- """A local path to the script that's running."""
137
- self._logging_message: str = ""
138
-
139
- @property
140
- def transform(self) -> Transform | None:
141
- """Transform of context."""
142
- return self._transform
143
-
144
- @property
145
- def uid(self) -> str | None:
146
- """`uid` to create transform."""
147
- return self._uid
148
-
149
- @uid.setter
150
- def uid(self, value: str | None):
151
- self._uid = value
152
-
153
- @property
154
- def name(self) -> str | None:
155
- """`name` to create transform."""
156
- return self._name
157
-
158
- @name.setter
159
- def name(self, value: str | None):
160
- self._name = value
161
-
162
- @property
163
- def version(self) -> str | None:
164
- """`version` to create transform."""
165
- return self._version
166
-
167
- @version.setter
168
- def version(self, value: str | None):
169
- self._version = value
170
-
171
- @property
172
- def run(self) -> Run | None:
173
- """Run of context."""
174
- return self._run
175
-
176
- def track(
177
- self,
178
- *,
179
- params: dict | None = None,
180
- new_run: bool | None = None,
181
- path: str | None = None,
182
- transform: Transform | None = None,
183
- ) -> None:
184
- """Starts data lineage tracking for a run.
185
-
186
- - sets :attr:`~lamindb.core.Context.transform` &
187
- :attr:`~lamindb.core.Context.run` by creating or loading `Transform` &
188
- `Run` records
189
- - saves compute environment as a `requirements.txt` file: `run.environment`
190
-
191
- If :attr:`~lamindb.core.Settings.sync_git_repo` is set, checks whether a
192
- script-like transform exists in a git repository and links it.
193
-
194
- Args:
195
- params: A dictionary of parameters to track for the run.
196
- new_run: If `False`, loads latest run of transform
197
- (default notebook), if `True`, creates new run (default pipeline).
198
- path: Filepath of notebook or script. Only needed if it can't be
199
- automatically detected.
200
- transform: Useful to track an abstract pipeline.
201
-
202
- Examples:
203
-
204
- To track the run of a notebook or script, call:
205
-
206
- >>> import lamindb as ln
207
- >>> ln.context.track()
208
-
209
- """
210
- self._path = None
211
- if transform is None:
212
- is_tracked = False
213
- transform_settings_are_set = (
214
- transform_settings.stem_uid is not None
215
- and transform_settings.version is not None
216
- )
217
- transform = None
218
- stem_uid = None
219
- if self.uid is not None:
220
- transform = Transform.filter(uid=self.uid).one_or_none()
221
- if self.version is not None:
222
- # test inconsistent version passed
223
- if (
224
- transform is not None
225
- and transform.version is not None
226
- and self.version != transform.version
227
- ):
228
- raise SystemExit(
229
- f"Please pass consistent version: ln.context.version = '{transform.version}'"
230
- )
231
- # test whether version was already used for another member of the family
232
- suid, vuid = (
233
- self.uid[: Transform._len_stem_uid],
234
- self.uid[Transform._len_stem_uid :],
235
- )
236
- transform = Transform.filter(
237
- uid__startswith=suid, version=self.version
238
- ).one_or_none()
239
- if (
240
- transform is not None
241
- and vuid != transform.uid[Transform._len_stem_uid :]
242
- ):
243
- better_version = bump_version_function(self.version)
244
- raise SystemExit(
245
- f"Version '{self.version}' is already taken by Transform(uid='{transform.uid}'); please set another version, e.g., ln.context.version = '{better_version}'"
246
- )
247
- elif transform_settings_are_set:
248
- stem_uid, self.version = (
249
- transform_settings.stem_uid,
250
- transform_settings.version,
251
- )
252
- transform = Transform.filter(
253
- uid__startswith=stem_uid, version=self.version
254
- ).one_or_none()
255
- if is_run_from_ipython:
256
- key, name = self._track_notebook(path=path)
257
- transform_type = "notebook"
258
- transform_ref = None
259
- transform_ref_type = None
260
- else:
261
- (name, key, transform_ref, transform_ref_type) = self._track_script(
262
- path=path
263
- )
264
- transform_type = "script"
265
- if self.uid is not None or transform_settings_are_set:
266
- # overwrite whatever is auto-detected in the notebook or script
267
- if self.name is not None:
268
- name = self.name
269
- self._create_or_load_transform(
270
- uid=self.uid,
271
- stem_uid=stem_uid,
272
- version=self.version,
273
- name=name,
274
- transform_ref=transform_ref,
275
- transform_ref_type=transform_ref_type,
276
- transform_type=transform_type,
277
- key=key,
278
- transform=transform,
279
- )
280
- # if no error is raised, the transform is tracked
281
- is_tracked = True
282
- if not is_tracked:
283
- raise_missing_context(transform_type, key)
284
- else:
285
- if transform.type in {"notebook", "script"}:
286
- raise ValueError(
287
- "Use ln.context.track() without passing transform in a notebook or script"
288
- " - metadata is automatically parsed"
289
- )
290
- transform_exists = None
291
- if transform.id is not None:
292
- # transform has an id but unclear whether already saved
293
- transform_exists = Transform.filter(id=transform.id).first()
294
- if transform_exists is None:
295
- transform.save()
296
- self._logging_message += f"created Transform(uid='{transform.uid}')"
297
- transform_exists = transform
298
- else:
299
- self._logging_message += f"loaded Transform(uid='{transform.uid}')"
300
- self._transform = transform_exists
301
-
302
- if new_run is None: # for notebooks, default to loading latest runs
303
- new_run = False if self._transform.type == "notebook" else True # type: ignore
304
-
305
- run = None
306
- if not new_run: # try loading latest run by same user
307
- run = (
308
- Run.filter(transform=self._transform, created_by_id=current_user_id())
309
- .order_by("-created_at")
310
- .first()
311
- )
312
- if run is not None: # loaded latest run
313
- run.started_at = datetime.now(timezone.utc) # update run time
314
- self._logging_message += (
315
- f" & loaded Run(started_at={format_field_value(run.started_at)})"
316
- )
317
-
318
- if run is None: # create new run
319
- run = Run(
320
- transform=self._transform,
321
- params=params,
322
- )
323
- run.started_at = datetime.now(timezone.utc)
324
- self._logging_message += (
325
- f" & created Run(started_at={format_field_value(run.started_at)})"
326
- )
327
- # can only determine at ln.finish() if run was consecutive in
328
- # interactive session, otherwise, is consecutive
329
- run.is_consecutive = True if is_run_from_ipython else None
330
- # need to save in all cases
331
- run.save()
332
- if params is not None:
333
- run.params.add_values(params)
334
- self._run = run
335
- track_environment(run)
336
- logger.important(self._logging_message)
337
- self._logging_message = ""
338
-
339
- def _track_script(
340
- self,
341
- *,
342
- path: UPathStr | None,
343
- ) -> tuple[str, str, str, str]:
344
- if path is None:
345
- import inspect
346
-
347
- frame = inspect.stack()[2]
348
- module = inspect.getmodule(frame[0])
349
- self._path = Path(module.__file__)
350
- else:
351
- self._path = Path(path)
352
- name = self._path.name
353
- key = name
354
- reference = None
355
- reference_type = None
356
- if settings.sync_git_repo is not None:
357
- reference = get_transform_reference_from_git_repo(self._path)
358
- reference_type = "url"
359
- return name, key, reference, reference_type
360
-
361
- def _track_notebook(
362
- self,
363
- *,
364
- path: str | None,
365
- ):
366
- if path is None:
367
- path = get_notebook_path()
368
- key = Path(path).name
369
- if isinstance(path, (Path, PurePath)):
370
- path_str = path.as_posix() # type: ignore
371
- else:
372
- path_str = str(path)
373
- if path_str.endswith("Untitled.ipynb"):
374
- raise RuntimeError("Please rename your notebook before tracking it")
375
- if path_str.startswith("/fileId="):
376
- name = get_notebook_name_colab()
377
- key = f"{name}.ipynb"
378
- else:
379
- import nbproject
380
-
381
- try:
382
- nbproject_title = nbproject.meta.live.title
383
- except IndexError:
384
- raise NotebookNotSavedError(
385
- "The notebook is not saved, please save the notebook and"
386
- " rerun `ln.context.track()`"
387
- ) from None
388
- if nbproject_title is None:
389
- raise NoTitleError(
390
- "Please add a title to your notebook in a markdown cell: # Title"
391
- ) from None
392
- name = nbproject_title
393
- # log imported python packages
394
- if not path_str.startswith("/fileId="):
395
- try:
396
- from nbproject.dev._pypackage import infer_pypackages
397
-
398
- nb = nbproject.dev.read_notebook(path_str)
399
- logger.important(
400
- "notebook imports:"
401
- f" {pretty_pypackages(infer_pypackages(nb, pin_versions=True))}"
402
- )
403
- except Exception:
404
- logger.debug("inferring imported packages failed")
405
- pass
406
- self._path = Path(path_str)
407
- return key, name
408
-
409
- def _create_or_load_transform(
410
- self,
411
- *,
412
- uid: str | None,
413
- stem_uid: str | None,
414
- version: str | None,
415
- name: str,
416
- transform_ref: str | None = None,
417
- transform_ref_type: str | None = None,
418
- key: str | None = None,
419
- transform_type: TransformType = None,
420
- transform: Transform | None = None,
421
- ):
422
- # make a new transform record
423
- if transform is None:
424
- if uid is None:
425
- uid = f"{stem_uid}{get_uid_ext(version)}"
426
- # note that here we're not passing revises because we're not querying it
427
- # hence, we need to do a revision family lookup based on key
428
- # hence, we need key to be not None
429
- assert key is not None # noqa: S101
430
- transform = Transform(
431
- uid=uid,
432
- version=version,
433
- name=name,
434
- key=key,
435
- reference=transform_ref,
436
- reference_type=transform_ref_type,
437
- type=transform_type,
438
- ).save()
439
- self._logging_message += f"created Transform(uid='{transform.uid}')"
440
- else:
441
- uid = transform.uid
442
- # check whether the transform file has been renamed
443
- if transform.key != key:
444
- suid = transform.stem_uid
445
- new_suid = ids.base62_12()
446
- transform_type = "Notebook" if is_run_from_ipython else "Script"
447
- note = message_update_key_in_version_family(
448
- suid=suid,
449
- existing_key=transform.key,
450
- new_key=key,
451
- registry="Transform",
452
- )
453
- raise UpdateContext(
454
- f"{transform_type} filename changed.\n\nEither init a new transform family by setting:\n\n"
455
- f'ln.context.uid = "{new_suid}0000"\n\n{note}'
456
- )
457
- elif transform.name != name:
458
- transform.name = name
459
- transform.save()
460
- self._logging_message += (
461
- "updated transform name, " # white space on purpose
462
- )
463
- # check whether transform source code was already saved
464
- if (
465
- transform._source_code_artifact_id is not None
466
- or transform.source_code is not None
467
- ):
468
- bump_revision = False
469
- if is_run_from_ipython:
470
- bump_revision = True
471
- else:
472
- hash, _ = hash_file(self._path) # ignore hash_type for now
473
- if transform.hash is not None:
474
- condition = hash != transform.hash
475
- else:
476
- condition = hash != transform._source_code_artifact.hash
477
- if condition:
478
- bump_revision = True
479
- else:
480
- self._logging_message += (
481
- f"loaded Transform(uid='{transform.uid}')"
482
- )
483
- if bump_revision:
484
- change_type = (
485
- "Re-running saved notebook"
486
- if is_run_from_ipython
487
- else "Source code changed"
488
- )
489
- suid, vuid = (
490
- uid[:-4],
491
- uid[-4:],
492
- )
493
- new_vuid = increment_base62(vuid)
494
- raise UpdateContext(
495
- f"{change_type}, bump revision by setting:\n\n"
496
- f'ln.context.uid = "{suid}{new_vuid}"'
497
- )
498
- else:
499
- self._logging_message += f"loaded Transform(uid='{transform.uid}')"
500
- self._transform = transform
501
-
502
- def finish(self, ignore_non_consecutive: None | bool = None) -> None:
503
- """Mark the run context as finished.
504
-
505
- - writes a timestamp: `run.finished_at`
506
- - saves the source code: `transform.source_code`
507
-
508
- When called in the last cell of a notebook:
509
-
510
- - prompts for user input if not consecutively executed
511
- - requires to save the notebook in your editor
512
- - saves a run report: `run.report`
513
-
514
- Args:
515
- ignore_non_consecutive: Whether to ignore if a notebook was non-consecutively executed.
516
-
517
- Examples:
518
-
519
- >>> import lamindb as ln
520
- >>> ln.context.track()
521
- >>> # do things while tracking data lineage
522
- >>> ln.context.finish()
523
-
524
- See Also:
525
- `lamin save script.py` or `lamin save notebook.ipynb` → `docs </cli#lamin-save>`__
526
-
527
- """
528
- from lamindb._finish import save_context_core
529
-
530
- def get_seconds_since_modified(filepath) -> float:
531
- return datetime.now().timestamp() - filepath.stat().st_mtime
532
-
533
- if context.run is None:
534
- raise TrackNotCalled("Please run `ln.context.track()` before `ln.finish()`")
535
- if context._path is None:
536
- if context.run.transform.type in {"script", "notebook"}:
537
- raise ValueError(
538
- f"Transform type is not allowed to be 'script' or 'notebook' but is {context.run.transform.type}."
539
- )
540
- context.run.finished_at = datetime.now(timezone.utc)
541
- context.run.save()
542
- # nothing else to do
543
- return None
544
- if is_run_from_ipython: # notebooks
545
- if get_seconds_since_modified(context._path) > 2 and not ln_setup._TESTING:
546
- raise NotebookFileNotSavedToDisk(
547
- "Please save the notebook manually in your editor right before running `ln.context.finish()`"
548
- )
549
- save_context_core(
550
- run=context.run,
551
- transform=context.run.transform,
552
- filepath=context._path,
553
- finished_at=True,
554
- ignore_non_consecutive=ignore_non_consecutive,
555
- )
556
-
557
-
558
- context = Context()
1
+ from __future__ import annotations
2
+
3
+ import builtins
4
+ import hashlib
5
+ from datetime import datetime, timezone
6
+ from pathlib import Path, PurePath
7
+ from typing import TYPE_CHECKING
8
+
9
+ import lamindb_setup as ln_setup
10
+ from lamin_utils import logger
11
+ from lamindb_setup.core.hashing import hash_file
12
+ from lnschema_core import Run, Transform, ids
13
+ from lnschema_core.ids import base62_12
14
+ from lnschema_core.models import format_field_value
15
+ from lnschema_core.users import current_user_id
16
+
17
+ from ._settings import settings
18
+ from ._sync_git import get_transform_reference_from_git_repo
19
+ from ._track_environment import track_environment
20
+ from .exceptions import (
21
+ MissingContextUID,
22
+ NotebookNotSaved,
23
+ NotebookNotSavedError,
24
+ NoTitleError,
25
+ TrackNotCalled,
26
+ UpdateContext,
27
+ )
28
+ from .subsettings._transform_settings import transform_settings
29
+ from .versioning import bump_version as bump_version_function
30
+ from .versioning import increment_base62, message_update_key_in_version_family
31
+
32
+ if TYPE_CHECKING:
33
+ from lamindb_setup.core.types import UPathStr
34
+ from lnschema_core.types import TransformType
35
+
36
+ is_run_from_ipython = getattr(builtins, "__IPYTHON__", False)
37
+
38
+ msg_path_failed = (
39
+ "failed to infer notebook path.\nfix: pass `path` to ln.context.track()"
40
+ )
41
+
42
+
43
+ def get_uid_ext(version: str) -> str:
44
+ from lamin_utils._base62 import encodebytes
45
+
46
+ # merely zero-padding the nbproject version such that the base62 encoding is
47
+ # at least 4 characters long doesn't yield sufficiently diverse hashes and
48
+ # leads to collisions; it'd be nice because the uid_ext would be ordered
49
+ return encodebytes(hashlib.md5(version.encode()).digest())[:4] # noqa: S324
50
+
51
+
52
+ def get_notebook_path():
53
+ from nbproject.dev._jupyter_communicate import (
54
+ notebook_path as get_notebook_path,
55
+ )
56
+
57
+ path = None
58
+ try:
59
+ path = get_notebook_path()
60
+ except Exception:
61
+ raise RuntimeError(msg_path_failed) from None
62
+ if path is None:
63
+ raise RuntimeError(msg_path_failed) from None
64
+ return path
65
+
66
+
67
+ # from https://stackoverflow.com/questions/61901628
68
+ def get_notebook_name_colab() -> str:
69
+ from socket import gethostbyname, gethostname # type: ignore
70
+
71
+ from requests import get # type: ignore
72
+
73
+ ip = gethostbyname(gethostname()) # 172.28.0.12
74
+ try:
75
+ name = get(f"http://{ip}:9000/api/sessions").json()[0]["name"] # noqa: S113
76
+ except Exception:
77
+ logger.warning(
78
+ "could not get notebook name from Google Colab, using: notebook.ipynb"
79
+ )
80
+ name = "notebook.ipynb"
81
+ return name.rstrip(".ipynb")
82
+
83
+
84
+ def raise_missing_context(transform_type: str, key: str) -> bool:
85
+ transform = Transform.filter(key=key).latest_version().first()
86
+ if transform is None:
87
+ new_uid = f"{base62_12()}0000"
88
+ message = f"To track this {transform_type}, copy & paste the below into the current cell and re-run it\n\n"
89
+ message += f'ln.context.uid = "{new_uid}"\nln.context.track()'
90
+ else:
91
+ uid = transform.uid
92
+ suid, vuid = uid[: Transform._len_stem_uid], uid[Transform._len_stem_uid :]
93
+ new_vuid = increment_base62(vuid)
94
+ new_uid = f"{suid}{new_vuid}"
95
+ message = f"You already have a version family with key '{key}' (stem_uid='{transform.stem_uid}').\n\n- to make a revision, set `ln.context.uid = '{new_uid}'`\n- to start a new version family, rename your file and rerun: `ln.context.track()`"
96
+ if transform_type == "notebook":
97
+ print(f"→ {message}\n")
98
+ response = input("→ Ready to re-run? (y/n)")
99
+ if response == "y":
100
+ logger.important(
101
+ "Note: Restart your notebook if you want consecutive cell execution"
102
+ )
103
+ return True
104
+ raise MissingContextUID("Please follow the instructions.")
105
+ else:
106
+ raise MissingContextUID(message)
107
+ return False
108
+
109
+
110
+ def pretty_pypackages(dependencies: dict) -> str:
111
+ deps_list = []
112
+ for pkg, ver in dependencies.items():
113
+ if ver != "":
114
+ deps_list.append(pkg + f"=={ver}")
115
+ else:
116
+ deps_list.append(pkg)
117
+ deps_list.sort()
118
+ return " ".join(deps_list)
119
+
120
+
121
+ class Context:
122
+ """Run context.
123
+
124
+ Enables convenient data lineage tracking by managing a transform & run
125
+ upon :meth:`~lamindb.core.Context.track` & :meth:`~lamindb.core.Context.finish`.
126
+
127
+ Examples:
128
+
129
+ Is typically used via :class:`~lamindb.context`:
130
+
131
+ >>> import lamindb as ln
132
+ >>> ln.context.track()
133
+ >>> # do things while tracking data lineage
134
+ >>> ln.context.finish()
135
+
136
+ """
137
+
138
+ def __init__(self):
139
+ self._uid: str | None = None
140
+ self._name: str | None = None
141
+ self._version: str | None = None
142
+ self._transform: Transform | None = None
143
+ self._run: Run | None = None
144
+ self._path: Path | None = None
145
+ """A local path to the script that's running."""
146
+ self._logging_message: str = ""
147
+
148
+ @property
149
+ def transform(self) -> Transform | None:
150
+ """Transform of context."""
151
+ return self._transform
152
+
153
+ @property
154
+ def uid(self) -> str | None:
155
+ """`uid` to create transform."""
156
+ return self._uid
157
+
158
+ @uid.setter
159
+ def uid(self, value: str | None):
160
+ self._uid = value
161
+
162
+ @property
163
+ def name(self) -> str | None:
164
+ """`name` to create transform."""
165
+ return self._name
166
+
167
+ @name.setter
168
+ def name(self, value: str | None):
169
+ self._name = value
170
+
171
+ @property
172
+ def version(self) -> str | None:
173
+ """`version` to create transform."""
174
+ return self._version
175
+
176
+ @version.setter
177
+ def version(self, value: str | None):
178
+ self._version = value
179
+
180
+ @property
181
+ def run(self) -> Run | None:
182
+ """Run of context."""
183
+ return self._run
184
+
185
+ def track(
186
+ self,
187
+ *,
188
+ params: dict | None = None,
189
+ new_run: bool | None = None,
190
+ path: str | None = None,
191
+ transform: Transform | None = None,
192
+ ) -> None:
193
+ """Starts data lineage tracking for a run.
194
+
195
+ - sets :attr:`~lamindb.core.Context.transform` &
196
+ :attr:`~lamindb.core.Context.run` by creating or loading `Transform` &
197
+ `Run` records
198
+ - saves compute environment as a `requirements.txt` file: `run.environment`
199
+
200
+ If :attr:`~lamindb.core.Settings.sync_git_repo` is set, checks whether a
201
+ script-like transform exists in a git repository and links it.
202
+
203
+ Args:
204
+ params: A dictionary of parameters to track for the run.
205
+ new_run: If `False`, loads latest run of transform
206
+ (default notebook), if `True`, creates new run (default pipeline).
207
+ path: Filepath of notebook or script. Only needed if it can't be
208
+ automatically detected.
209
+ transform: Useful to track an abstract pipeline.
210
+
211
+ Examples:
212
+
213
+ To track the run of a notebook or script, call:
214
+
215
+ >>> import lamindb as ln
216
+ >>> ln.context.track()
217
+
218
+ """
219
+ self._path = None
220
+ if transform is None:
221
+ is_tracked = False
222
+ transform_settings_are_set = (
223
+ transform_settings.stem_uid is not None
224
+ and transform_settings.version is not None
225
+ )
226
+ transform = None
227
+ stem_uid = None
228
+ if self.uid is not None:
229
+ transform = Transform.filter(uid=self.uid).one_or_none()
230
+ if self.version is not None:
231
+ # test inconsistent version passed
232
+ if (
233
+ transform is not None
234
+ and transform.version is not None
235
+ and self.version != transform.version
236
+ ):
237
+ raise SystemExit(
238
+ f"Please pass consistent version: ln.context.version = '{transform.version}'"
239
+ )
240
+ # test whether version was already used for another member of the family
241
+ suid, vuid = (
242
+ self.uid[: Transform._len_stem_uid],
243
+ self.uid[Transform._len_stem_uid :],
244
+ )
245
+ transform = Transform.filter(
246
+ uid__startswith=suid, version=self.version
247
+ ).one_or_none()
248
+ if (
249
+ transform is not None
250
+ and vuid != transform.uid[Transform._len_stem_uid :]
251
+ ):
252
+ better_version = bump_version_function(self.version)
253
+ raise SystemExit(
254
+ f"Version '{self.version}' is already taken by Transform(uid='{transform.uid}'); please set another version, e.g., ln.context.version = '{better_version}'"
255
+ )
256
+ elif transform_settings_are_set:
257
+ stem_uid, self.version = (
258
+ transform_settings.stem_uid,
259
+ transform_settings.version,
260
+ )
261
+ transform = Transform.filter(
262
+ uid__startswith=stem_uid, version=self.version
263
+ ).one_or_none()
264
+ if is_run_from_ipython:
265
+ key, name = self._track_notebook(path=path)
266
+ transform_type = "notebook"
267
+ transform_ref = None
268
+ transform_ref_type = None
269
+ else:
270
+ (name, key, transform_ref, transform_ref_type) = self._track_script(
271
+ path=path
272
+ )
273
+ transform_type = "script"
274
+ if self.uid is not None or transform_settings_are_set:
275
+ # overwrite whatever is auto-detected in the notebook or script
276
+ if self.name is not None:
277
+ name = self.name
278
+ self._create_or_load_transform(
279
+ uid=self.uid,
280
+ stem_uid=stem_uid,
281
+ version=self.version,
282
+ name=name,
283
+ transform_ref=transform_ref,
284
+ transform_ref_type=transform_ref_type,
285
+ transform_type=transform_type,
286
+ key=key,
287
+ transform=transform,
288
+ )
289
+ # if no error is raised, the transform is tracked
290
+ is_tracked = True
291
+ if not is_tracked:
292
+ early_return = raise_missing_context(transform_type, key)
293
+ if early_return:
294
+ return None
295
+ else:
296
+ if transform.type in {"notebook", "script"}:
297
+ raise ValueError(
298
+ "Use ln.context.track() without passing transform in a notebook or script"
299
+ " - metadata is automatically parsed"
300
+ )
301
+ transform_exists = None
302
+ if transform.id is not None:
303
+ # transform has an id but unclear whether already saved
304
+ transform_exists = Transform.filter(id=transform.id).first()
305
+ if transform_exists is None:
306
+ transform.save()
307
+ self._logging_message += f"created Transform(uid='{transform.uid}')"
308
+ transform_exists = transform
309
+ else:
310
+ self._logging_message += f"loaded Transform(uid='{transform.uid}')"
311
+ self._transform = transform_exists
312
+
313
+ if new_run is None: # for notebooks, default to loading latest runs
314
+ new_run = False if self._transform.type == "notebook" else True # type: ignore
315
+
316
+ run = None
317
+ if not new_run: # try loading latest run by same user
318
+ run = (
319
+ Run.filter(transform=self._transform, created_by_id=current_user_id())
320
+ .order_by("-created_at")
321
+ .first()
322
+ )
323
+ if run is not None: # loaded latest run
324
+ run.started_at = datetime.now(timezone.utc) # update run time
325
+ self._logging_message += (
326
+ f" & loaded Run(started_at={format_field_value(run.started_at)})"
327
+ )
328
+
329
+ if run is None: # create new run
330
+ run = Run(
331
+ transform=self._transform,
332
+ params=params,
333
+ )
334
+ run.started_at = datetime.now(timezone.utc)
335
+ self._logging_message += (
336
+ f" & created Run(started_at={format_field_value(run.started_at)})"
337
+ )
338
+ # can only determine at ln.finish() if run was consecutive in
339
+ # interactive session, otherwise, is consecutive
340
+ run.is_consecutive = True if is_run_from_ipython else None
341
+ # need to save in all cases
342
+ run.save()
343
+ if params is not None:
344
+ run.params.add_values(params)
345
+ self._run = run
346
+ track_environment(run)
347
+ logger.important(self._logging_message)
348
+ self._logging_message = ""
349
+
350
def _track_script(
    self,
    *,
    path: UPathStr | None,
) -> tuple[str, str, str | None, str | None]:
    """Resolve the script being tracked and derive its transform metadata.

    The resolved location is stored on ``self._path``.

    Args:
        path: Location of the script. If ``None``, the file of the module
            that owns the frame two levels up the call stack is used.

    Returns:
        A tuple ``(name, key, reference, reference_type)``. ``name`` and
        ``key`` are both the file name of the script. ``reference`` and
        ``reference_type`` are ``None`` unless ``settings.sync_git_repo`` is
        set; in that case ``reference`` is the value returned by
        ``get_transform_reference_from_git_repo`` and ``reference_type`` is
        ``"url"``.

    Raises:
        RuntimeError: If ``path`` is ``None`` and no module file can be
            determined from the call stack.
    """
    if path is None:
        import inspect

        # stack()[0] is this method and stack()[1] its direct caller, so
        # index 2 is the frame that invoked the caller
        # (presumably the user's script -- depends on the call chain)
        frame = inspect.stack()[2]
        module = inspect.getmodule(frame[0])
        # getmodule() may return None and some modules carry no __file__;
        # fail with an actionable message instead of an AttributeError
        module_file = getattr(module, "__file__", None)
        if module_file is None:
            raise RuntimeError(
                "Could not infer the script file from the call stack, please"
                " pass `path` explicitly"
            )
        self._path = Path(module_file)
    else:
        self._path = Path(path)
    name = self._path.name
    key = name
    reference = None
    reference_type = None
    if settings.sync_git_repo is not None:
        reference = get_transform_reference_from_git_repo(self._path)
        reference_type = "url"
    return name, key, reference, reference_type
371
+
372
+ def _track_notebook(
373
+ self,
374
+ *,
375
+ path: str | None,
376
+ ):
377
+ if path is None:
378
+ path = get_notebook_path()
379
+ key = Path(path).name
380
+ if isinstance(path, (Path, PurePath)):
381
+ path_str = path.as_posix() # type: ignore
382
+ else:
383
+ path_str = str(path)
384
+ if path_str.endswith("Untitled.ipynb"):
385
+ raise RuntimeError("Please rename your notebook before tracking it")
386
+ if path_str.startswith("/fileId="):
387
+ name = get_notebook_name_colab()
388
+ key = f"{name}.ipynb"
389
+ else:
390
+ import nbproject
391
+
392
+ try:
393
+ nbproject_title = nbproject.meta.live.title
394
+ except IndexError:
395
+ raise NotebookNotSavedError(
396
+ "The notebook is not saved, please save the notebook and"
397
+ " rerun `ln.context.track()`"
398
+ ) from None
399
+ if nbproject_title is None:
400
+ raise NoTitleError(
401
+ "Please add a title to your notebook in a markdown cell: # Title"
402
+ ) from None
403
+ name = nbproject_title
404
+ # log imported python packages
405
+ if not path_str.startswith("/fileId="):
406
+ try:
407
+ from nbproject.dev._pypackage import infer_pypackages
408
+
409
+ nb = nbproject.dev.read_notebook(path_str)
410
+ logger.important(
411
+ "notebook imports:"
412
+ f" {pretty_pypackages(infer_pypackages(nb, pin_versions=True))}"
413
+ )
414
+ except Exception:
415
+ logger.debug("inferring imported packages failed")
416
+ pass
417
+ self._path = Path(path_str)
418
+ return key, name
419
+
420
+ def _create_or_load_transform(
421
+ self,
422
+ *,
423
+ uid: str | None,
424
+ stem_uid: str | None,
425
+ version: str | None,
426
+ name: str,
427
+ transform_ref: str | None = None,
428
+ transform_ref_type: str | None = None,
429
+ key: str | None = None,
430
+ transform_type: TransformType = None,
431
+ transform: Transform | None = None,
432
+ ):
433
+ # make a new transform record
434
+ if transform is None:
435
+ if uid is None:
436
+ uid = f"{stem_uid}{get_uid_ext(version)}"
437
+ # note that here we're not passing revises because we're not querying it
438
+ # hence, we need to do a revision family lookup based on key
439
+ # hence, we need key to be not None
440
+ assert key is not None # noqa: S101
441
+ transform = Transform(
442
+ uid=uid,
443
+ version=version,
444
+ name=name,
445
+ key=key,
446
+ reference=transform_ref,
447
+ reference_type=transform_ref_type,
448
+ type=transform_type,
449
+ ).save()
450
+ self._logging_message += f"created Transform(uid='{transform.uid}')"
451
+ else:
452
+ uid = transform.uid
453
+ # check whether the transform file has been renamed
454
+ if transform.key != key:
455
+ suid = transform.stem_uid
456
+ new_suid = ids.base62_12()
457
+ transform_type = "Notebook" if is_run_from_ipython else "Script"
458
+ note = message_update_key_in_version_family(
459
+ suid=suid,
460
+ existing_key=transform.key,
461
+ new_key=key,
462
+ registry="Transform",
463
+ )
464
+ raise UpdateContext(
465
+ f"{transform_type} filename changed.\n\nEither init a new transform family by setting:\n\n"
466
+ f'ln.context.uid = "{new_suid}0000"\n\n{note}'
467
+ )
468
+ elif transform.name != name:
469
+ transform.name = name
470
+ transform.save()
471
+ self._logging_message += (
472
+ "updated transform name, " # white space on purpose
473
+ )
474
+ # check whether transform source code was already saved
475
+ if (
476
+ transform._source_code_artifact_id is not None
477
+ or transform.source_code is not None
478
+ ):
479
+ bump_revision = False
480
+ if is_run_from_ipython:
481
+ bump_revision = True
482
+ else:
483
+ hash, _ = hash_file(self._path) # ignore hash_type for now
484
+ if transform.hash is not None:
485
+ condition = hash != transform.hash
486
+ else:
487
+ condition = hash != transform._source_code_artifact.hash
488
+ if condition:
489
+ bump_revision = True
490
+ else:
491
+ self._logging_message += (
492
+ f"loaded Transform(uid='{transform.uid}')"
493
+ )
494
+ if bump_revision:
495
+ change_type = (
496
+ "Re-running saved notebook"
497
+ if is_run_from_ipython
498
+ else "Source code changed"
499
+ )
500
+ suid, vuid = (
501
+ uid[:-4],
502
+ uid[-4:],
503
+ )
504
+ new_vuid = increment_base62(vuid)
505
+ raise UpdateContext(
506
+ f"{change_type}, bump revision by setting:\n\n"
507
+ f'ln.context.uid = "{suid}{new_vuid}"'
508
+ )
509
+ else:
510
+ self._logging_message += f"loaded Transform(uid='{transform.uid}')"
511
+ self._transform = transform
512
+
513
def finish(self, ignore_non_consecutive: None | bool = None) -> None:
    """Mark the run context as finished.

    - writes a timestamp: `run.finished_at`
    - saves the source code: `transform.source_code`

    When called in the last cell of a notebook:

    - prompts for user input if not consecutively executed
    - requires to save the notebook in your editor right before
    - saves a run report: `run.report`

    If no file path was recorded for this context, only `run.finished_at`
    is set and the run is saved.

    Args:
        ignore_non_consecutive: Whether to ignore if a notebook was non-consecutively executed.

    Raises:
        TrackNotCalled: If this context has no run.
        ValueError: If no file path was recorded but the transform type is
            'script' or 'notebook'.
        NotebookNotSaved: If run from IPython and the notebook file was
            modified more than 2 seconds ago (skipped while testing).

    Examples:

        >>> import lamindb as ln
        >>> ln.context.track()
        >>> # do things while tracking data lineage
        >>> ln.context.finish()

    See Also:
        `lamin save script.py` or `lamin save notebook.ipynb` `docs </cli#lamin-save>`__

    """

    def get_seconds_since_modified(filepath) -> float:
        return datetime.now().timestamp() - filepath.stat().st_mtime

    def get_shortcut() -> str:
        import platform

        return "CMD + s" if platform.system() == "Darwin" else "CTRL + s"

    # operate on this instance rather than on the module-level singleton so
    # that the method is correct for any Context object
    run = self.run
    if run is None:
        raise TrackNotCalled("Please run `ln.context.track()` before `ln.finish()`")
    if self._path is None:
        if run.transform.type in {"script", "notebook"}:
            raise ValueError(
                f"Transform type is not allowed to be 'script' or 'notebook' but is {run.transform.type}."
            )
        run.finished_at = datetime.now(timezone.utc)
        run.save()
        # nothing else to do
        return None
    if is_run_from_ipython:  # notebooks
        if get_seconds_since_modified(self._path) > 2 and not ln_setup._TESTING:
            raise NotebookNotSaved(
                f"Please save the notebook in your editor (shortcut `{get_shortcut()}`) right before calling `ln.context.finish()`"
            )
    # imported at the point of use; the early exits above do not need it
    from lamindb._finish import save_context_core

    save_context_core(
        run=run,
        transform=run.transform,
        filepath=self._path,
        finished_at=True,
        ignore_non_consecutive=ignore_non_consecutive,
    )
572
+
573
+
574
# Module-level `Context` instance, created when this module is imported.
context = Context()