lamindb 0.76.8__py3-none-any.whl → 0.76.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. lamindb/__init__.py +114 -113
  2. lamindb/_artifact.py +1206 -1205
  3. lamindb/_can_validate.py +621 -579
  4. lamindb/_collection.py +390 -387
  5. lamindb/_curate.py +1603 -1601
  6. lamindb/_feature.py +155 -155
  7. lamindb/_feature_set.py +244 -242
  8. lamindb/_filter.py +23 -23
  9. lamindb/_finish.py +250 -256
  10. lamindb/_from_values.py +403 -382
  11. lamindb/_is_versioned.py +40 -40
  12. lamindb/_parents.py +476 -476
  13. lamindb/_query_manager.py +125 -125
  14. lamindb/_query_set.py +364 -362
  15. lamindb/_record.py +668 -649
  16. lamindb/_run.py +60 -57
  17. lamindb/_save.py +310 -308
  18. lamindb/_storage.py +14 -14
  19. lamindb/_transform.py +130 -127
  20. lamindb/_ulabel.py +56 -56
  21. lamindb/_utils.py +9 -9
  22. lamindb/_view.py +72 -72
  23. lamindb/core/__init__.py +94 -94
  24. lamindb/core/_context.py +590 -574
  25. lamindb/core/_data.py +510 -438
  26. lamindb/core/_django.py +209 -0
  27. lamindb/core/_feature_manager.py +994 -867
  28. lamindb/core/_label_manager.py +289 -253
  29. lamindb/core/_mapped_collection.py +631 -597
  30. lamindb/core/_settings.py +188 -187
  31. lamindb/core/_sync_git.py +138 -138
  32. lamindb/core/_track_environment.py +27 -27
  33. lamindb/core/datasets/__init__.py +59 -59
  34. lamindb/core/datasets/_core.py +581 -571
  35. lamindb/core/datasets/_fake.py +36 -36
  36. lamindb/core/exceptions.py +90 -90
  37. lamindb/core/fields.py +12 -12
  38. lamindb/core/loaders.py +164 -164
  39. lamindb/core/schema.py +56 -56
  40. lamindb/core/storage/__init__.py +25 -25
  41. lamindb/core/storage/_anndata_accessor.py +741 -740
  42. lamindb/core/storage/_anndata_sizes.py +41 -41
  43. lamindb/core/storage/_backed_access.py +98 -98
  44. lamindb/core/storage/_tiledbsoma.py +204 -204
  45. lamindb/core/storage/_valid_suffixes.py +21 -21
  46. lamindb/core/storage/_zarr.py +110 -110
  47. lamindb/core/storage/objects.py +62 -62
  48. lamindb/core/storage/paths.py +172 -172
  49. lamindb/core/subsettings/__init__.py +12 -12
  50. lamindb/core/subsettings/_creation_settings.py +38 -38
  51. lamindb/core/subsettings/_transform_settings.py +21 -21
  52. lamindb/core/types.py +19 -19
  53. lamindb/core/versioning.py +146 -158
  54. lamindb/integrations/__init__.py +12 -12
  55. lamindb/integrations/_vitessce.py +107 -107
  56. lamindb/setup/__init__.py +14 -14
  57. lamindb/setup/core/__init__.py +4 -4
  58. {lamindb-0.76.8.dist-info → lamindb-0.76.10.dist-info}/LICENSE +201 -201
  59. {lamindb-0.76.8.dist-info → lamindb-0.76.10.dist-info}/METADATA +8 -8
  60. lamindb-0.76.10.dist-info/RECORD +61 -0
  61. {lamindb-0.76.8.dist-info → lamindb-0.76.10.dist-info}/WHEEL +1 -1
  62. lamindb-0.76.8.dist-info/RECORD +0 -60
lamindb/core/_context.py CHANGED
@@ -1,574 +1,590 @@
1
- from __future__ import annotations
2
-
3
- import builtins
4
- import hashlib
5
- from datetime import datetime, timezone
6
- from pathlib import Path, PurePath
7
- from typing import TYPE_CHECKING
8
-
9
- import lamindb_setup as ln_setup
10
- from lamin_utils import logger
11
- from lamindb_setup.core.hashing import hash_file
12
- from lnschema_core import Run, Transform, ids
13
- from lnschema_core.ids import base62_12
14
- from lnschema_core.models import format_field_value
15
- from lnschema_core.users import current_user_id
16
-
17
- from ._settings import settings
18
- from ._sync_git import get_transform_reference_from_git_repo
19
- from ._track_environment import track_environment
20
- from .exceptions import (
21
- MissingContextUID,
22
- NotebookNotSaved,
23
- NotebookNotSavedError,
24
- NoTitleError,
25
- TrackNotCalled,
26
- UpdateContext,
27
- )
28
- from .subsettings._transform_settings import transform_settings
29
- from .versioning import bump_version as bump_version_function
30
- from .versioning import increment_base62, message_update_key_in_version_family
31
-
32
- if TYPE_CHECKING:
33
- from lamindb_setup.core.types import UPathStr
34
- from lnschema_core.types import TransformType
35
-
36
- is_run_from_ipython = getattr(builtins, "__IPYTHON__", False)
37
-
38
- msg_path_failed = (
39
- "failed to infer notebook path.\nfix: pass `path` to ln.context.track()"
40
- )
41
-
42
-
43
- def get_uid_ext(version: str) -> str:
44
- from lamin_utils._base62 import encodebytes
45
-
46
- # merely zero-padding the nbproject version such that the base62 encoding is
47
- # at least 4 characters long doesn't yields sufficiently diverse hashes and
48
- # leads to collisions; it'd be nice because the uid_ext would be ordered
49
- return encodebytes(hashlib.md5(version.encode()).digest())[:4] # noqa: S324
50
-
51
-
52
- def get_notebook_path():
53
- from nbproject.dev._jupyter_communicate import (
54
- notebook_path as get_notebook_path,
55
- )
56
-
57
- path = None
58
- try:
59
- path = get_notebook_path()
60
- except Exception:
61
- raise RuntimeError(msg_path_failed) from None
62
- if path is None:
63
- raise RuntimeError(msg_path_failed) from None
64
- return path
65
-
66
-
67
- # from https://stackoverflow.com/questions/61901628
68
- def get_notebook_name_colab() -> str:
69
- from socket import gethostbyname, gethostname # type: ignore
70
-
71
- from requests import get # type: ignore
72
-
73
- ip = gethostbyname(gethostname()) # 172.28.0.12
74
- try:
75
- name = get(f"http://{ip}:9000/api/sessions").json()[0]["name"] # noqa: S113
76
- except Exception:
77
- logger.warning(
78
- "could not get notebook name from Google Colab, using: notebook.ipynb"
79
- )
80
- name = "notebook.ipynb"
81
- return name.rstrip(".ipynb")
82
-
83
-
84
- def raise_missing_context(transform_type: str, key: str) -> bool:
85
- transform = Transform.filter(key=key).latest_version().first()
86
- if transform is None:
87
- new_uid = f"{base62_12()}0000"
88
- message = f"To track this {transform_type}, copy & paste the below into the current cell and re-run it\n\n"
89
- message += f'ln.context.uid = "{new_uid}"\nln.context.track()'
90
- else:
91
- uid = transform.uid
92
- suid, vuid = uid[: Transform._len_stem_uid], uid[Transform._len_stem_uid :]
93
- new_vuid = increment_base62(vuid)
94
- new_uid = f"{suid}{new_vuid}"
95
- message = f"You already have a version family with key '{key}' (stem_uid='{transform.stem_uid}').\n\n- to make a revision, set `ln.context.uid = '{new_uid}'`\n- to start a new version family, rename your file and rerun: `ln.context.track()`"
96
- if transform_type == "notebook":
97
- print(f"→ {message}\n")
98
- response = input(" Ready to re-run? (y/n)")
99
- if response == "y":
100
- logger.important(
101
- "Note: Restart your notebook if you want consecutive cell execution"
102
- )
103
- return True
104
- raise MissingContextUID("Please follow the instructions.")
105
- else:
106
- raise MissingContextUID(message)
107
- return False
108
-
109
-
110
- def pretty_pypackages(dependencies: dict) -> str:
111
- deps_list = []
112
- for pkg, ver in dependencies.items():
113
- if ver != "":
114
- deps_list.append(pkg + f"=={ver}")
115
- else:
116
- deps_list.append(pkg)
117
- deps_list.sort()
118
- return " ".join(deps_list)
119
-
120
-
121
- class Context:
122
- """Run context.
123
-
124
- Enables convenient data lineage tracking by managing a transform & run
125
- upon :meth:`~lamindb.core.Context.track` & :meth:`~lamindb.core.Context.finish`.
126
-
127
- Examples:
128
-
129
- Is typically used via :class:`~lamindb.context`:
130
-
131
- >>> import lamindb as ln
132
- >>> ln.context.track()
133
- >>> # do things while tracking data lineage
134
- >>> ln.context.finish()
135
-
136
- """
137
-
138
- def __init__(self):
139
- self._uid: str | None = None
140
- self._name: str | None = None
141
- self._version: str | None = None
142
- self._transform: Transform | None = None
143
- self._run: Run | None = None
144
- self._path: Path | None = None
145
- """A local path to the script that's running."""
146
- self._logging_message: str = ""
147
-
148
- @property
149
- def transform(self) -> Transform | None:
150
- """Transform of context."""
151
- return self._transform
152
-
153
- @property
154
- def uid(self) -> str | None:
155
- """`uid` to create transform."""
156
- return self._uid
157
-
158
- @uid.setter
159
- def uid(self, value: str | None):
160
- self._uid = value
161
-
162
- @property
163
- def name(self) -> str | None:
164
- """`name` to create transform."""
165
- return self._name
166
-
167
- @name.setter
168
- def name(self, value: str | None):
169
- self._name = value
170
-
171
- @property
172
- def version(self) -> str | None:
173
- """`version` to create transform."""
174
- return self._version
175
-
176
- @version.setter
177
- def version(self, value: str | None):
178
- self._version = value
179
-
180
- @property
181
- def run(self) -> Run | None:
182
- """Run of context."""
183
- return self._run
184
-
185
- def track(
186
- self,
187
- *,
188
- params: dict | None = None,
189
- new_run: bool | None = None,
190
- path: str | None = None,
191
- transform: Transform | None = None,
192
- ) -> None:
193
- """Starts data lineage tracking for a run.
194
-
195
- - sets :attr:`~lamindb.core.Context.transform` &
196
- :attr:`~lamindb.core.Context.run` by creating or loading `Transform` &
197
- `Run` records
198
- - saves compute environment as a `requirements.txt` file: `run.environment`
199
-
200
- If :attr:`~lamindb.core.Settings.sync_git_repo` is set, checks whether a
201
- script-like transform exists in a git repository and links it.
202
-
203
- Args:
204
- params: A dictionary of parameters to track for the run.
205
- new_run: If `False`, loads latest run of transform
206
- (default notebook), if `True`, creates new run (default pipeline).
207
- path: Filepath of notebook or script. Only needed if it can't be
208
- automatically detected.
209
- transform: Useful to track an abstract pipeline.
210
-
211
- Examples:
212
-
213
- To track the run of a notebook or script, call:
214
-
215
- >>> import lamindb as ln
216
- >>> ln.context.track()
217
-
218
- """
219
- self._path = None
220
- if transform is None:
221
- is_tracked = False
222
- transform_settings_are_set = (
223
- transform_settings.stem_uid is not None
224
- and transform_settings.version is not None
225
- )
226
- transform = None
227
- stem_uid = None
228
- if self.uid is not None:
229
- transform = Transform.filter(uid=self.uid).one_or_none()
230
- if self.version is not None:
231
- # test inconsistent version passed
232
- if (
233
- transform is not None
234
- and transform.version is not None
235
- and self.version != transform.version
236
- ):
237
- raise SystemExit(
238
- f"Please pass consistent version: ln.context.version = '{transform.version}'"
239
- )
240
- # test whether version was already used for another member of the family
241
- suid, vuid = (
242
- self.uid[: Transform._len_stem_uid],
243
- self.uid[Transform._len_stem_uid :],
244
- )
245
- transform = Transform.filter(
246
- uid__startswith=suid, version=self.version
247
- ).one_or_none()
248
- if (
249
- transform is not None
250
- and vuid != transform.uid[Transform._len_stem_uid :]
251
- ):
252
- better_version = bump_version_function(self.version)
253
- raise SystemExit(
254
- f"Version '{self.version}' is already taken by Transform(uid='{transform.uid}'); please set another version, e.g., ln.context.version = '{better_version}'"
255
- )
256
- elif transform_settings_are_set:
257
- stem_uid, self.version = (
258
- transform_settings.stem_uid,
259
- transform_settings.version,
260
- )
261
- transform = Transform.filter(
262
- uid__startswith=stem_uid, version=self.version
263
- ).one_or_none()
264
- if is_run_from_ipython:
265
- key, name = self._track_notebook(path=path)
266
- transform_type = "notebook"
267
- transform_ref = None
268
- transform_ref_type = None
269
- else:
270
- (name, key, transform_ref, transform_ref_type) = self._track_script(
271
- path=path
272
- )
273
- transform_type = "script"
274
- if self.uid is not None or transform_settings_are_set:
275
- # overwrite whatever is auto-detected in the notebook or script
276
- if self.name is not None:
277
- name = self.name
278
- self._create_or_load_transform(
279
- uid=self.uid,
280
- stem_uid=stem_uid,
281
- version=self.version,
282
- name=name,
283
- transform_ref=transform_ref,
284
- transform_ref_type=transform_ref_type,
285
- transform_type=transform_type,
286
- key=key,
287
- transform=transform,
288
- )
289
- # if no error is raised, the transform is tracked
290
- is_tracked = True
291
- if not is_tracked:
292
- early_return = raise_missing_context(transform_type, key)
293
- if early_return:
294
- return None
295
- else:
296
- if transform.type in {"notebook", "script"}:
297
- raise ValueError(
298
- "Use ln.context.track() without passing transform in a notebook or script"
299
- " - metadata is automatically parsed"
300
- )
301
- transform_exists = None
302
- if transform.id is not None:
303
- # transform has an id but unclear whether already saved
304
- transform_exists = Transform.filter(id=transform.id).first()
305
- if transform_exists is None:
306
- transform.save()
307
- self._logging_message += f"created Transform(uid='{transform.uid}')"
308
- transform_exists = transform
309
- else:
310
- self._logging_message += f"loaded Transform(uid='{transform.uid}')"
311
- self._transform = transform_exists
312
-
313
- if new_run is None: # for notebooks, default to loading latest runs
314
- new_run = False if self._transform.type == "notebook" else True # type: ignore
315
-
316
- run = None
317
- if not new_run: # try loading latest run by same user
318
- run = (
319
- Run.filter(transform=self._transform, created_by_id=current_user_id())
320
- .order_by("-created_at")
321
- .first()
322
- )
323
- if run is not None: # loaded latest run
324
- run.started_at = datetime.now(timezone.utc) # update run time
325
- self._logging_message += (
326
- f" & loaded Run(started_at={format_field_value(run.started_at)})"
327
- )
328
-
329
- if run is None: # create new run
330
- run = Run(
331
- transform=self._transform,
332
- params=params,
333
- )
334
- run.started_at = datetime.now(timezone.utc)
335
- self._logging_message += (
336
- f" & created Run(started_at={format_field_value(run.started_at)})"
337
- )
338
- # can only determine at ln.finish() if run was consecutive in
339
- # interactive session, otherwise, is consecutive
340
- run.is_consecutive = True if is_run_from_ipython else None
341
- # need to save in all cases
342
- run.save()
343
- if params is not None:
344
- run.params.add_values(params)
345
- self._run = run
346
- track_environment(run)
347
- logger.important(self._logging_message)
348
- self._logging_message = ""
349
-
350
- def _track_script(
351
- self,
352
- *,
353
- path: UPathStr | None,
354
- ) -> tuple[str, str, str, str]:
355
- if path is None:
356
- import inspect
357
-
358
- frame = inspect.stack()[2]
359
- module = inspect.getmodule(frame[0])
360
- self._path = Path(module.__file__)
361
- else:
362
- self._path = Path(path)
363
- name = self._path.name
364
- key = name
365
- reference = None
366
- reference_type = None
367
- if settings.sync_git_repo is not None:
368
- reference = get_transform_reference_from_git_repo(self._path)
369
- reference_type = "url"
370
- return name, key, reference, reference_type
371
-
372
- def _track_notebook(
373
- self,
374
- *,
375
- path: str | None,
376
- ):
377
- if path is None:
378
- path = get_notebook_path()
379
- key = Path(path).name
380
- if isinstance(path, (Path, PurePath)):
381
- path_str = path.as_posix() # type: ignore
382
- else:
383
- path_str = str(path)
384
- if path_str.endswith("Untitled.ipynb"):
385
- raise RuntimeError("Please rename your notebook before tracking it")
386
- if path_str.startswith("/fileId="):
387
- name = get_notebook_name_colab()
388
- key = f"{name}.ipynb"
389
- else:
390
- import nbproject
391
-
392
- try:
393
- nbproject_title = nbproject.meta.live.title
394
- except IndexError:
395
- raise NotebookNotSavedError(
396
- "The notebook is not saved, please save the notebook and"
397
- " rerun `ln.context.track()`"
398
- ) from None
399
- if nbproject_title is None:
400
- raise NoTitleError(
401
- "Please add a title to your notebook in a markdown cell: # Title"
402
- ) from None
403
- name = nbproject_title
404
- # log imported python packages
405
- if not path_str.startswith("/fileId="):
406
- try:
407
- from nbproject.dev._pypackage import infer_pypackages
408
-
409
- nb = nbproject.dev.read_notebook(path_str)
410
- logger.important(
411
- "notebook imports:"
412
- f" {pretty_pypackages(infer_pypackages(nb, pin_versions=True))}"
413
- )
414
- except Exception:
415
- logger.debug("inferring imported packages failed")
416
- pass
417
- self._path = Path(path_str)
418
- return key, name
419
-
420
- def _create_or_load_transform(
421
- self,
422
- *,
423
- uid: str | None,
424
- stem_uid: str | None,
425
- version: str | None,
426
- name: str,
427
- transform_ref: str | None = None,
428
- transform_ref_type: str | None = None,
429
- key: str | None = None,
430
- transform_type: TransformType = None,
431
- transform: Transform | None = None,
432
- ):
433
- # make a new transform record
434
- if transform is None:
435
- if uid is None:
436
- uid = f"{stem_uid}{get_uid_ext(version)}"
437
- # note that here we're not passing revises because we're not querying it
438
- # hence, we need to do a revision family lookup based on key
439
- # hence, we need key to be not None
440
- assert key is not None # noqa: S101
441
- transform = Transform(
442
- uid=uid,
443
- version=version,
444
- name=name,
445
- key=key,
446
- reference=transform_ref,
447
- reference_type=transform_ref_type,
448
- type=transform_type,
449
- ).save()
450
- self._logging_message += f"created Transform(uid='{transform.uid}')"
451
- else:
452
- uid = transform.uid
453
- # check whether the transform file has been renamed
454
- if transform.key != key:
455
- suid = transform.stem_uid
456
- new_suid = ids.base62_12()
457
- transform_type = "Notebook" if is_run_from_ipython else "Script"
458
- note = message_update_key_in_version_family(
459
- suid=suid,
460
- existing_key=transform.key,
461
- new_key=key,
462
- registry="Transform",
463
- )
464
- raise UpdateContext(
465
- f"{transform_type} filename changed.\n\nEither init a new transform family by setting:\n\n"
466
- f'ln.context.uid = "{new_suid}0000"\n\n{note}'
467
- )
468
- elif transform.name != name:
469
- transform.name = name
470
- transform.save()
471
- self._logging_message += (
472
- "updated transform name, " # white space on purpose
473
- )
474
- # check whether transform source code was already saved
475
- if (
476
- transform._source_code_artifact_id is not None
477
- or transform.source_code is not None
478
- ):
479
- bump_revision = False
480
- if is_run_from_ipython:
481
- bump_revision = True
482
- else:
483
- hash, _ = hash_file(self._path) # ignore hash_type for now
484
- if transform.hash is not None:
485
- condition = hash != transform.hash
486
- else:
487
- condition = hash != transform._source_code_artifact.hash
488
- if condition:
489
- bump_revision = True
490
- else:
491
- self._logging_message += (
492
- f"loaded Transform(uid='{transform.uid}')"
493
- )
494
- if bump_revision:
495
- change_type = (
496
- "Re-running saved notebook"
497
- if is_run_from_ipython
498
- else "Source code changed"
499
- )
500
- suid, vuid = (
501
- uid[:-4],
502
- uid[-4:],
503
- )
504
- new_vuid = increment_base62(vuid)
505
- raise UpdateContext(
506
- f"{change_type}, bump revision by setting:\n\n"
507
- f'ln.context.uid = "{suid}{new_vuid}"'
508
- )
509
- else:
510
- self._logging_message += f"loaded Transform(uid='{transform.uid}')"
511
- self._transform = transform
512
-
513
- def finish(self, ignore_non_consecutive: None | bool = None) -> None:
514
- """Mark the run context as finished.
515
-
516
- - writes a timestamp: `run.finished_at`
517
- - saves the source code: `transform.source_code`
518
-
519
- When called in the last cell of a notebook:
520
-
521
- - prompts for user input if not consecutively executed
522
- - requires to save the notebook in your editor right before
523
- - saves a run report: `run.report`
524
-
525
- Args:
526
- ignore_non_consecutive: Whether to ignore if a notebook was non-consecutively executed.
527
-
528
- Examples:
529
-
530
- >>> import lamindb as ln
531
- >>> ln.context.track()
532
- >>> # do things while tracking data lineage
533
- >>> ln.context.finish()
534
-
535
- See Also:
536
- `lamin save script.py` or `lamin save notebook.ipynb` → `docs </cli#lamin-save>`__
537
-
538
- """
539
- from lamindb._finish import save_context_core
540
-
541
- def get_seconds_since_modified(filepath) -> float:
542
- return datetime.now().timestamp() - filepath.stat().st_mtime
543
-
544
- def get_shortcut() -> str:
545
- import platform
546
-
547
- return "CMD + s" if platform.system() == "Darwin" else "CTRL + s"
548
-
549
- if context.run is None:
550
- raise TrackNotCalled("Please run `ln.context.track()` before `ln.finish()`")
551
- if context._path is None:
552
- if context.run.transform.type in {"script", "notebook"}:
553
- raise ValueError(
554
- f"Transform type is not allowed to be 'script' or 'notebook' but is {context.run.transform.type}."
555
- )
556
- context.run.finished_at = datetime.now(timezone.utc)
557
- context.run.save()
558
- # nothing else to do
559
- return None
560
- if is_run_from_ipython: # notebooks
561
- if get_seconds_since_modified(context._path) > 2 and not ln_setup._TESTING:
562
- raise NotebookNotSaved(
563
- f"Please save the notebook in your editor (shortcut `{get_shortcut()}`) right before calling `ln.context.finish()`"
564
- )
565
- save_context_core(
566
- run=context.run,
567
- transform=context.run.transform,
568
- filepath=context._path,
569
- finished_at=True,
570
- ignore_non_consecutive=ignore_non_consecutive,
571
- )
572
-
573
-
574
- context = Context()
1
+ from __future__ import annotations
2
+
3
+ import builtins
4
+ import hashlib
5
+ from datetime import datetime, timezone
6
+ from pathlib import Path, PurePath
7
+ from typing import TYPE_CHECKING
8
+
9
+ import lamindb_setup as ln_setup
10
+ from lamin_utils import logger
11
+ from lamindb_setup.core.hashing import hash_file
12
+ from lnschema_core import Run, Transform, ids
13
+ from lnschema_core.ids import base62_12
14
+ from lnschema_core.models import format_field_value
15
+
16
+ from ._settings import settings
17
+ from ._sync_git import get_transform_reference_from_git_repo
18
+ from ._track_environment import track_environment
19
+ from .exceptions import (
20
+ InconsistentKey,
21
+ MissingContextUID,
22
+ NotebookNotSaved,
23
+ NoTitleError,
24
+ TrackNotCalled,
25
+ UpdateContext,
26
+ )
27
+ from .subsettings._transform_settings import transform_settings
28
+ from .versioning import bump_version as bump_version_function
29
+ from .versioning import increment_base62, message_update_key_in_version_family
30
+
31
+ if TYPE_CHECKING:
32
+ from lamindb_setup.core.types import UPathStr
33
+ from lnschema_core.types import TransformType
34
+
35
+ is_run_from_ipython = getattr(builtins, "__IPYTHON__", False)
36
+
37
+ msg_path_failed = "failed to infer notebook path.\nfix: pass `path` to `ln.track()`"
38
+
39
+
40
+ def get_uid_ext(version: str) -> str:
41
+ from lamin_utils._base62 import encodebytes
42
+
43
+ # merely zero-padding the nbproject version such that the base62 encoding is
44
+ # at least 4 characters long doesn't yield sufficiently diverse hashes and
45
+ # leads to collisions; it'd be nice because the uid_ext would be ordered
46
+ return encodebytes(hashlib.md5(version.encode()).digest())[:4] # noqa: S324
47
+
48
+
49
+ def get_notebook_path():
50
+ from nbproject.dev._jupyter_communicate import (
51
+ notebook_path as get_notebook_path,
52
+ )
53
+
54
+ path = None
55
+ try:
56
+ path = get_notebook_path()
57
+ except Exception:
58
+ raise RuntimeError(msg_path_failed) from None
59
+ if path is None:
60
+ raise RuntimeError(msg_path_failed) from None
61
+ return path
62
+
63
+
64
+ # from https://stackoverflow.com/questions/61901628
65
+ def get_notebook_name_colab() -> str:
66
+ from socket import gethostbyname, gethostname # type: ignore
67
+
68
+ from requests import get # type: ignore
69
+
70
+ ip = gethostbyname(gethostname()) # 172.28.0.12
71
+ try:
72
+ name = get(f"http://{ip}:9000/api/sessions").json()[0]["name"] # noqa: S113
73
+ except Exception:
74
+ logger.warning(
75
+ "could not get notebook name from Google Colab, using: notebook.ipynb"
76
+ )
77
+ name = "notebook.ipynb"
78
+ return name.rstrip(".ipynb")
79
+
80
+
81
+ def raise_missing_context(transform_type: str, key: str) -> bool:
82
+ transform = Transform.filter(key=key).latest_version().first()
83
+ if transform is None:
84
+ new_uid = f"{base62_12()}0000"
85
+ message = f'to track this {transform_type}, copy & paste `ln.track("{new_uid}")` and re-run'
86
+ else:
87
+ uid = transform.uid
88
+ new_uid = f"{uid[:-4]}{increment_base62(uid[-4:])}"
89
+ message = f"you already have a transform with key '{key}' ('{transform.uid}')\n - to make a revision, call `ln.track('{new_uid}')`\n - to create a new transform, rename your file and run: `ln.track()`"
90
+ if transform_type == "notebook":
91
+ print(f"→ {message}")
92
+ response = input("→ Ready to re-run? (y/n)")
93
+ if response == "y":
94
+ logger.important(
95
+ "note: restart your notebook if you want consecutive cell execution"
96
+ )
97
+ return True
98
+ raise MissingContextUID("Please follow the instructions.")
99
+ else:
100
+ raise MissingContextUID(f"✗ {message}")
101
+ return False
102
+
103
+
104
+ def pretty_pypackages(dependencies: dict) -> str:
105
+ deps_list = []
106
+ for pkg, ver in dependencies.items():
107
+ if ver != "":
108
+ deps_list.append(pkg + f"=={ver}")
109
+ else:
110
+ deps_list.append(pkg)
111
+ deps_list.sort()
112
+ return " ".join(deps_list)
113
+
114
+
115
+ class Context:
116
+ """Run context.
117
+
118
+ Enables convenient data lineage tracking by managing a transform & run
119
+ upon :meth:`~lamindb.core.Context.track` & :meth:`~lamindb.core.Context.finish`.
120
+
121
+ Examples:
122
+
123
+ Typically used through the global :class:`~lamindb.context` object by calling `ln.track()` and `ln.finish()`:
124
+
125
+ >>> import lamindb as ln
126
+ >>> ln.track()
127
+ >>> # do things
128
+ >>> ln.finish()
129
+
130
+ """
131
+
132
+ def __init__(self):
133
+ self._uid: str | None = None
134
+ self._name: str | None = None
135
+ self._version: str | None = None
136
+ self._transform: Transform | None = None
137
+ self._run: Run | None = None
138
+ self._path: Path | None = None
139
+ """A local path to the script that's running."""
140
+ self._logging_message: str = ""
141
+
142
+ @property
143
+ def transform(self) -> Transform | None:
144
+ """Managed transform of context."""
145
+ return self._transform
146
+
147
+ @property
148
+ def uid(self) -> str | None:
149
+ """`uid` argument for `context.transform`."""
150
+ return self._uid
151
+
152
+ @uid.setter
153
+ def uid(self, value: str | None):
154
+ self._uid = value
155
+
156
+ @property
157
+ def name(self) -> str | None:
158
+ """`name` argument for `context.transform`."""
159
+ return self._name
160
+
161
+ @name.setter
162
+ def name(self, value: str | None):
163
+ self._name = value
164
+
165
+ @property
166
+ def version(self) -> str | None:
167
+ """`version` argument for `context.transform`."""
168
+ return self._version
169
+
170
+ @version.setter
171
+ def version(self, value: str | None):
172
+ self._version = value
173
+
174
+ @property
175
+ def run(self) -> Run | None:
176
+ """Managed run of context."""
177
+ return self._run
178
+
179
+ def track(
180
+ self,
181
+ uid: str | None = None,
182
+ *,
183
+ params: dict | None = None,
184
+ new_run: bool | None = None,
185
+ path: str | None = None,
186
+ transform: Transform | None = None,
187
+ ) -> None:
188
+ """Initiate a run with tracked data lineage.
189
+
190
+ - sets :attr:`~lamindb.core.Context.transform` &
191
+ :attr:`~lamindb.core.Context.run` by creating or loading `Transform` &
192
+ `Run` records
193
+ - saves compute environment as a `requirements.txt` file: `run.environment`
194
+
195
+ If :attr:`~lamindb.core.Settings.sync_git_repo` is set, checks whether a
196
+ script-like transform exists in a git repository and links it.
197
+
198
+ Args:
199
+ uid: A `uid` to create or load a transform.
200
+ params: A dictionary of parameters to track for the run.
201
+ new_run: If `False`, loads latest run of transform
202
+ (default notebook), if `True`, creates new run (default pipeline).
203
+ path: Filepath of notebook or script. Only needed if it can't be
204
+ automatically detected.
205
+ transform: Useful to track an abstract pipeline.
206
+
207
+ Examples:
208
+
209
+ To track the run of a notebook or script, call:
210
+
211
+ >>> import lamindb as ln
212
+ >>> ln.track()
213
+
214
+ """
215
+ if uid is not None:
216
+ self.uid = uid
217
+ self._path = None
218
+ if transform is None:
219
+ is_tracked = False
220
+ transform_settings_are_set = (
221
+ transform_settings.stem_uid is not None
222
+ and transform_settings.version is not None
223
+ )
224
+ transform = None
225
+ stem_uid = None
226
+ if uid is not None or self.uid is not None:
227
+ transform = Transform.filter(uid=self.uid).one_or_none()
228
+ if self.version is not None:
229
+ # test inconsistent version passed
230
+ if (
231
+ transform is not None
232
+ and transform.version is not None
233
+ and self.version != transform.version
234
+ ):
235
+ raise SystemExit(
236
+ f"Please pass consistent version: ln.context.version = '{transform.version}'"
237
+ )
238
+ # test whether version was already used for another member of the family
239
+ suid, vuid = (
240
+ self.uid[: Transform._len_stem_uid],
241
+ self.uid[Transform._len_stem_uid :],
242
+ )
243
+ transform = Transform.filter(
244
+ uid__startswith=suid, version=self.version
245
+ ).one_or_none()
246
+ if (
247
+ transform is not None
248
+ and vuid != transform.uid[Transform._len_stem_uid :]
249
+ ):
250
+ better_version = bump_version_function(self.version)
251
+ raise SystemExit(
252
+ f"Version '{self.version}' is already taken by Transform(uid='{transform.uid}'); please set another version, e.g., ln.context.version = '{better_version}'"
253
+ )
254
+ elif transform_settings_are_set:
255
+ stem_uid, self.version = (
256
+ transform_settings.stem_uid,
257
+ transform_settings.version,
258
+ )
259
+ transform = Transform.filter(
260
+ uid__startswith=stem_uid, version=self.version
261
+ ).one_or_none()
262
+ if is_run_from_ipython:
263
+ key, name = self._track_notebook(path=path)
264
+ transform_type = "notebook"
265
+ transform_ref = None
266
+ transform_ref_type = None
267
+ else:
268
+ (name, key, transform_ref, transform_ref_type) = self._track_script(
269
+ path=path
270
+ )
271
+ transform_type = "script"
272
+ if self.uid is not None or transform_settings_are_set:
273
+ # overwrite whatever is auto-detected in the notebook or script
274
+ if self.name is not None:
275
+ name = self.name
276
+ self._create_or_load_transform(
277
+ uid=self.uid,
278
+ stem_uid=stem_uid,
279
+ version=self.version,
280
+ name=name,
281
+ transform_ref=transform_ref,
282
+ transform_ref_type=transform_ref_type,
283
+ transform_type=transform_type,
284
+ key=key,
285
+ transform=transform,
286
+ )
287
+ # if no error is raised, the transform is tracked
288
+ is_tracked = True
289
+ if not is_tracked:
290
+ early_return = raise_missing_context(transform_type, key)
291
+ if early_return:
292
+ return None
293
+ else:
294
+ if transform.type in {"notebook", "script"}:
295
+ raise ValueError(
296
+ "Use `ln.track()` without passing transform in a notebook or script"
297
+ " - metadata is automatically parsed"
298
+ )
299
+ transform_exists = None
300
+ if transform.id is not None:
301
+ # transform has an id but unclear whether already saved
302
+ transform_exists = Transform.filter(id=transform.id).first()
303
+ if transform_exists is None:
304
+ transform.save()
305
+ self._logging_message += f"created Transform('{transform.uid[:8]}')"
306
+ transform_exists = transform
307
+ else:
308
+ self._logging_message += f"loaded Transform('{transform.uid[:8]}')"
309
+ self._transform = transform_exists
310
+
311
+ if new_run is None: # for notebooks, default to loading latest runs
312
+ new_run = False if self._transform.type == "notebook" else True # type: ignore
313
+
314
+ run = None
315
+ if not new_run: # try loading latest run by same user
316
+ run = (
317
+ Run.filter(
318
+ transform=self._transform, created_by_id=ln_setup.settings.user.id
319
+ )
320
+ .order_by("-created_at")
321
+ .first()
322
+ )
323
+ if run is not None: # loaded latest run
324
+ run.started_at = datetime.now(timezone.utc) # update run time
325
+ self._logging_message += f", started Run('{run.uid[:8]}') at {format_field_value(run.started_at)}"
326
+
327
+ if run is None: # create new run
328
+ run = Run(
329
+ transform=self._transform,
330
+ params=params,
331
+ )
332
+ run.started_at = datetime.now(timezone.utc)
333
+ self._logging_message += f", started new Run('{run.uid[:8]}') at {format_field_value(run.started_at)}"
334
+ # can only determine at ln.finish() if run was consecutive in
335
+ # interactive session, otherwise, is consecutive
336
+ run.is_consecutive = True if is_run_from_ipython else None
337
+ # need to save in all cases
338
+ run.save()
339
+ if params is not None:
340
+ run.params.add_values(params)
341
+ self._logging_message += "\n→ params: " + " ".join(
342
+ f"{key}='{value}'" for key, value in params.items()
343
+ )
344
+ self._run = run
345
+ track_environment(run)
346
+ logger.important(self._logging_message)
347
+ self._logging_message = ""
348
+
349
def _track_script(
    self,
    *,
    path: UPathStr | None,
) -> tuple[str, str, str | None, str | None]:
    """Resolve the script file to track and derive its transform metadata.

    Stores the resolved location on `self._path`.

    Args:
        path: Location of the script. If `None`, the file is looked up from
            the call stack (the frame two levels above this method).

    Returns:
        A tuple `(name, key, reference, reference_type)`. `name` and `key`
        are both the file name of the script. `reference` and
        `reference_type` are `None` unless `settings.sync_git_repo` is set;
        then `reference` is whatever `get_transform_reference_from_git_repo`
        returns for the script and `reference_type` is `"url"`.
    """
    if path is None:
        import inspect

        # index 0 is this method, index 1 its direct caller, index 2 the
        # caller's caller (presumably the user code that started tracking)
        caller = inspect.stack()[2]
        module = inspect.getmodule(caller[0])
        # `inspect.getmodule` returns None when the frame cannot be mapped to
        # a module; fall back to the file name recorded on the frame instead
        # of failing with an AttributeError on `None.__file__`
        module_file = getattr(module, "__file__", None)
        self._path = Path(module_file if module_file is not None else caller.filename)
    else:
        self._path = Path(path)
    name = self._path.name
    key = name
    reference = None
    reference_type = None
    if settings.sync_git_repo is not None:
        reference = get_transform_reference_from_git_repo(self._path)
        reference_type = "url"
    return name, key, reference, reference_type
370
+
371
+ def _track_notebook(
372
+ self,
373
+ *,
374
+ path: str | None,
375
+ ):
376
+ if path is None:
377
+ path = get_notebook_path()
378
+ key = Path(path).name
379
+ if isinstance(path, (Path, PurePath)):
380
+ path_str = path.as_posix() # type: ignore
381
+ else:
382
+ path_str = str(path)
383
+ if path_str.endswith("Untitled.ipynb"):
384
+ raise RuntimeError("Please rename your notebook before tracking it")
385
+ if path_str.startswith("/fileId="):
386
+ name = get_notebook_name_colab()
387
+ key = f"{name}.ipynb"
388
+ else:
389
+ import nbproject
390
+
391
+ try:
392
+ nbproject_title = nbproject.meta.live.title
393
+ except IndexError:
394
+ raise NotebookNotSaved(
395
+ "The notebook is not saved, please save the notebook and"
396
+ " rerun ``"
397
+ ) from None
398
+ if nbproject_title is None:
399
+ raise NoTitleError(
400
+ "Please add a title to your notebook in a markdown cell: # Title"
401
+ ) from None
402
+ name = nbproject_title
403
+ # log imported python packages
404
+ if not path_str.startswith("/fileId="):
405
+ try:
406
+ from nbproject.dev._pypackage import infer_pypackages
407
+
408
+ nb = nbproject.dev.read_notebook(path_str)
409
+ logger.important(
410
+ "notebook imports:"
411
+ f" {pretty_pypackages(infer_pypackages(nb, pin_versions=True))}"
412
+ )
413
+ except Exception:
414
+ logger.debug("inferring imported packages failed")
415
+ pass
416
+ self._path = Path(path_str)
417
+ return key, name
418
+
419
def _create_or_load_transform(
    self,
    *,
    uid: str | None,
    stem_uid: str | None,
    version: str | None,
    name: str,
    transform_ref: str | None = None,
    transform_ref_type: str | None = None,
    key: str | None = None,
    transform_type: TransformType = None,
    transform: Transform | None = None,
):
    """Create a new transform record or validate and load an existing one.

    Sets `self._transform` and appends a status note to
    `self._logging_message`.

    Args:
        uid: Full uid of the transform. If `None`, it is built from
            `stem_uid` and `get_uid_ext(version)` when a record is created.
        stem_uid: Uid prefix, only used when `uid` is `None`.
        version: Version stored on a newly created transform.
        name: Name of the transform; an existing record is renamed to it.
        transform_ref: Stored as `reference` on a new transform.
        transform_ref_type: Stored as `reference_type` on a new transform.
        key: Key (file name) of the notebook or script. Must not be `None`
            when a new record is created.
        transform_type: Stored as `type` on a new transform.
        transform: Already existing record. If `None`, a new one is created.

    Raises:
        UpdateContext: If `key` clashes with the key of the existing record,
            if another user works on an unsaved draft, or if a transform
            whose source code was already saved needs a new revision.
    """

    def get_key_clashing_message(existing: Transform, new_key: str) -> str:
        # explains both ways out of a key clash: a fresh uid or a key update
        update_key_note = message_update_key_in_version_family(
            suid=existing.stem_uid,
            existing_key=existing.key,
            new_key=new_key,
            registry="Transform",
        )
        return (
            f'Filename "{new_key}" clashes with the existing key "{existing.key}" for uid "{existing.uid[:-4]}...."\n\nEither init a new transform with a new uid:\n\n'
            f'ln.track("{ids.base62_12()}0000")\n\n{update_key_note}'
        )

    # make a new transform record
    if transform is None:
        if uid is None:
            uid = f"{stem_uid}{get_uid_ext(version)}"
        # query the latest record whose uid shares everything but the last 4
        # characters; it is passed to the constructor as `revises` and reused
        # for the error message below
        revises = (
            Transform.filter(uid__startswith=uid[:-4], is_latest=True)
            .order_by("-created_at")
            .first()
        )
        # the key-consistency handling below needs a key
        assert key is not None  # noqa: S101
        raise_update_context = False
        try:
            transform = Transform(
                uid=uid,
                version=version,
                name=name,
                key=key,
                reference=transform_ref,
                reference_type=transform_ref_type,
                type=transform_type,
                revises=revises,
            ).save()
        except InconsistentKey:
            raise_update_context = True
        # raised outside of the `except` block so that the UpdateContext is
        # not chained to the InconsistentKey
        if raise_update_context:
            raise UpdateContext(get_key_clashing_message(revises, key))
        self._logging_message += f"created Transform('{transform.uid[:8]}')"
    else:
        uid = transform.uid
        # transform was already saved via `finish()`
        transform_was_saved = (
            transform._source_code_artifact_id is not None
            or transform.source_code is not None
        )
        # check whether the transform.key is consistent
        if transform.key != key:
            raise UpdateContext(get_key_clashing_message(transform, key))
        elif transform.name != name:
            transform.name = name
            transform.save()
            self._logging_message += (
                "updated transform name, "  # white space on purpose
            )
        elif (
            transform.created_by_id != ln_setup.settings.user.id
            and not transform_was_saved
        ):
            raise UpdateContext(
                f'{transform.created_by.name} ({transform.created_by.handle}) already works on this draft {transform.type}.\n\nPlease create a revision via `ln.track("{uid[:-4]}{increment_base62(uid[-4:])}")` or a new transform with a *different* filename and `ln.track("{ids.base62_12()}0000")`.'
            )
        # check whether transform source code was already saved
        if transform_was_saved:
            if is_run_from_ipython:
                # a saved notebook always requires a new revision
                bump_revision = True
            else:
                file_hash, _ = hash_file(self._path)  # ignore hash_type for now
                if transform.hash is not None:
                    bump_revision = file_hash != transform.hash
                else:
                    bump_revision = file_hash != transform._source_code_artifact.hash
            if bump_revision:
                change_type = (
                    "Re-running saved notebook"
                    if is_run_from_ipython
                    else "Source code changed"
                )
                raise UpdateContext(
                    f"{change_type}, bump revision by setting:\n\n"
                    f'ln.track("{uid[:-4]}{increment_base62(uid[-4:])}")'
                )
        # reached for unsaved drafts and for scripts with unchanged source
        self._logging_message += f"loaded Transform('{transform.uid[:8]}')"
    self._transform = transform
528
+
529
def finish(self, ignore_non_consecutive: None | bool = None) -> None:
    """Finish a tracked run.

    - writes a timestamp: `run.finished_at`
    - saves the source code: `transform.source_code`

    When called in the last cell of a notebook:

    - prompts for user input if not consecutively executed
    - requires to save the notebook in your editor right before
    - saves a run report: `run.report`

    Args:
        ignore_non_consecutive: Whether to ignore if a notebook was non-consecutively executed.

    Raises:
        TrackNotCalled: If no run is being tracked.
        ValueError: If no file path was tracked but the transform is of type
            `"script"` or `"notebook"`.
        NotebookNotSaved: If, in a notebook, the file was last modified more
            than 2 seconds ago (not checked while `ln_setup._TESTING` is set).

    Examples:

        >>> import lamindb as ln
        >>> ln.track()
        >>> # do things while tracking data lineage
        >>> ln.finish()

    See Also:
        `lamin save script.py` or `lamin save notebook.ipynb` `docs </cli#lamin-save>`__

    """
    from lamindb._finish import save_context_core

    def get_seconds_since_modified(filepath) -> float:
        return datetime.now().timestamp() - filepath.stat().st_mtime

    def get_shortcut() -> str:
        import platform

        return "CMD + s" if platform.system() == "Darwin" else "CTRL + s"

    # operate on this instance rather than on the module-level `context`
    # so that the method works for any Context object
    if self.run is None:
        raise TrackNotCalled("Please run `ln.track()` before `ln.finish()`")
    if self._path is None:
        # no file was tracked: only the finish timestamp can be recorded
        if self.run.transform.type in {"script", "notebook"}:
            raise ValueError(
                f"Transform type is not allowed to be 'script' or 'notebook' but is {self.run.transform.type}."
            )
        self.run.finished_at = datetime.now(timezone.utc)
        self.run.save()
        # nothing else to do
        return None
    if is_run_from_ipython:  # notebooks
        if get_seconds_since_modified(self._path) > 2 and not ln_setup._TESTING:
            raise NotebookNotSaved(
                f"Please save the notebook in your editor (shortcut `{get_shortcut()}`) right before calling `ln.finish()`"
            )
    save_context_core(
        run=self.run,
        transform=self.run.transform,
        filepath=self._path,
        finished_at=True,
        ignore_non_consecutive=ignore_non_consecutive,
    )
588
+
589
+
590
+ context = Context()  # module-level Context instance, created when this module is imported