lamindb 0.63.5__py3-none-any.whl → 0.64.1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
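
The dominant change visible below is the rename of the `File` registry to `Artifact` across imports, type hints, and accessors (e.g. `Dataset.file`/`Dataset.files` become `Dataset.artifact`/`Dataset.artifacts`). As a rough, hedged sketch of what this looks like from user code (based only on the import and attribute changes in this diff, not on the full public API):

import lamindb as ln

# 0.63.x code worked with File records; 0.64.x works with Artifact records
artifact = ln.filter(ln.Artifact, uid="...").one()  # "..." is a placeholder uid
dataset = ln.Dataset(artifact)  # Dataset now links an Artifact instead of a File
dataset.save()                  # also saves dataset.artifact
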
lamindb/_dataset.py CHANGED
@@ -1,15 +1,10 @@
  from collections import defaultdict
- from pathlib import Path
  from typing import Dict, Iterable, List, Literal, Optional, Tuple, Union

  import anndata as ad
  import pandas as pd
  from lamin_utils import logger
- from lamindb_setup._init_instance import register_storage
- from lamindb_setup.dev import StorageSettings
  from lamindb_setup.dev._docs import doc_args
- from lamindb_setup.dev._hub_utils import get_storage_region
- from lamindb_setup.dev.upath import UPath
  from lnschema_core.models import Dataset, Feature, FeatureSet
  from lnschema_core.types import AnnDataLike, DataLike, FieldAttr, VisibilityChoice

@@ -19,8 +14,8 @@ from lamindb.dev._mapped_dataset import MappedDataset
  from lamindb.dev.storage._backed_access import AnnDataAccessor, BackedAccessor
  from lamindb.dev.versioning import get_ids_from_old_version, init_uid

- from . import _TESTING, File, Run
- from ._file import parse_feature_sets_from_anndata
+ from . import _TESTING, Artifact, Run
+ from ._artifact import parse_feature_sets_from_anndata
  from ._registry import init_self_from_db
  from .dev._data import (
  add_transform_to_kwargs,
@@ -42,7 +37,7 @@ def __init__(
  # now we proceed with the user-facing constructor
  if len(args) > 1:
  raise ValueError("Only one non-keyword arg allowed: data")
- data: Union[pd.DataFrame, ad.AnnData, File, Iterable[File]] = (
+ data: Union[pd.DataFrame, ad.AnnData, Artifact, Iterable[Artifact]] = (
  kwargs.pop("data") if len(args) == 0 else args[0]
  )
  meta: Optional[str] = kwargs.pop("meta") if "meta" in kwargs else None
@@ -96,70 +91,56 @@ def __init__(

  run = get_run(run)
  data_init_complete = False
- file = None
- files = None
- storage = None
- # init from directory or bucket
- if isinstance(data, (str, Path, UPath)):
- upath = UPath(data)
- # below frequently times out on GCP
- # comment this and corresponding test out
- # if not upath.is_dir():
- # raise ValueError(f"Can only pass buckets or directories, not {data}")
- upath_str = upath.as_posix().rstrip("/")
- region = get_storage_region(upath_str)
- storage_settings = StorageSettings(upath_str, region)
- storage = register_storage(storage_settings)
- hash = None
- data_init_complete = True
+ artifact = None
+ artifacts = None
  # now handle potential metadata
  if meta is not None:
- if not isinstance(meta, (pd.DataFrame, ad.AnnData, File)):
+ if not isinstance(meta, (pd.DataFrame, ad.AnnData, Artifact)):
  raise ValueError(
- "meta has to be of type `(pd.DataFrame, ad.AnnData, File)`"
+ "meta has to be of type `(pd.DataFrame, ad.AnnData, Artifact)`"
  )
  data = meta
- # init file - is either data or metadata
- if isinstance(data, (pd.DataFrame, ad.AnnData, File)):
- if isinstance(data, File):
- file = data
- if file._state.adding:
- raise ValueError("Save file before creating dataset!")
+ # init artifact - is either data or metadata
+ if isinstance(data, (pd.DataFrame, ad.AnnData, Artifact)):
+ if isinstance(data, Artifact):
+ artifact = data
+ if artifact._state.adding:
+ raise ValueError("Save artifact before creating dataset!")
  if not feature_sets:
- feature_sets = file.features._feature_set_by_slot
+ feature_sets = artifact.features._feature_set_by_slot
  else:
- if len(file.features._feature_set_by_slot) > 0:
- logger.info("overwriting feature sets linked to file")
+ if len(artifact.features._feature_set_by_slot) > 0:
+ logger.info("overwriting feature sets linked to artifact")
  else:
  log_hint = True if feature_sets is None else False
- file_is_new_version_of = (
- is_new_version_of.file if is_new_version_of is not None else None
+ artifact_is_new_version_of = (
+ is_new_version_of.artifact if is_new_version_of is not None else None
  )
- file = File(
+ artifact = Artifact(
  data,
  run=run,
  description="tmp",
  log_hint=log_hint,
  version=version,
- is_new_version_of=file_is_new_version_of,
+ is_new_version_of=artifact_is_new_version_of,
  )
- # do we really want to update the file here?
+ # do we really want to update the artifact here?
  if feature_sets:
- file._feature_sets = feature_sets
- hash = file.hash # type: ignore
- provisional_uid = file.uid # type: ignore
- if file.description is None or file.description == "tmp":
- file.description = f"See dataset {provisional_uid}" # type: ignore
+ artifact._feature_sets = feature_sets
+ hash = artifact.hash # type: ignore
+ provisional_uid = artifact.uid # type: ignore
+ if artifact.description is None or artifact.description == "tmp":
+ artifact.description = f"See dataset {provisional_uid}" # type: ignore
  data_init_complete = True
  if not data_init_complete:
  if hasattr(data, "__getitem__"):
- assert isinstance(data[0], File) # type: ignore
- files = data
- hash, feature_sets = from_files(files) # type: ignore
+ assert isinstance(data[0], Artifact) # type: ignore
+ artifacts = data
+ hash, feature_sets = from_artifacts(artifacts) # type: ignore
  data_init_complete = True
  else:
  raise ValueError(
- "Only DataFrame, AnnData, folder or list of File is allowed."
+ "Only DataFrame, AnnData, Artifact or list of artifacts is allowed."
  )
  # we ignore datasets in trash containing the same hash
  if hash is not None:
@@ -183,8 +164,7 @@ def __init__(
  description=description,
  reference=reference,
  reference_type=reference_type,
- file=file,
- storage=storage,
+ artifact=artifact,
  hash=hash,
  run=run,
  version=version,
@@ -192,15 +172,15 @@ def __init__(
  visibility=visibility,
  **kwargs,
  )
- dataset._files = files
+ dataset._artifacts = artifacts
  dataset._feature_sets = feature_sets
  # register provenance
  if is_new_version_of is not None:
  _track_run_input(is_new_version_of, run=run)
- if file is not None and file.run != run:
- _track_run_input(file, run=run)
- elif files is not None:
- _track_run_input(files, run=run)
+ if artifact is not None and artifact.run != run:
+ _track_run_input(artifact, run=run)
+ elif artifacts is not None:
+ _track_run_input(artifacts, run=run)


  @classmethod # type: ignore
@@ -215,7 +195,7 @@ def from_df(
  reference: Optional[str] = None,
  reference_type: Optional[str] = None,
  version: Optional[str] = None,
- is_new_version_of: Optional["File"] = None,
+ is_new_version_of: Optional["Artifact"] = None,
  **kwargs,
  ) -> "Dataset":
  """{}"""
@@ -250,11 +230,11 @@ def from_anndata(
  reference: Optional[str] = None,
  reference_type: Optional[str] = None,
  version: Optional[str] = None,
- is_new_version_of: Optional["File"] = None,
+ is_new_version_of: Optional["Artifact"] = None,
  **kwargs,
  ) -> "Dataset":
  """{}"""
- if isinstance(adata, File):
+ if isinstance(adata, Artifact):
  assert not adata._state.adding
  assert adata.accessor == "AnnData"
  adata_parse = adata.path
@@ -276,23 +256,24 @@ def from_anndata(


  # internal function, not exposed to user
- def from_files(files: Iterable[File]) -> Tuple[str, Dict[str, str]]:
- # assert all files are already saved
+ def from_artifacts(artifacts: Iterable[Artifact]) -> Tuple[str, Dict[str, str]]:
+ # assert all artifacts are already saved
  logger.debug("check not saved")
- saved = not any([file._state.adding for file in files])
+ saved = not any([artifact._state.adding for artifact in artifacts])
  if not saved:
- raise ValueError("Not all files are yet saved, please save them")
- # query all feature sets of files
- logger.debug("file ids")
- file_ids = [file.id for file in files]
- # query all feature sets at the same time rather than making a single query per file
- logger.debug("feature_set_file_links")
- feature_set_file_links = File.feature_sets.through.objects.filter(
- file_id__in=file_ids
+ raise ValueError("Not all artifacts are yet saved, please save them")
+ # query all feature sets of artifacts
+ logger.debug("artifact ids")
+ artifact_ids = [artifact.id for artifact in artifacts]
+ # query all feature sets at the same time rather
+ # than making a single query per artifact
+ logger.debug("feature_set_artifact_links")
+ feature_set_artifact_links = Artifact.feature_sets.through.objects.filter(
+ artifact_id__in=artifact_ids
  )
  feature_sets_by_slots = defaultdict(list)
  logger.debug("slots")
- for link in feature_set_file_links:
+ for link in feature_set_artifact_links:
  feature_sets_by_slots[link.slot].append(link.feature_set_id)
  feature_sets_union = {}
  logger.debug("union")
@@ -318,14 +299,14 @@ def from_files(files: Iterable[File]) -> Tuple[str, Dict[str, str]]:
  # validate consistency of hashes
  # we do not allow duplicate hashes
  logger.debug("hashes")
- # file.hash is None for zarr
+ # artifact.hash is None for zarr
  # todo: more careful handling of such cases
- hashes = [file.hash for file in files if file.hash is not None]
+ hashes = [artifact.hash for artifact in artifacts if artifact.hash is not None]
  if len(hashes) != len(set(hashes)):
  seen = set()
  non_unique = [x for x in hashes if x in seen or seen.add(x)] # type: ignore
  raise ValueError(
- "Please pass files with distinct hashes: these ones are non-unique"
+ "Please pass artifacts with distinct hashes: these ones are non-unique"
  f" {non_unique}"
  )
  time = logger.debug("hash")
@@ -346,14 +327,14 @@ def mapped(
  ) -> "MappedDataset":
  _track_run_input(self, is_run_input)
  path_list = []
- for file in self.files.all():
- if file.suffix not in {".h5ad", ".zrad", ".zarr"}:
- logger.warning(f"Ignoring file with suffix {file.suffix}")
+ for artifact in self.artifacts.all():
+ if artifact.suffix not in {".h5ad", ".zrad", ".zarr"}:
+ logger.warning(f"Ignoring artifact with suffix {artifact.suffix}")
  continue
- elif not stream and file.suffix == ".h5ad":
- path_list.append(file.stage())
+ elif not stream and artifact.suffix == ".h5ad":
+ path_list.append(artifact.stage())
  else:
- path_list.append(file.path)
+ path_list.append(artifact.path)
  return MappedDataset(path_list, label_keys, join_vars, encode_labels, parallel)


@@ -362,9 +343,9 @@ def backed(
  self, is_run_input: Optional[bool] = None
  ) -> Union["AnnDataAccessor", "BackedAccessor"]:
  _track_run_input(self, is_run_input)
- if self.file is None:
- raise RuntimeError("Can only call backed() for datasets with a single file")
- return self.file.backed()
+ if self.artifact is None:
+ raise RuntimeError("Can only call backed() for datasets with a single artifact")
+ return self.artifact.backed()


  # docstring handled through attach_func_to_class_method
@@ -375,25 +356,25 @@ def load(
  **kwargs,
  ) -> DataLike:
  # cannot call _track_run_input here, see comment further down
- if self.file is not None:
+ if self.artifact is not None:
  _track_run_input(self, is_run_input)
- return self.file.load()
+ return self.artifact.load()
  else:
- all_files = self.files.all()
- suffixes = [file.suffix for file in all_files]
+ all_artifacts = self.artifacts.all()
+ suffixes = [artifact.suffix for artifact in all_artifacts]
  if len(set(suffixes)) != 1:
  raise RuntimeError(
- "Can only load datasets where all files have the same suffix"
+ "Can only load datasets where all artifacts have the same suffix"
  )
  # because we're tracking data flow on the dataset-level, here, we don't
- # want to track it on the file-level
- objects = [file.load(is_run_input=False) for file in all_files]
- file_uids = [file.uid for file in all_files]
+ # want to track it on the artifact-level
+ objects = [artifact.load(is_run_input=False) for artifact in all_artifacts]
+ artifact_uids = [artifact.uid for artifact in all_artifacts]
  if isinstance(objects[0], pd.DataFrame):
  concat_object = pd.concat(objects, join=join)
  elif isinstance(objects[0], ad.AnnData):
  concat_object = ad.concat(
- objects, join=join, label="file_uid", keys=file_uids
+ objects, join=join, label="artifact_uid", keys=artifact_uids
  )
  # only call it here because there might be errors during concat
  _track_run_input(self, is_run_input)
@@ -409,10 +390,10 @@ def delete(
  self.visibility = VisibilityChoice.trash.value
  self.save()
  logger.warning("moved dataset to trash.")
- if self.file is not None:
- self.file.visibility = VisibilityChoice.trash.value
- self.file.save()
- logger.warning("moved dataset.file to trash.")
+ if self.artifact is not None:
+ self.artifact.visibility = VisibilityChoice.trash.value
+ self.artifact.save()
+ logger.warning("moved dataset.artifact to trash.")
  return

  # permanent delete
@@ -427,38 +408,30 @@ def delete(

  if delete_record:
  super(Dataset, self).delete()
- if self.file is not None:
- self.file.delete(permanent=permanent, storage=storage)
+ if self.artifact is not None:
+ self.artifact.delete(permanent=permanent, storage=storage)


  # docstring handled through attach_func_to_class_method
  def save(self, *args, **kwargs) -> None:
- if self.file is not None:
- self.file.save()
+ if self.artifact is not None:
+ self.artifact.save()
  # we don't need to save feature sets again
  save_feature_sets(self)
  super(Dataset, self).save()
- if hasattr(self, "_files"):
- if self._files is not None and len(self._files) > 0:
- self.files.set(self._files)
+ if hasattr(self, "_artifacts"):
+ if self._artifacts is not None and len(self._artifacts) > 0:
+ self.artifacts.set(self._artifacts)
  save_feature_set_links(self)


- @property # type: ignore
- @doc_args(Dataset.path.__doc__)
- def path(self) -> Union[Path, UPath]:
- """{}"""
- _track_run_input(self)
- return self.storage.path
-
-
  # docstring handled through attach_func_to_class_method
  def restore(self) -> None:
  self.visibility = VisibilityChoice.default.value
  self.save()
- if self.file is not None:
- self.file.visibility = VisibilityChoice.default.value
- self.file.save()
+ if self.artifact is not None:
+ self.artifact.visibility = VisibilityChoice.default.value
+ self.artifact.save()


  METHOD_NAMES = [
@@ -485,6 +458,5 @@ if _TESTING:
  for name in METHOD_NAMES:
  attach_func_to_class_method(name, Dataset, globals())

- setattr(Dataset, "path", path)
  # this seems a Django-generated function
  delattr(Dataset, "get_visibility_display")
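
Taken together, the `_dataset.py` changes mean a `Dataset` is now assembled from saved `Artifact` records (via `from_artifacts()` above) rather than `File` records, and bulk loading labels the concatenation with `artifact_uid`. A hedged usage sketch, assuming a top-level `ln.Artifact` constructor that accepts a DataFrame (mirroring the internal `Artifact(data, ...)` call above) and a `name` argument for `Dataset` that this diff does not show:

import pandas as pd
import lamindb as ln

# two small, made-up tables standing in for real measurements
df1 = pd.DataFrame({"value": [1, 2]})
df2 = pd.DataFrame({"value": [3, 4]})

af1 = ln.Artifact(df1, description="batch 1")
af1.save()  # from_artifacts() raises if any artifact is unsaved
af2 = ln.Artifact(df2, description="batch 2")
af2.save()

dataset = ln.Dataset([af1, af2], name="both batches")  # name= is assumed here
dataset.save()       # links the artifacts via dataset.artifacts.set(...)
df = dataset.load()  # same-suffix artifacts are concatenated with pd.concat
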
lamindb/_delete.py CHANGED
@@ -38,12 +38,12 @@ def delete( # type: ignore

  Delete files (delete the metadata record and the file in storage):

- >>> file = ln.filter(File, id=file_id).one()
+ >>> file = ln.filter(File, id=artifact_id).one()
  >>> ln.delete(file)
  >>> # deleting the record occurs automatically
  >>> # you will be asked whether to delete the file in storage
  >>> # for more control, use:
- >>> file.delete(storage=True)
+ >>> artifact.delete(storage=True)

  Bulk delete via QuerySet:

lamindb/_filter.py CHANGED
@@ -1,6 +1,6 @@
  from typing import Type

- from lnschema_core import Dataset, File, Registry
+ from lnschema_core import Artifact, Dataset, Registry
  from lnschema_core.types import VisibilityChoice

  from lamindb._query_set import QuerySet
@@ -8,7 +8,7 @@ from lamindb._query_set import QuerySet

  def filter(Registry: Type[Registry], **expressions) -> QuerySet:
  """See :meth:`~lamindb.dev.Registry.filter`."""
- if Registry in {File, Dataset}:
+ if Registry in {Artifact, Dataset}:
  # visibility is set to 0 unless expressions contains id or uid equality
  if not ("id" in expressions or "uid" in expressions):
  visibility = "visibility"
lamindb/_parents.py CHANGED
@@ -2,7 +2,7 @@ import builtins
  from typing import List, Optional, Set, Union

  from lamin_utils import logger
- from lnschema_core import Dataset, File, Registry, Run, Transform
+ from lnschema_core import Artifact, Dataset, Registry, Run, Transform
  from lnschema_core.models import HasParents, format_field_value

  from lamindb._utils import attach_func_to_class_method
@@ -61,7 +61,7 @@ def view_parents(
  )


- def view_flow(data: Union[File, Dataset], with_children: bool = True) -> None:
+ def view_flow(data: Union[Artifact, Dataset], with_children: bool = True) -> None:
  """Graph of data flow.

  Notes:
@@ -69,7 +69,7 @@ def view_flow(data: Union[File, Dataset], with_children: bool = True) -> None:

  Examples:
  >>> dataset.view_flow()
- >>> file.view_flow()
+ >>> artifact.view_flow()
  """
  import graphviz

@@ -81,7 +81,7 @@ def view_flow(data: Union[File, Dataset], with_children: bool = True) -> None:
  data_label = _record_label(data)

  def add_node(
- record: Union[Run, File, Dataset],
+ record: Union[Run, Artifact, Dataset],
  node_id: str,
  node_label: str,
  u: graphviz.Digraph,
@@ -257,7 +257,7 @@ def _df_edges_from_parents(


  def _record_label(record: Registry, field: Optional[str] = None):
- if isinstance(record, File):
+ if isinstance(record, Artifact):
  if record.description is None:
  name = record.key
  else:
@@ -305,7 +305,7 @@ def _add_emoji(record: Registry, label: str):
  return f"{emoji} {label}"


- def _get_all_parent_runs(data: Union[File, Dataset]) -> List:
+ def _get_all_parent_runs(data: Union[Artifact, Dataset]) -> List:
  """Get all input file/dataset runs recursively."""
  name = data._meta.model_name
  run_inputs_outputs = []
@@ -331,7 +331,7 @@ def _get_all_parent_runs(data: Union[File, Dataset]) -> List:
  return run_inputs_outputs


- def _get_all_child_runs(data: Union[File, Dataset]) -> List:
+ def _get_all_child_runs(data: Union[Artifact, Dataset]) -> List:
  """Get all output file/dataset runs recursively."""
  name = data._meta.model_name
  all_runs: Set[Run] = set()
lamindb/_query_manager.py CHANGED
@@ -30,7 +30,10 @@ class QueryManager(models.Manager):

  def _track_run_input_manager(self):
  if hasattr(self, "source_field_name") and hasattr(self, "target_field_name"):
- if self.source_field_name == "dataset" and self.target_field_name == "file":
+ if (
+ self.source_field_name == "dataset"
+ and self.target_field_name == "artifact"
+ ):
  from lamindb.dev._data import WARNING_RUN_TRANSFORM, _track_run_input
  from lamindb.dev._run_context import run_context

@@ -95,7 +98,7 @@ class QueryManager(models.Manager):
  target_field_name = self.target_field_name

  if (
- source_field_name in {"file", "dataset"}
+ source_field_name in {"artifact", "dataset"}
  and target_field_name == "feature_set"
  ):
  return get_feature_set_by_slot(host=self.instance).get(item)
lamindb/_registry.py CHANGED
@@ -184,8 +184,8 @@ def _search(
  case_sensitive=case_sensitive,
  )

- # search in both key and description fields for file
- if orm._meta.model.__name__ == "File" and field is None:
+ # search in both key and description fields for Artifact
+ if orm._meta.model.__name__ == "Artifact" and field is None:
  field = ["key", "description"]

  if not isinstance(field, List):
@@ -405,7 +405,7 @@ def transfer_fk_to_default_db_bulk(records: List):
  "bionty_source",
  "initial_version",
  "latest_report", # Transform
- "source_file", # Transform
+ "source_code", # Transform
  "report", # Run
  "file", # Dataset
  ]: