lamindb 0.64.2__py3-none-any.whl → 0.65.1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in the public registry; it is provided for informational purposes only.
@@ -40,12 +40,12 @@ class _Connect:
  self.conn.close()


- class MappedDataset:
- """Map-style dataset for use in data loaders.
+ class MappedCollection:
+ """Map-style collection for use in data loaders.

  This currently only works for collections of `AnnData` objects.

- For an example, see :meth:`~lamindb.Dataset.mapped`.
+ For an example, see :meth:`~lamindb.Collection.mapped`.

  .. note::

@@ -57,10 +57,14 @@ class MappedDataset:
  self,
  path_list: List[Union[str, PathLike]],
  label_keys: Optional[Union[str, List[str]]] = None,
- join_vars: Optional[Literal["auto", "inner"]] = "auto",
+ join_vars: Optional[Literal["auto", "inner", "outer"]] = "auto",
  encode_labels: bool = True,
+ cache_categories: bool = True,
  parallel: bool = False,
+ dtype: Optional[str] = None,
  ):
+ assert join_vars in {None, "auto", "inner", "outer"}
+
  self.storages = []  # type: ignore
  self.conns = []  # type: ignore
  self.parallel = parallel
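
The constructor gains three keywords in this release: `join_vars="outer"`, `cache_categories`, and `dtype`. A minimal usage sketch with a PyTorch DataLoader; the import location and the .h5ad file names are assumptions, not part of this diff::

    from torch.utils.data import DataLoader

    from lamindb.dev import MappedCollection  # assumed public location

    mapped = MappedCollection(
        path_list=["adata1.h5ad", "adata2.h5ad"],  # hypothetical files
        label_keys="cell_type",
        join_vars="outer",       # new: union of variables, missing ones zero-filled
        cache_categories=True,   # new: read categoricals once instead of per item
        dtype="float32",         # new: cast each returned row
    )
    x, y = mapped[0]  # dense row over the joint variable space + encoded label
    loader = DataLoader(mapped, batch_size=128, shuffle=True)
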
@@ -86,8 +90,15 @@ class MappedDataset:

  self.encode_labels = encode_labels
  self.label_keys = [label_keys] if isinstance(label_keys, str) else label_keys
- if self.label_keys is not None and self.encode_labels:
- self._make_encoders(self.label_keys)
+ if self.label_keys is not None:
+ if cache_categories:
+ self._cache_categories(self.label_keys)
+ else:
+ self._cache_cats: dict = {}
+ if self.encode_labels:
+ self._make_encoders(self.label_keys)
+
+ self._dtype = dtype

  self._closed = False

@@ -104,6 +115,18 @@ class MappedDataset:
  self.conns.append(conn)
  self.storages.append(storage)

+ def _cache_categories(self, label_keys: list):
+ self._cache_cats = {}
+ decode = np.frompyfunc(lambda x: x.decode("utf-8"), 1, 1)
+ for label in label_keys:
+ self._cache_cats[label] = []
+ for storage in self.storages:
+ with _Connect(storage) as store:
+ cats = self.get_categories(store, label)
+ if cats is not None:
+ cats = decode(cats) if isinstance(cats[0], bytes) else cats[...]
+ self._cache_cats[label].append(cats)
+
  def _make_encoders(self, label_keys: list):
  self.encoders = []
  for label in label_keys:
@@ -115,20 +138,31 @@ class MappedDataset:
  for storage in self.storages:
  with _Connect(storage) as store:
  var_list.append(_safer_read_index(store["var"]))
+
+ self.var_joint = None
  if self.join_vars == "auto":
  vars_eq = all(var_list[0].equals(vrs) for vrs in var_list[1:])
  if vars_eq:
  self.join_vars = None
  return
  else:
- self.join_vars = "inner"
+ self.var_joint = reduce(pd.Index.intersection, var_list)
+ if len(self.var_joint) > 0:
+ self.join_vars = "inner"
+ else:
+ self.join_vars = "outer"
+
  if self.join_vars == "inner":
- self.var_joint = reduce(pd.Index.intersection, var_list)
- if len(self.var_joint) == 0:
- raise ValueError(
- "The provided AnnData objects don't have shared varibales."
- )
+ if self.var_joint is None:
+ self.var_joint = reduce(pd.Index.intersection, var_list)
+ if len(self.var_joint) == 0:
+ raise ValueError(
+ "The provided AnnData objects don't have shared varibales."
+ )
  self.var_indices = [vrs.get_indexer(self.var_joint) for vrs in var_list]
+ elif self.join_vars == "outer":
+ self.var_joint = reduce(pd.Index.union, var_list)
+ self.var_indices = [self.var_joint.get_indexer(vrs) for vrs in var_list]

  def __len__(self):
  return self.n_obs
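
The join logic above reduces to pandas Index set operations plus `get_indexer`. A self-contained sketch of what "inner" and "outer" do with two variable lists::

    import numpy as np
    import pandas as pd

    var_a = pd.Index(["g1", "g2", "g3"])
    var_b = pd.Index(["g2", "g3", "g4"])
    row_a = np.array([1.0, 2.0, 3.0])  # one observation stored with var_a's columns

    # inner: shared variables; positions of the joint vars within each storage
    var_joint = var_a.intersection(var_b)      # ['g2', 'g3']
    idx_a = var_a.get_indexer(var_joint)       # [1, 2]
    print(row_a[idx_a])                        # [2. 3.]

    # outer: union of variables; positions of each storage's vars in the joint index
    var_joint = var_a.union(var_b)             # ['g1', 'g2', 'g3', 'g4']
    dense = np.zeros(len(var_joint))
    dense[var_joint.get_indexer(var_a)] = row_a
    print(dense)                               # [1. 2. 3. 0.]
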
@@ -137,15 +171,21 @@ class MappedDataset:
  obs_idx = self.indices[idx]
  storage_idx = self.storage_idx[idx]
  if self.var_indices is not None:
- var_idxs = self.var_indices[storage_idx]
+ var_idxs_join = self.var_indices[storage_idx]
  else:
- var_idxs = None
+ var_idxs_join = None

  with _Connect(self.storages[storage_idx]) as store:
- out = [self.get_data_idx(store, obs_idx, var_idxs)]
+ out = [self.get_data_idx(store, obs_idx, var_idxs_join)]
  if self.label_keys is not None:
  for i, label in enumerate(self.label_keys):
- label_idx = self.get_label_idx(store, obs_idx, label)
+ if label in self._cache_cats:
+ cats = self._cache_cats[label][storage_idx]
+ if cats is None:
+ cats = []
+ else:
+ cats = None
+ label_idx = self.get_label_idx(store, obs_idx, label, cats)
  if self.encode_labels:
  label_idx = self.encoders[i][label_idx]
  out.append(label_idx)
@@ -155,26 +195,50 @@ class MappedDataset:
  self,
  storage: StorageType,  # type: ignore
  idx: int,
- var_idxs: Optional[list] = None,
+ var_idxs_join: Optional[list] = None,
  layer_key: Optional[str] = None,
  ):
  """Get the index for the data."""
  layer = storage["X"] if layer_key is None else storage["layers"][layer_key]  # type: ignore
  if isinstance(layer, ArrayTypes):  # type: ignore
- # todo: better way to select variables
- return layer[idx] if var_idxs is None else layer[idx][var_idxs]
+ layer_idx = layer[idx]
+ if self.join_vars is None:
+ result = layer_idx
+ if self._dtype is not None:
+ result = result.astype(self._dtype, copy=False)
+ elif self.join_vars == "outer":
+ dtype = layer_idx.dtype if self._dtype is None else self._dtype
+ result = np.zeros(len(self.var_joint), dtype=dtype)
+ result[var_idxs_join] = layer_idx
+ else:  # inner join
+ result = layer_idx[var_idxs_join]
+ if self._dtype is not None:
+ result = result.astype(self._dtype, copy=False)
+ return result
  else:  # assume csr_matrix here
  data = layer["data"]
  indices = layer["indices"]
  indptr = layer["indptr"]
  s = slice(*(indptr[idx : idx + 2]))
- # this requires more memory than csr_matrix when var_idxs is not None
- # but it is faster
- layer_idx = np.zeros(layer.attrs["shape"][1])
- layer_idx[indices[s]] = data[s]
- return layer_idx if var_idxs is None else layer_idx[var_idxs]
+ data_s = data[s]
+ dtype = data_s.dtype if self._dtype is None else self._dtype
+ if self.join_vars == "outer":
+ layer_idx = np.zeros(len(self.var_joint), dtype=dtype)
+ layer_idx[var_idxs_join[indices[s]]] = data_s
+ else:
+ layer_idx = np.zeros(layer.attrs["shape"][1], dtype=dtype)
+ layer_idx[indices[s]] = data_s
+ if self.join_vars == "inner":
+ layer_idx = layer_idx[var_idxs_join]
+ return layer_idx

- def get_label_idx(self, storage: StorageType, idx: int, label_key: str):  # type: ignore
+ def get_label_idx(
+ self,
+ storage: StorageType,
+ idx: int,
+ label_key: str,
+ categories: Optional[list] = None,
+ ):
  """Get the index for the label by key."""
  obs = storage["obs"]  # type: ignore
  # how backwards compatible do we want to be here actually?
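
For the sparse branch above, one CSR row is densified directly from `data`/`indices`/`indptr`, and for an outer join its columns are remapped onto the joint variable space. A small standalone sketch with made-up numbers::

    import numpy as np

    data = np.array([5.0, 7.0])    # nonzero values of the whole matrix
    indices = np.array([0, 2])     # their column positions
    indptr = np.array([0, 2])      # row 0 spans data[0:2]

    s = slice(*indptr[0:2])

    # no join / inner join: dense row over this storage's own columns
    row_local = np.zeros(3)
    row_local[indices[s]] = data[s]                  # [5., 0., 7.]

    # outer join: var_idxs_join maps local columns into the joint index (5 vars here)
    var_idxs_join = np.array([1, 2, 4])              # hypothetical mapping
    row_joint = np.zeros(5)
    row_joint[var_idxs_join[indices[s]]] = data[s]   # [0., 5., 0., 0., 7.]
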
@@ -186,9 +250,11 @@ class MappedDataset:
  label = labels[idx]
  else:
  label = labels["codes"][idx]
-
- cats = self.get_categories(storage, label_key)
- if cats is not None:
+ if categories is not None:
+ cats = categories
+ else:
+ cats = self.get_categories(storage, label_key)
+ if cats is not None and len(cats) > 0:
  label = cats[label]
  if isinstance(label, bytes):
  label = label.decode("utf-8")
@@ -215,11 +281,14 @@ class MappedDataset:
  """Get merged labels."""
  labels_merge = []
  decode = np.frompyfunc(lambda x: x.decode("utf-8"), 1, 1)
- for storage in self.storages:
+ for i, storage in enumerate(self.storages):
  with _Connect(storage) as store:
  codes = self.get_codes(store, label_key)
  labels = decode(codes) if isinstance(codes[0], bytes) else codes
- cats = self.get_categories(store, label_key)
+ if label_key in self._cache_cats:
+ cats = self._cache_cats[label_key][i]
+ else:
+ cats = self.get_categories(store, label_key)
  if cats is not None:
  cats = decode(cats) if isinstance(cats[0], bytes) else cats
  labels = cats[labels]
@@ -230,9 +299,12 @@ class MappedDataset:
  """Get merged categories."""
  cats_merge = set()
  decode = np.frompyfunc(lambda x: x.decode("utf-8"), 1, 1)
- for storage in self.storages:
+ for i, storage in enumerate(self.storages):
  with _Connect(storage) as store:
- cats = self.get_categories(store, label_key)
+ if label_key in self._cache_cats:
+ cats = self._cache_cats[label_key][i]
+ else:
+ cats = self.get_categories(store, label_key)
  if cats is not None:
  cats = decode(cats) if isinstance(cats[0], bytes) else cats
  cats_merge.update(cats)
@@ -13,7 +13,7 @@ from lnschema_core import Run, Transform, ids
  from lnschema_core.types import TransformType
  from lnschema_core.users import current_user_id

- from lamindb.dev.versioning import get_ids_from_old_version
+ from lamindb.dev.versioning import get_uid_from_old_version

  from .hashing import to_b64_str

@@ -33,7 +33,9 @@ msg_manual_init = (
  )


- class UpdateNbWithNonInteractiveEditorError(Exception):
+ # we don't want a real error here, as this is so frequent
+ # in VSCode
+ class UpdateNbWithNonInteractiveEditor(SystemExit):
  pass


@@ -82,21 +84,21 @@ def update_notebook_metadata(
  from nbproject._header import _filepath

  notebook = nb_dev.read_notebook(_filepath)
- uid_prefix = notebook.metadata["nbproject"]["id"]
+ stem_uid = notebook.metadata["nbproject"]["id"]
  version = notebook.metadata["nbproject"]["version"]

- updated, new_uid_prefix, new_version = update_transform_source_metadata(
+ updated, new_stem_uid, new_version = update_transform_source_metadata(
  notebook, _filepath, bump_version=bump_version, run_from_cli=False
  )

  if version != new_version:
  notebook.metadata["nbproject"]["version"] = new_version
- new_uid, _, _ = get_ids_from_old_version(
- is_new_version_of=transform, version=new_version, n_full_id=14
+ new_uid, _ = get_uid_from_old_version(
+ is_new_version_of=transform, version=new_version, n_full_id=16
  )
  else:
- notebook.metadata["nbproject"]["id"] = uid_prefix
- new_uid = new_uid_prefix + ids.base62(n_char=2)
+ notebook.metadata["nbproject"]["id"] = stem_uid
+ new_uid = new_stem_uid + ids.base62(n_char=4)

  # here we check that responses to both inputs (for new id and version) were not 'n'
  if updated:
@@ -123,20 +125,22 @@ def get_notebook_name_colab() -> str:
  return name.rstrip(".ipynb")


- def get_transform_kwargs_from_uid_prefix(
+ def get_transform_kwargs_from_stem_uid(
  nbproject_id: str,
  nbproject_version: str,
- ) -> Tuple[Optional[Transform], str, str, Optional[Transform]]:
- id_ext = to_b64_str(hashlib.md5(nbproject_version.encode()).digest())[:2]
- uid = nbproject_id + id_ext
- version = nbproject_version
+ ) -> Tuple[Optional[Transform], str, str]:
+ from lamin_utils._base62 import encodebytes
+
+ # merely zero-padding the nbproject version such that the base62 encoding is at
+ # least 4 characters long does yield sufficiently diverse hashes within 4 characters
+ # it'd be nice because the uid_ext would be ordered, but it leads to collisions
+ uid_ext = encodebytes(hashlib.md5(nbproject_version.encode()).digest())[:4]
+ new_uid = nbproject_id + uid_ext
+ assert len(new_uid) == 16
  transform = Transform.filter(
- uid__startswith=nbproject_id, version=version
+ uid__startswith=nbproject_id, version=nbproject_version
  ).one_or_none()
- old_version_of = None
- if transform is None:
- old_version_of = Transform.filter(uid__startswith=nbproject_id).first()
- return transform, uid, version, old_version_of
+ return transform, new_uid, nbproject_version


  class run_context:
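
The uid for a notebook or script transform is now the 12-character stem stored in the source file plus a 4-character extension derived from the version string. A sketch of that computation, reusing the same helpers the hunk imports (the stem value is hypothetical)::

    import hashlib

    from lamin_utils._base62 import encodebytes

    stem_uid = "FPnfDtJz8qbE"  # hypothetical 12-char stem from the notebook metadata
    version = "1"

    uid_ext = encodebytes(hashlib.md5(version.encode()).digest())[:4]
    uid = stem_uid + uid_ext
    assert len(uid) == 16
    # all versions share the stem, so they can be found via
    # Transform.filter(uid__startswith=stem_uid)
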
@@ -228,7 +232,7 @@ class run_context:
  "it looks like you are running ln.track() from a "
  "notebook!\nplease install nbproject: pip install nbproject"
  )
- elif isinstance(e, UpdateNbWithNonInteractiveEditorError):
+ elif isinstance(e, UpdateNbWithNonInteractiveEditor):
  raise e
  elif isinstance(e, (NotebookNotSavedError, NoTitleError)):
  raise e
@@ -245,7 +249,7 @@ class run_context:
  is_tracked = False
  else:
  name = Path(module.__file__).stem  # type: ignore
- if not hasattr(module, "__lamindb_uid_prefix__"):
+ if not hasattr(module, "__transform_stem_uid__"):
  raise RuntimeError(
  "no automated tracking because no uid attached to script!\n"
  f"please run: lamin track {module.__file__}\n"
@@ -254,9 +258,8 @@ class run_context:
  transform,
  uid,
  version,
- old_version_of,
- ) = get_transform_kwargs_from_uid_prefix(
- module.__lamindb_uid_prefix__,
+ ) = get_transform_kwargs_from_stem_uid(
+ module.__transform_stem_uid__,
  module.__version__,  # type: ignore
  )
  short_name = Path(module.__file__).name  # type: ignore
@@ -265,7 +268,6 @@ class run_context:
  version=version,
  name=name,
  reference=reference,
- is_new_version_of=old_version_of,
  transform_type=TransformType.pipeline,
  short_name=short_name,
  is_interactive=False,
@@ -321,6 +323,10 @@ class run_context:
  logger.important(f"saved: {run}")
  cls.run = run

+ from ._track_environment import track_environment
+
+ track_environment(run)
+
  # at this point, we have a transform can display its parents if there are any
  parents = cls.transform.parents.all() if cls.transform is not None else []
  if len(parents) > 0:
@@ -431,7 +437,7 @@ class run_context:
  cls._notebook_meta = metadata  # type: ignore
  else:
  msg = msg_manual_init.format(notebook_path=notebook_path_str)
- raise UpdateNbWithNonInteractiveEditorError(msg)
+ raise UpdateNbWithNonInteractiveEditor(msg)

  if _env in ("lab", "notebook"):
  # save the notebook in case that title was updated
@@ -446,7 +452,7 @@ class run_context:
  is_interactive = _seconds_modified(_filepath) < 1.5  # should be ~1 sec
  if not is_interactive and needs_init:
  msg = msg_manual_init.format(notebook_path=_filepath)
- raise UpdateNbWithNonInteractiveEditorError(msg)
+ raise UpdateNbWithNonInteractiveEditor(msg)

  nbproject_id = metadata["id"]
  nbproject_version = metadata["version"]
@@ -468,7 +474,6 @@ class run_context:
  transform = Transform.filter(uid=uid).one_or_none()
  name = filestem
  short_name = None
- old_version_of = None
  # nbproject parsing successful
  elif nbproject_id is not None:
  name = nbproject_title
@@ -476,21 +481,18 @@ class run_context:
  transform,
  uid,
  version,
- old_version_of,
- ) = get_transform_kwargs_from_uid_prefix(nbproject_id, nbproject_version)
+ ) = get_transform_kwargs_from_stem_uid(nbproject_id, nbproject_version)
  short_name = filestem
  cls._create_or_load_transform(
  uid=uid,
  version=version,
  name=name,
  reference=reference,
- is_new_version_of=old_version_of,
  transform_type=TransformType.notebook,
  short_name=short_name,
  is_interactive=is_interactive,
  filepath=notebook_path,
  transform=transform,
- metadata=metadata,
  )

  @classmethod
@@ -509,7 +511,7 @@ class run_context:
  cls._notebook_meta = metadata  # type: ignore
  else:
  msg = msg_manual_init.format(notebook_path=filepath)
- raise UpdateNbWithNonInteractiveEditorError(msg)
+ raise UpdateNbWithNonInteractiveEditor(msg)
  else:
  from lamin_cli._transform import update_transform_source_metadata

@@ -533,13 +535,11 @@ class run_context:
  version: Optional[str],
  name: str,
  reference: Optional[str],
- is_new_version_of: Optional[Transform],
  short_name: Optional[str],
  transform_type: TransformType,
  is_interactive: bool,
  filepath: str,
  transform: Optional[Transform] = None,
- metadata: Optional[Dict] = None,
  ) -> bool:
  # make a new transform record
  if transform is None:
@@ -549,7 +549,6 @@ class run_context:
  name=name,
  short_name=short_name,
  reference=reference,
- is_new_version_of=is_new_version_of,
  type=transform_type,
  )
  transform.save()
@@ -0,0 +1,18 @@
+ import subprocess
+
+ import lamindb_setup as ln_setup
+ from lamin_utils import logger
+ from lnschema_core.models import Run
+
+
+ def track_environment(run: Run) -> None:
+ filepath = ln_setup.settings.storage.cache_dir / f"run_env_pip_{run.uid}.txt"
+ # create a requirements.txt
+ # we don't create a conda environment.yml mostly for its slowness
+ try:
+ result = subprocess.run(f"pip freeze > {str(filepath)}", shell=True)
+ except OSError as e:
+ result = None
+ logger.warning(f"could not run pip freeze with error {e}")
+ if result is not None and result.returncode == 0:
+ logger.info(f"tracked pip freeze > {str(filepath)}")
@@ -1,4 +1,4 @@
- """Test datasets.
+ """Test collections.

  .. autosummary::
  :toctree: .
@@ -144,7 +144,7 @@ def dir_iris_images() -> UPath:  # pragma: no cover
  def anndata_mouse_sc_lymph_node(
  populate_registries: bool = False,
  ) -> ad.AnnData:  # pragma: no cover
- """Mouse lymph node scRNA-seq dataset from EBI.
+ """Mouse lymph node scRNA-seq collection from EBI.

  Subsampled to 10k genes.

@@ -226,11 +226,11 @@ def anndata_mouse_sc_lymph_node(


  def anndata_pbmc68k_reduced() -> ad.AnnData:
- """Modified from scanpy.datasets.pbmc68k_reduced().
+ """Modified from scanpy.collections.pbmc68k_reduced().

  This code was run::

- pbmc68k = sc.datasets.pbmc68k_reduced()
+ pbmc68k = sc.collections.pbmc68k_reduced()
  pbmc68k.obs.rename(columns={"bulk_labels": "cell_type"}, inplace=True)
  pbmc68k.obs["cell_type"] = pbmc68k.obs["cell_type"].cat.rename_categories(
  {"Dendritic": "Dendritic cells", "CD14+ Monocyte": "CD14+ Monocytes"}
@@ -254,7 +254,7 @@ def anndata_pbmc68k_reduced() -> ad.AnnData:


  def anndata_file_pbmc68k_test() -> Path:
- """Modified from scanpy.datasets.pbmc68k_reduced().
+ """Modified from scanpy.collections.pbmc68k_reduced().

  Additional slots were added for testing purposes. Returns the filepath.

@@ -291,7 +291,7 @@ def anndata_human_immune_cells(
  """Cross-tissue immune cell analysis reveals tissue-specific features in humans.

  From: https://cellxgene.cziscience.com/collections/62ef75e4-cbea-454e-a0ce-998ec40223d3  # noqa
- Dataset: Global
+ Collection: Global

  To reproduce the subsample::

@@ -404,18 +404,18 @@ def mudata_papalexi21_subset():  # pragma: no cover


  def df_iris() -> pd.DataFrame:
- """The iris dataset as in sklearn.
+ """The iris collection as in sklearn.

  Original code::

- sklearn.datasets.load_iris(as_frame=True).frame
+ sklearn.collections.load_iris(as_frame=True).frame
  """
  filepath, _ = urlretrieve("https://lamindb-test.s3.amazonaws.com/iris.parquet")
  return pd.read_parquet(filepath)


  def df_iris_in_meter() -> pd.DataFrame:
- """The iris dataset with lengths in meter."""
+ """The iris collection with lengths in meter."""
  df = df_iris()
  # rename columns
  df.rename(
@@ -436,13 +436,13 @@ def df_iris_in_meter() -> pd.DataFrame:


  def df_iris_in_meter_study1() -> pd.DataFrame:
- """The iris dataset with lengths in meter."""
+ """The iris collection with lengths in meter."""
  df_iris = df_iris_in_meter()
  return df_iris.iloc[: len(df_iris) // 2]


  def df_iris_in_meter_study2() -> pd.DataFrame:
- """The iris dataset with lengths in meter."""
+ """The iris collection with lengths in meter."""
  df_iris = df_iris_in_meter()
  return df_iris.iloc[len(df_iris) // 2 :]

@@ -500,7 +500,7 @@ def dir_scrnaseq_cellranger(


  def schmidt22_crispra_gws_IFNG(basedir=".") -> Path:  # pragma: no cover
- """CRISPRi screen dataset of Schmidt22.
+ """CRISPRi screen collection of Schmidt22.

  Originally from: https://zenodo.org/record/5784651
  """
@@ -512,7 +512,7 @@ def schmidt22_crispra_gws_IFNG(basedir=".") -> Path:  # pragma: no cover


  def schmidt22_perturbseq(basedir=".") -> Path:  # pragma: no cover
- """Perturb-seq dataset of Schmidt22.
+ """Perturb-seq collection of Schmidt22.

  Subsampled and converted to h5ad from R file: https://zenodo.org/record/5784651

@@ -48,7 +48,10 @@ else:
  from anndata._core.sparse_dataset import (
  BaseCompressedSparseDataset as SparseDataset,
  )
- from anndata._core.sparse_dataset import CSRDataset, sparse_dataset  # type: ignore
+ from anndata._core.sparse_dataset import (  # type: ignore
+ CSRDataset,
+ sparse_dataset,
+ )

  def _check_group_format(*args):
  pass
@@ -72,7 +72,10 @@ def write_adata_zarr(

  if chunks is not None and not isinstance(adata.X, sparse.spmatrix):
  _write_elem_cb(
- f, "X", adata.X, dataset_kwargs=dict(chunks=chunks, **dataset_kwargs)
+ f,
+ "X",
+ adata.X,
+ dataset_kwargs=dict(chunks=chunks, **dataset_kwargs),
  )
  else:
  _write_elem_cb(f, "X", adata.X, dataset_kwargs=dataset_kwargs)
lamindb/dev/versioning.py CHANGED
@@ -1,7 +1,7 @@
  from typing import Optional, Tuple, Union

  from lnschema_core import ids
- from lnschema_core.models import Artifact, Transform
+ from lnschema_core.models import IsVersioned


  def set_version(version: Optional[str] = None, previous_version: Optional[str] = None):
@@ -32,33 +32,29 @@ def init_uid(
  *,
  version: Optional[str] = None,
  n_full_id: int = 20,
+ is_new_version_of: Optional[IsVersioned] = None,
  ) -> str:
- if n_full_id == 20:
- gen_full_id = ids.base62_20
- elif n_full_id == 14:
- gen_full_id = ids.base62_14
+ if is_new_version_of is not None:
+ stem_uid = is_new_version_of.stem_uid
+ else:
+ if n_full_id == 20:
+ stem_uid = ids.base62_16()
+ elif n_full_id == 16:
+ stem_uid = ids.base62_12()
  if version is not None:
  if not isinstance(version, str):
  raise ValueError(
  "`version` parameter must be `None` or `str`, e.g., '0.1', '1', '2',"
  " etc."
  )
- return gen_full_id()
-
+ return stem_uid + ids.base62_4()

- def get_initial_version_id(is_new_version_of: Union[Artifact, Transform]):
- if is_new_version_of.initial_version_id is None:
- initial_version_id = is_new_version_of.id
- else:
- initial_version_id = is_new_version_of.initial_version_id
- return initial_version_id

-
- def get_ids_from_old_version(
- is_new_version_of: Union[Artifact, Transform],
+ def get_uid_from_old_version(
+ is_new_version_of: IsVersioned,
  version: Optional[str],
  n_full_id: int = 20,
- ) -> Tuple[str, int, str]:
+ ) -> Tuple[str, str]:
  """{}."""
  msg = ""
  if is_new_version_of.version is None:
@@ -67,18 +63,15 @@ def get_ids_from_old_version(
  else:
  previous_version = is_new_version_of.version
  version = set_version(version, previous_version)
- initial_version_id = get_initial_version_id(is_new_version_of)
  new_uid = init_uid(
  version=version,
  n_full_id=n_full_id,
+ is_new_version_of=is_new_version_of,
  )
  # the following covers the edge case where the old file was unversioned
  if is_new_version_of.version is None:
  is_new_version_of.version = previous_version
  is_new_version_of.save()
  if msg != "":
- msg += (
- f"& new version to '{version}' (initial_version_id ="
- f" '{initial_version_id}')"
- )
- return new_uid, initial_version_id, version  # type: ignore
+ msg += f"& new version to '{version}'"
+ return new_uid, version
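
Under the new scheme in versioning.py, a uid is always a stem (shared by all versions of the same artifact or transform) plus a fresh 4-character suffix per version. A sketch using the id generators referenced in the hunks above::

    from lnschema_core import ids

    stem = ids.base62_16()            # artifact stem; transforms use ids.base62_12()
    uid_v1 = stem + ids.base62_4()
    uid_v2 = stem + ids.base62_4()    # a new version keeps the stem, only the suffix changes

    assert uid_v1[:16] == uid_v2[:16]
    assert len(uid_v1) == 20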