lamindb 1.7.1__py3-none-any.whl → 1.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lamindb/__init__.py CHANGED
@@ -108,7 +108,7 @@ Backwards compatibility.
 
 # ruff: noqa: I001
 # denote a release candidate for 0.1.0 with 0.1rc1, 0.1a1, 0.1b1, etc.
-__version__ = "1.7.1"
+__version__ = "1.9.0"
 
 import warnings
 
lamindb/_finish.py CHANGED
@@ -260,9 +260,9 @@ def save_context_core(
     is_r_notebook = filepath.suffix in {".qmd", ".Rmd"}
     source_code_path = filepath
     report_path: Path | None = None
-    save_source_code_and_report = True
+    save_source_code_and_report = filepath.exists()
     if (
-        is_run_from_ipython and notebook_runner != "nbconvert"
+        is_run_from_ipython and notebook_runner != "nbconvert" and filepath.exists()
     ):  # python notebooks in interactive session
         import nbproject
 
@@ -281,7 +281,7 @@ def save_context_core(
             logger.warning(
                 "the notebook on disk wasn't saved within the last 10 sec"
             )
-    if is_ipynb:  # could be from CLI outside interactive session
+    if is_ipynb and filepath.exists():  # could be from CLI outside interactive session
         try:
             import jupytext  # noqa: F401
             from nbproject.dev import (
@@ -315,6 +315,8 @@ def save_context_core(
                 ".ipynb", ".py"
             )
             notebook_to_script(transform.description, filepath, source_code_path)
+    elif is_ipynb and not filepath.exists():
+        logger.warning("notebook file does not exist in compute environment")
     elif is_r_notebook:
         if filepath.with_suffix(".nb.html").exists():
             report_path = filepath.with_suffix(".nb.html")
@@ -365,6 +367,9 @@ def save_context_core(
     base_path = ln_setup.settings.cache_dir / "environments" / f"run_{run.uid}"
     paths = [base_path / "run_env_pip.txt", base_path / "r_pak_lockfile.json"]
     existing_paths = [path for path in paths if path.exists()]
+    if len(existing_paths) == 2:
+        # let's not store the python environment for an R session for now
+        existing_paths = [base_path / "r_pak_lockfile.json"]
 
     if existing_paths:
         overwrite_env = True
lamindb/core/_context.py CHANGED
@@ -17,20 +17,18 @@ from lamin_utils import logger
 from lamindb_setup.core import deprecated
 from lamindb_setup.core.hashing import hash_file
 
-from lamindb.base import ids
-from lamindb.base.ids import base62_12
-from lamindb.models import Run, Transform, format_field_value
-
-from ..core._settings import settings
+from ..base.ids import base62_12
 from ..errors import (
     InvalidArgument,
     TrackNotCalled,
     UpdateContext,
 )
+from ..models import Run, Transform, format_field_value
 from ..models._is_versioned import bump_version as bump_version_function
 from ..models._is_versioned import (
     increment_base62,
 )
+from ._settings import is_read_only_connection, settings
 from ._sync_git import get_transform_reference_from_git_repo
 from ._track_environment import track_python_environment
 
@@ -324,6 +322,7 @@ class Context:
         params: dict | None = None,
         new_run: bool | None = None,
         path: str | None = None,
+        pypackages: bool | None = None,
     ) -> None:
         """Track a run of your notebook or script.
 
@@ -343,6 +342,7 @@
             (default notebook), if `True`, creates new run (default non-notebook).
         path: Filepath of notebook or script. Only needed if it can't be
             automatically detected.
+        pypackages: If `True` or `None`, infers Python packages used in a notebook.
 
         Examples:
 
@@ -365,10 +365,8 @@
             save_context_core,
         )
 
-        instance_settings = ln_setup.settings.instance
         # similar logic here: https://github.com/laminlabs/lamindb/pull/2527
-        # TODO: refactor upon new access management
-        if instance_settings.dialect == "postgresql" and "read" in instance_settings.db:
+        if is_read_only_connection():
             logger.warning("skipping track(), connected in read-only mode")
             return None
         if project is None:
@@ -428,7 +426,9 @@
         if transform is None:
             description = None
             if is_run_from_ipython:
-                self._path, description = self._track_notebook(path_str=path)
+                self._path, description = self._track_notebook(
+                    path_str=path, pypackages=pypackages
+                )
                 transform_type = "notebook"
                 transform_ref = None
                 transform_ref_type = None
@@ -591,11 +591,14 @@
         self,
         *,
         path_str: str | None,
+        pypackages: bool | None = None,
     ) -> tuple[Path, str | None]:
         if path_str is None:
             path, self._notebook_runner = get_notebook_path()
         else:
             path = Path(path_str)
+        if pypackages is None:
+            pypackages = True
         description = None
         path_str = path.as_posix()
         if path_str.endswith("Untitled.ipynb"):
@@ -616,10 +619,11 @@
                 if nbproject_title is not None:
                     description = nbproject_title
 
-                self._logging_message_imports += (
-                    "notebook imports:"
-                    f" {pretty_pypackages(infer_pypackages(nb, pin_versions=True))}"
-                )
+                if pypackages:
+                    self._logging_message_imports += (
+                        "notebook imports:"
+                        f" {pretty_pypackages(infer_pypackages(nb, pin_versions=True))}"
+                    )
         except Exception:
             logger.debug("reading the notebook file failed")
             pass
@@ -689,10 +693,21 @@
             source_code_path = ln_setup.settings.cache_dir / self._path.name.replace(
                 ".ipynb", ".py"
             )
-            notebook_to_script(description, self._path, source_code_path)
-            transform_hash, _ = hash_file(source_code_path)
+            if (
+                self._path.exists()
+            ):  # notebook kernel might be running on a different machine
+                notebook_to_script(description, self._path, source_code_path)
+                transform_hash, _ = hash_file(source_code_path)
+            else:
+                logger.debug(
+                    "skipping notebook hash comparison, notebook kernel running on a different machine"
+                )
+                transform_hash = None
         # see whether we find a transform with the exact same hash
-        aux_transform = Transform.filter(hash=transform_hash).one_or_none()
+        if transform_hash is not None:
+            aux_transform = Transform.filter(hash=transform_hash).one_or_none()
+        else:
+            aux_transform = None
         # if the user did not pass a uid and there is no matching aux_transform
         # need to search for the transform based on the filename
         if self.uid is None and aux_transform is None:
@@ -856,7 +871,7 @@
             and not transform_was_saved
         ):
             raise UpdateContext(
-                f'{transform.created_by.name} ({transform.created_by.handle}) already works on this draft {transform.type}.\n\nPlease create a revision via `ln.track("{uid[:-4]}{increment_base62(uid[-4:])}")` or a new transform with a *different* key and `ln.track("{ids.base62_12()}0000")`.'
+                f'{transform.created_by.name} ({transform.created_by.handle}) already works on this draft {transform.type}.\n\nPlease create a revision via `ln.track("{uid[:-4]}{increment_base62(uid[-4:])}")` or a new transform with a *different* key and `ln.track("{base62_12()}0000")`.'
             )
         # check whether transform source code was already saved
         if transform_was_saved:
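The new `pypackages` flag is threaded from `track()` into `_track_notebook()`, where `None` is coerced to `True`, so package inference stays on by default for notebooks. A minimal usage sketch against the new signature (instance setup omitted):

```python
import lamindb as ln

# default behavior: notebook imports are inferred and logged
ln.track()

# opt out of scanning the notebook for imported packages
ln.track(pypackages=False)
```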
lamindb/core/_settings.py CHANGED
@@ -1,12 +1,14 @@
 from __future__ import annotations
 
 import os
+import sys
 from typing import TYPE_CHECKING
 
 import lamindb_setup as ln_setup
 from lamin_utils import colors, logger
+from lamindb_setup import settings as setup_settings
 from lamindb_setup._set_managed_storage import set_managed_storage
-from lamindb_setup.core._settings import settings as setup_settings
+from lamindb_setup.core import deprecated
 from lamindb_setup.core._settings_instance import sanitize_git_repo_url
 
 from .subsettings._annotation_settings import AnnotationSettings, annotation_settings
@@ -19,6 +21,15 @@ if TYPE_CHECKING:
     from lamindb_setup.core._settings_storage import StorageSettings
     from upath import UPath
 
+
+def is_read_only_connection() -> bool:
+    instance = setup_settings.instance
+    if instance.dialect == "postgresql":
+        db_url = instance.db
+        return "read" in db_url or "public" in db_url
+    return False
+
+
 VERBOSITY_TO_INT = {
     "error": 0,  # 40
     "warning": 1,  # 30
@@ -44,6 +55,9 @@ class Settings:
         self._sync_git_repo: str | None = None
 
     def __repr__(self) -> str:  # pragma: no cover
+        if "sphinx" in sys.modules:
+            return object.__repr__(self)
+
         cls_name = colors.green(self.__class__.__name__)
         verbosity_color = colors.yellow if self.verbosity == "warning" else colors.green
         verbosity_str = verbosity_color(self.verbosity)
@@ -181,6 +195,8 @@
     def storage(self, path_kwargs: str | Path | UPath | tuple[str | UPath, Mapping]):
         if isinstance(path_kwargs, tuple):
             path, kwargs = path_kwargs
+            if isinstance(kwargs, str):
+                kwargs = {"host": kwargs}
         else:
             path, kwargs = path_kwargs, {}
         set_managed_storage(path, **kwargs)
@@ -196,18 +212,28 @@
         return ln_setup.settings.cache_dir
 
     @property
-    def storage_local(self) -> StorageSettings:
+    def local_storage(self) -> StorageSettings:
         """An additional local default storage (a path to its root).
 
         Is only available if :attr:`~lamindb.setup.core.InstanceSettings.keep_artifacts_local` is enabled.
 
         Guide: :doc:`faq/keep-artifacts-local`
         """
-        return ln_setup.settings.instance.storage_local
+        return ln_setup.settings.instance.local_storage
+
+    @local_storage.setter
+    def local_storage(self, local_root: Path):
+        ln_setup.settings.instance.local_storage = local_root
+
+    @property
+    @deprecated("local_storage")
+    def storage_local(self) -> StorageSettings:
+        return self.local_storage
 
     @storage_local.setter
-    def storage_local(self, local_root: Path):
-        ln_setup.settings.instance.storage_local = local_root
+    @deprecated("local_storage")
+    def storage_local(self, local_root_host: tuple[Path | str, str]):
+        self.local_storage = local_root_host  # type: ignore
 
     @property
     def verbosity(self) -> str:
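Together these hunks replace the inline Postgres URL check with the shared `is_read_only_connection()` helper and rename `storage_local` to `local_storage`, keeping the old name as a deprecated alias. A sketch of the resulting surface (paths are placeholders; `local_storage` requires `keep_artifacts_local` to be enabled on the instance):

```python
from pathlib import Path

import lamindb as ln
from lamindb.core._settings import is_read_only_connection

# writes such as ln.track() are skipped on read-only Postgres connections
if not is_read_only_connection():
    ln.settings.local_storage = Path("./local_artifacts")

# deprecated spelling still resolves but points users at local_storage
root_settings = ln.settings.storage_local
```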
@@ -9,35 +9,65 @@ import pandas as pd
 
 def small_dataset3_cellxgene(
     otype: Literal["DataFrame", "AnnData"] = "AnnData",
+    with_obs_defaults: bool = False,
+    with_obs_typo: bool = False,
 ) -> tuple[pd.DataFrame, dict[str, Any]] | ad.AnnData:
     # TODO: consider other ids for other organisms
     # "ENSMUSG00002076988"
     var_ids = ["invalid_ensembl_id", "ENSG00000000419", "ENSG00000139618"]
-    dataset_dict = {
-        var_ids[0]: [2, 3, 3],
-        var_ids[1]: [3, 4, 5],
-        var_ids[2]: [4, 2, 3],
-        "disease_ontology_term_id": ["MONDO:0004975", "MONDO:0004980", "MONDO:0004980"],
-        "organism": ["human", "human", "human"],
-        "sex": ["female", "male", "unknown"],
-        "sex_ontology_term_id": ["PATO:0000383", "PATO:0000384", "unknown"],
-        "tissue": ["lungg", "lungg", "heart"],
-        "donor": ["-1", "1", "2"],
-    }
-    dataset_df = pd.DataFrame(
-        dataset_dict,
+
+    lung_id = "UBERON:0002048XXX" if with_obs_typo else "UBERON:0002048"
+    obs_df = pd.DataFrame(
+        {
+            "disease_ontology_term_id": [
+                "MONDO:0004975",
+                "MONDO:0004980",
+                "MONDO:0004980",
+            ],
+            "development_stage_ontology_term_id": ["unknown", "unknown", "unknown"],
+            "organism": ["human", "human", "human"],
+            "sex_ontology_term_id": ["PATO:0000383", "PATO:0000384", "unknown"],
+            "tissue_ontology_term_id": [lung_id, lung_id, "UBERON:0000948"],
+            "cell_type": ["T cell", "B cell", "B cell"],
+            "self_reported_ethnicity": ["South Asian", "South Asian", "South Asian"],
+            "donor_id": ["-1", "1", "2"],
+            "is_primary_data": [False, False, False],
+            "suspension_type": ["cell", "cell", "cell"],
+            "tissue_type": ["tissue", "tissue", "tissue"],
+        },
         index=["barcode1", "barcode2", "barcode3"],
     )
-    dataset_df["tissue"] = dataset_df["tissue"].astype("category")
-    ad.AnnData(
-        dataset_df[var_ids],
-        obs=dataset_df[[key for key in dataset_dict if key not in var_ids]],
+
+    var_df = pd.DataFrame(
+        index=var_ids, data={"feature_is_filtered": [False, False, False]}
     )
+
+    X = pd.DataFrame(
+        {
+            var_ids[0]: [2, 3, 3],
+            var_ids[1]: [3, 4, 5],
+            var_ids[2]: [4, 2, 3],
+        },
+        index=["barcode1", "barcode2", "barcode3"],
+        dtype="float32",
+    )
+
+    obs_df["donor_id"] = obs_df["donor_id"].astype("category")
+
     if otype == "DataFrame":
-        return dataset_df
+        return pd.concat([X, obs_df], axis=1)
     else:
-        dataset_ad = ad.AnnData(dataset_df.iloc[:, :3], obs=dataset_df.iloc[:, 3:])
-        return dataset_ad
+        adata = ad.AnnData(X=X, obs=obs_df, var=var_df)
+        adata.uns["title"] = "CELLxGENE example"
+        adata.obsm["X_pca"] = np.array(
+            [[-1.2, 0.8], [0.5, -0.3], [0.7, -0.5]], dtype="float32"
+        )
+        # CELLxGENE requires the `.raw` slot to be set - https://github.com/chanzuckerberg/single-cell-curation/issues/1304
+        adata.raw = adata.copy()
+        adata.raw.var.drop(columns="feature_is_filtered", inplace=True)
+        if with_obs_defaults:
+            adata.obs["assay"] = "single-cell RNA sequencing"
+        return adata
 
 
 def anndata_with_obs() -> ad.AnnData:
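A quick way to exercise the rewritten example dataset and its new flags; this sketch assumes the function remains exposed under `lamindb.core.datasets` (the file header is missing from this diff) and that the module imports `numpy as np` for the `X_pca` block:

```python
from lamindb.core.datasets import small_dataset3_cellxgene

# AnnData with CELLxGENE-style obs/var, a .raw slot, and a toy X_pca embedding
adata = small_dataset3_cellxgene()

# deliberately malformed tissue id, useful for testing curation error paths
adata_typo = small_dataset3_cellxgene(with_obs_typo=True)
assert "UBERON:0002048XXX" in set(adata_typo.obs["tissue_ontology_term_id"])

# flat DataFrame variant: expression columns concatenated with obs columns
df = small_dataset3_cellxgene(otype="DataFrame")
```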
@@ -163,6 +163,11 @@ def _open_dataframe(
     engine: Literal["pyarrow", "polars"] = "pyarrow",
     **kwargs,
 ) -> PyArrowDataset | Iterator[PolarsLazyFrame]:
+    if engine not in {"pyarrow", "polars"}:
+        raise ValueError(
+            f"Unknown engine: {engine}. It should be 'pyarrow' or 'polars'."
+        )
+
     df_suffix: str
     if suffix is None:
         df_suffixes = _flat_suffixes(paths)
@@ -175,34 +180,37 @@
     else:
         df_suffix = suffix
 
-    if engine == "pyarrow":
-        if df_suffix not in PYARROW_SUFFIXES:
-            raise ValueError(
-                f"{df_suffix} files are not supported by pyarrow, "
-                f"they should have one of these formats: {', '.join(PYARROW_SUFFIXES)}."
-            )
-        # this checks that the filesystem is the same for all paths
-        # this is a requirement of pyarrow.dataset.dataset
-        if not isinstance(paths, Path):  # is a list then
-            fs = getattr(paths[0], "fs", None)
-            for path in paths[1:]:
-                # this assumes that the filesystems are cached by fsspec
-                if getattr(path, "fs", None) is not fs:
-                    raise ValueError(
-                        "The collection has artifacts with different filesystems, "
-                        "this is not supported by pyarrow."
-                    )
-        dataframe = _open_pyarrow_dataset(paths, **kwargs)
-    elif engine == "polars":
-        if df_suffix not in POLARS_SUFFIXES:
-            raise ValueError(
-                f"{df_suffix} files are not supported by polars, "
-                f"they should have one of these formats: {', '.join(POLARS_SUFFIXES)}."
-            )
-        dataframe = _open_polars_lazy_df(paths, **kwargs)
-    else:
+    if engine == "pyarrow" and df_suffix not in PYARROW_SUFFIXES:
         raise ValueError(
-            f"Unknown engine: {engine}. It should be 'pyarrow' or 'polars'."
+            f"{df_suffix} files are not supported by pyarrow, "
+            f"they should have one of these formats: {', '.join(PYARROW_SUFFIXES)}."
+        )
+    elif engine == "polars" and df_suffix not in POLARS_SUFFIXES:
+        raise ValueError(
+            f"{df_suffix} files are not supported by polars, "
+            f"they should have one of these formats: {', '.join(POLARS_SUFFIXES)}."
         )
 
-    return dataframe
+    polars_without_fsspec = engine == "polars" and not kwargs.get("use_fsspec", False)
+    if (engine == "pyarrow" or polars_without_fsspec) and not isinstance(paths, Path):
+        # this checks that the filesystem is the same for all paths
+        # this is a requirement of pyarrow.dataset.dataset
+        fs = getattr(paths[0], "fs", None)
+        for path in paths[1:]:
+            # this assumes that the filesystems are cached by fsspec
+            if getattr(path, "fs", None) is not fs:
+                engine_msg = (
+                    "polars engine without passing `use_fsspec=True`"
+                    if engine == "polars"
+                    else "pyarrow engine"
+                )
+                raise ValueError(
+                    "The collection has artifacts with different filesystems, "
+                    f"this is not supported for {engine_msg}."
+                )
+
+    return (
+        _open_pyarrow_dataset(paths, **kwargs)
+        if engine == "pyarrow"
+        else _open_polars_lazy_df(paths, **kwargs)
+    )
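After the refactor, unknown engines fail before suffix detection, and the single-filesystem check now also applies to polars unless `use_fsspec=True` is passed. A sketch of the private helper's semantics (the module path is not shown in this diff, so the import is omitted; `paths` stands for `UPath` objects on one filesystem):

```python
# polars: _open_dataframe returns a context manager yielding a LazyFrame
with _open_dataframe(paths, engine="polars") as lf:
    print(lf.head().collect())

# pyarrow: returns a pyarrow dataset directly
dataset = _open_dataframe(paths, engine="pyarrow")

# anything else now raises immediately
_open_dataframe(paths, engine="duckdb")  # ValueError: Unknown engine: duckdb ...
```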
@@ -4,6 +4,8 @@ from contextlib import contextmanager
 from pathlib import Path
 from typing import TYPE_CHECKING
 
+from lamindb_setup.core.upath import get_storage_region
+
 if TYPE_CHECKING:
     from collections.abc import Iterator
 
@@ -13,9 +15,35 @@ if TYPE_CHECKING:
 POLARS_SUFFIXES = (".parquet", ".csv", ".ndjson", ".ipc")
 
 
+def _polars_storage_options(storepath: UPath) -> dict[str, str | bool]:
+    storage_options: dict[str, str | bool] = {}
+    s3fs_options = storepath.storage_options
+
+    endpoint_url = s3fs_options.get("endpoint_url", None)
+    if endpoint_url is not None:
+        storage_options["aws_virtual_hosted_style_request"] = False
+        storage_options["aws_endpoint_url"] = endpoint_url
+        if endpoint_url.startswith("http://"):
+            storage_options["aws_allow_http"] = True
+    else:
+        storage_options["aws_region"] = get_storage_region(storepath)
+
+    if s3fs_options.get("anon", False):
+        storage_options["aws_skip_signature"] = True
+    else:
+        if "key" in s3fs_options:
+            storage_options["aws_access_key_id"] = s3fs_options["key"]
+        if "secret" in s3fs_options:
+            storage_options["aws_secret_access_key"] = s3fs_options["secret"]
+        if "token" in s3fs_options:
+            storage_options["aws_session_token"] = s3fs_options["token"]
+
+    return storage_options
+
+
 @contextmanager
 def _open_polars_lazy_df(
-    paths: UPath | list[UPath], **kwargs
+    paths: UPath | list[UPath], use_fsspec: bool = False, **kwargs
 ) -> Iterator[PolarsLazyFrame]:
     try:
         import polars as pl
@@ -38,14 +66,25 @@ def _open_polars_lazy_df(
             path_list += [p for p in path.rglob("*") if p.suffix != ""]
         else:
             path_list.append(path)
+    # assume the filesystem is the same for all
+    # it is checked in _open_dataframe
+    path0 = path_list[0]
+    storage_options = None
+    if not use_fsspec:
+        storage_options = kwargs.pop("storage_options", None)
+        if path0.protocol == "s3" and storage_options is None:
+            storage_options = _polars_storage_options(path0)
 
     open_files = []
 
     try:
         for path in path_list:
-            open_files.append(path.open(mode="rb"))
+            open_files.append(path.open(mode="rb") if use_fsspec else path.as_posix())
 
-        yield scans[path_list[0].suffix](open_files, **kwargs)
+        yield scans[path_list[0].suffix](
+            open_files, storage_options=storage_options, **kwargs
+        )
     finally:
-        for open_file in open_files:
-            open_file.close()
+        if use_fsspec:
+            for open_file in open_files:
+                open_file.close()
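`_polars_storage_options` maps the s3fs options attached to a `UPath` onto the object-store keys that polars' native readers expect, so S3 scans no longer need to go through fsspec file handles by default. Roughly, for two common cases (output values illustrative):

```python
from upath import UPath

# anonymous access to a public bucket -> skip signing, pin the region
public = UPath("s3://some-public-bucket/data.parquet", anon=True)
# _polars_storage_options(public) would yield something like:
# {"aws_region": "us-west-2", "aws_skip_signature": True}

# custom endpoint (e.g. MinIO over plain http) -> path-style requests, allow http
minio = UPath(
    "s3://bucket/data.parquet",
    endpoint_url="http://localhost:9000",
    key="minioadmin",
    secret="minioadmin",
)
# _polars_storage_options(minio) would yield something like:
# {"aws_virtual_hosted_style_request": False,
#  "aws_endpoint_url": "http://localhost:9000",
#  "aws_allow_http": True,
#  "aws_access_key_id": "minioadmin",
#  "aws_secret_access_key": "minioadmin"}
```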
@@ -8,8 +8,7 @@ import pyarrow as pa
 from anndata import AnnData, read_h5ad
 from lamin_utils import logger
 from lamindb_setup import settings as setup_settings
-from lamindb_setup.core._settings_storage import get_storage_region
-from lamindb_setup.core.upath import LocalPathClasses, create_path
+from lamindb_setup.core.upath import LocalPathClasses, create_path, get_storage_region
 from packaging import version
 
 if TYPE_CHECKING:
@@ -18,10 +18,6 @@ Modules.
 
 """
 
-from ._legacy import (  # backward compat
-    CellxGeneAnnDataCatManager,
-    PertAnnDataCatManager,
-)
 from .core import (
     AnnDataCurator,
     DataFrameCurator,
@@ -31,8 +27,6 @@ from .core import (
 )
 
 __all__ = [
-    "CellxGeneAnnDataCatManager",
-    "PertAnnDataCatManager",
     "AnnDataCurator",
     "DataFrameCurator",
    "MuDataCurator",