lamindb 1.0.4__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. lamindb/__init__.py +14 -5
  2. lamindb/_artifact.py +174 -57
  3. lamindb/_can_curate.py +27 -8
  4. lamindb/_collection.py +85 -51
  5. lamindb/_feature.py +177 -41
  6. lamindb/_finish.py +222 -81
  7. lamindb/_from_values.py +83 -98
  8. lamindb/_parents.py +4 -4
  9. lamindb/_query_set.py +59 -17
  10. lamindb/_record.py +171 -53
  11. lamindb/_run.py +4 -4
  12. lamindb/_save.py +33 -10
  13. lamindb/_schema.py +135 -38
  14. lamindb/_storage.py +1 -1
  15. lamindb/_tracked.py +106 -0
  16. lamindb/_transform.py +21 -8
  17. lamindb/_ulabel.py +5 -14
  18. lamindb/base/validation.py +2 -6
  19. lamindb/core/__init__.py +13 -14
  20. lamindb/core/_context.py +39 -36
  21. lamindb/core/_data.py +29 -25
  22. lamindb/core/_describe.py +1 -1
  23. lamindb/core/_django.py +1 -1
  24. lamindb/core/_feature_manager.py +54 -44
  25. lamindb/core/_label_manager.py +4 -4
  26. lamindb/core/_mapped_collection.py +20 -7
  27. lamindb/core/datasets/__init__.py +6 -1
  28. lamindb/core/datasets/_core.py +12 -11
  29. lamindb/core/datasets/_small.py +66 -20
  30. lamindb/core/exceptions.py +1 -90
  31. lamindb/core/loaders.py +7 -13
  32. lamindb/core/relations.py +6 -4
  33. lamindb/core/storage/_anndata_accessor.py +41 -0
  34. lamindb/core/storage/_backed_access.py +2 -2
  35. lamindb/core/storage/_pyarrow_dataset.py +25 -15
  36. lamindb/core/storage/_tiledbsoma.py +56 -12
  37. lamindb/core/storage/paths.py +41 -22
  38. lamindb/core/subsettings/_creation_settings.py +4 -16
  39. lamindb/curators/__init__.py +2168 -833
  40. lamindb/curators/_cellxgene_schemas/__init__.py +26 -0
  41. lamindb/curators/_cellxgene_schemas/schema_versions.yml +104 -0
  42. lamindb/errors.py +96 -0
  43. lamindb/integrations/_vitessce.py +3 -3
  44. lamindb/migrations/0069_squashed.py +76 -75
  45. lamindb/migrations/0075_lamindbv1_part5.py +4 -5
  46. lamindb/migrations/0082_alter_feature_dtype.py +21 -0
  47. lamindb/migrations/0083_alter_feature_is_type_alter_flextable_is_type_and_more.py +94 -0
  48. lamindb/migrations/0084_alter_schemafeature_feature_and_more.py +35 -0
  49. lamindb/migrations/0085_alter_feature_is_type_alter_flextable_is_type_and_more.py +63 -0
  50. lamindb/migrations/0086_various.py +95 -0
  51. lamindb/migrations/0087_rename__schemas_m2m_artifact_feature_sets_and_more.py +41 -0
  52. lamindb/migrations/0088_schema_components.py +273 -0
  53. lamindb/migrations/0088_squashed.py +4372 -0
  54. lamindb/models.py +423 -156
  55. {lamindb-1.0.4.dist-info → lamindb-1.1.0.dist-info}/METADATA +10 -7
  56. lamindb-1.1.0.dist-info/RECORD +95 -0
  57. lamindb/curators/_spatial.py +0 -528
  58. lamindb/migrations/0052_squashed.py +0 -1261
  59. lamindb/migrations/0053_alter_featureset_hash_alter_paramvalue_created_by_and_more.py +0 -57
  60. lamindb/migrations/0054_alter_feature_previous_runs_and_more.py +0 -35
  61. lamindb/migrations/0055_artifact_type_artifactparamvalue_and_more.py +0 -61
  62. lamindb/migrations/0056_rename_ulabel_ref_is_name_artifactulabel_label_ref_is_name_and_more.py +0 -22
  63. lamindb/migrations/0057_link_models_latest_report_and_others.py +0 -356
  64. lamindb/migrations/0058_artifact__actions_collection__actions.py +0 -22
  65. lamindb/migrations/0059_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +0 -31
  66. lamindb/migrations/0060_alter_artifact__actions.py +0 -22
  67. lamindb/migrations/0061_alter_collection_meta_artifact_alter_run_environment_and_more.py +0 -45
  68. lamindb/migrations/0062_add_is_latest_field.py +0 -32
  69. lamindb/migrations/0063_populate_latest_field.py +0 -45
  70. lamindb/migrations/0064_alter_artifact_version_alter_collection_version_and_more.py +0 -33
  71. lamindb/migrations/0065_remove_collection_feature_sets_and_more.py +0 -22
  72. lamindb/migrations/0066_alter_artifact__feature_values_and_more.py +0 -352
  73. lamindb/migrations/0067_alter_featurevalue_unique_together_and_more.py +0 -20
  74. lamindb/migrations/0068_alter_artifactulabel_unique_together_and_more.py +0 -20
  75. lamindb/migrations/0069_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +0 -1294
  76. lamindb-1.0.4.dist-info/RECORD +0 -102
  77. {lamindb-1.0.4.dist-info → lamindb-1.1.0.dist-info}/LICENSE +0 -0
  78. {lamindb-1.0.4.dist-info → lamindb-1.1.0.dist-info}/WHEEL +0 -0
lamindb/core/_mapped_collection.py CHANGED
@@ -87,7 +87,7 @@ class MappedCollection:
         obs_keys: Keys from the ``.obs`` slots.
         obs_filter: Select only observations with these values for the given obs columns.
             Should be a dictionary with obs column names as keys
-            and filtering values (a string or a tuple of strings) as values.
+            and filtering values (a string or a list of strings) as values.
         join: `"inner"` or `"outer"` virtual joins. If ``None`` is passed,
             does not join.
         encode_labels: Encode labels into integers.
@@ -106,7 +106,7 @@ class MappedCollection:
         layers_keys: str | list[str] | None = None,
         obs_keys: str | list[str] | None = None,
         obsm_keys: str | list[str] | None = None,
-        obs_filter: dict[str, str | tuple[str, ...]] | None = None,
+        obs_filter: dict[str, str | list[str]] | None = None,
         join: Literal["inner", "outer"] | None = "inner",
         encode_labels: bool | list[str] = True,
         unknown_label: str | dict[str, str] | None = None,
@@ -184,9 +184,14 @@ class MappedCollection:
         if self.filtered:
             indices_storage_mask = None
             for obs_filter_key, obs_filter_values in obs_filter.items():
-                obs_filter_mask = np.isin(
-                    self._get_labels(store, obs_filter_key), obs_filter_values
-                )
+                if isinstance(obs_filter_values, tuple):
+                    obs_filter_values = list(obs_filter_values)
+                elif not isinstance(obs_filter_values, list):
+                    obs_filter_values = [obs_filter_values]
+                obs_labels = self._get_labels(store, obs_filter_key)
+                obs_filter_mask = np.isin(obs_labels, obs_filter_values)
+                if pd.isna(obs_filter_values).any():
+                    obs_filter_mask |= pd.isna(obs_labels)
                 if indices_storage_mask is None:
                     indices_storage_mask = obs_filter_mask
                 else:
@@ -296,7 +301,7 @@ class MappedCollection:
         self.var_joint = reduce(pd.Index.intersection, self.var_list)
         if len(self.var_joint) == 0:
             raise ValueError(
-                "The provided AnnData objects don't have shared varibales.\n"
+                "The provided AnnData objects don't have shared variables.\n"
                 "Use join='outer'."
             )
         self.var_indices = [
@@ -389,7 +394,7 @@ class MappedCollection:
             else:
                 cats = None
             label_idx = self._get_obs_idx(store, obs_idx, label, cats)
-            if label in self.encoders:
+            if label in self.encoders and label_idx is not np.nan:
                label_idx = self.encoders[label][label_idx]
            out[label] = label_idx
        return out
@@ -453,6 +458,8 @@ class MappedCollection:
            label = labels[idx]
        else:
            label = labels["codes"][idx]
+            if label == -1:
+                return np.nan
        if categories is not None:
            cats = categories
        else:
@@ -589,7 +596,13 @@ class MappedCollection:
        cats = self._get_categories(storage, label_key)
        if cats is not None:
            cats = _decode(cats) if isinstance(cats[0], bytes) else cats
+            # NaN is coded as -1
+            nans = labels == -1
            labels = cats[labels]
+            # detect and replace nans
+            if nans.any():
+                labels[nans] = np.nan
+
        return labels

    def close(self):
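In practice, the obs_filter changes above mean filter values can now be passed as plain lists (tuples are still accepted and converted), and labels that decode to NaN are no longer pushed through the encoders. A minimal usage sketch, assuming a saved Collection of AnnData artifacts with a hypothetical key "my-anndata-collection":

```python
import lamindb as ln

collection = ln.Collection.get(key="my-anndata-collection")  # hypothetical key

# lists (not only tuples) are now accepted as obs_filter values
with collection.mapped(
    obs_keys=["cell_medium"],
    obs_filter={"cell_medium": ["DMSO", "IFNG"]},
    join="inner",
) as dataset:
    print(len(dataset))       # number of observations passing the filter
    print(dataset[0].keys())  # the expression row plus the requested obs keys
```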
lamindb/core/datasets/__init__.py CHANGED
@@ -85,4 +85,9 @@ from ._core import (
     schmidt22_perturbseq,
 )
 from ._fake import fake_bio_notebook_titles
-from ._small import anndata_with_obs, small_dataset1, small_dataset2
+from ._small import (
+    anndata_with_obs,
+    small_dataset1,
+    small_dataset2,
+    small_dataset3_cellxgene,
+)
lamindb/core/datasets/_core.py CHANGED
@@ -18,7 +18,8 @@ if TYPE_CHECKING:
 def file_fcs() -> Path:
     """Example FCS artifact."""
     filepath, _ = urlretrieve(
-        "https://lamindb-test.s3.amazonaws.com/example.fcs", "example.fcs"
+        "https://lamindb-dev-datasets.s3.amazonaws.com/.lamindb/DBNEczSgBui0bbzBXMGH.fcs",
+        "example.fcs",
     )
     return Path(filepath)

@@ -48,8 +49,8 @@ def file_fcs_alpert19(populate_registries: bool = False) -> Path:  # pragma: no
             bt.CellMarker.public().inspect(std, "name").validated, "name"
         )
     )
-    ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()
-    ln.Feature(name="organism", dtype=[bt.Organism]).save()
+    ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()  # type: ignore
+    ln.Feature(name="organism", dtype=[bt.Organism]).save()  # type: ignore
     ln.settings.verbosity = verbosity
     return Path(filepath)

@@ -84,8 +85,8 @@ def file_tsv_rnaseq_nfcore_salmon_merged_gene_counts(

     verbosity = ln.settings.verbosity
     ln.settings.verbosity = "error"
-    ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()
-    ln.Feature(name="organism", dtype=[bt.Organism]).save()
+    ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()  # type: ignore
+    ln.Feature(name="organism", dtype=[bt.Organism]).save()  # type: ignore
     bt.ExperimentalFactor.from_source(ontology_id="EFO:0008896").save()
     ln.settings.verbosity = verbosity

@@ -207,7 +208,7 @@ def anndata_mouse_sc_lymph_node(
     # cell types
     ln.save(bt.CellType.from_values(["CL:0000115", "CL:0000738"], "ontology_id"))
     # assays
-    ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()
+    ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()  # type: ignore
     bt.ExperimentalFactor.from_source(ontology_id="EFO:0008913").save()
     # genes
     validated = bt.Gene.public(organism="mouse").validate(
@@ -330,11 +331,11 @@ def anndata_human_immune_cells(
     ln.save(bt.CellType.from_values(adata.obs.cell_type, field="name"))
     ln.save(bt.ExperimentalFactor.from_values(adata.obs.assay, field="name"))
     ln.save(bt.Tissue.from_values(adata.obs.tissue, field="name"))
-    ln.Feature(name="cell_type", dtype=[bt.CellType]).save()
-    ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()
-    ln.Feature(name="tissue", dtype=[bt.Tissue]).save()
-    ln.Feature(name="organism", dtype=[bt.Organism]).save()
-    ln.Feature(name="donor", dtype=[ln.ULabel]).save()
+    ln.Feature(name="cell_type", dtype=[bt.CellType]).save()  # type: ignore
+    ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()  # type: ignore
+    ln.Feature(name="tissue", dtype=[bt.Tissue]).save()  # type: ignore
+    ln.Feature(name="organism", dtype=[bt.Organism]).save()  # type: ignore
+    ln.Feature(name="donor", dtype=[ln.ULabel]).save()  # type: ignore
     bt.ExperimentalFactor.from_source(ontology_id="EFO:0008913").save()
     ln.save([ln.ULabel(name=name) for name in adata.obs.donor.unique()])
     ln.settings.verbosity = verbosity
lamindb/core/datasets/_small.py CHANGED
@@ -8,20 +8,25 @@ import pandas as pd


 def small_dataset1(
-    format: Literal["df", "anndata"],
+    otype: Literal["DataFrame", "AnnData"],
+    gene_symbols_in_index: bool = False,
     with_typo: bool = False,
-) -> tuple[pd.DataFrame, dict[str, Any]] | ad.AnnData:
+) -> pd.DataFrame | ad.AnnData:
     # define the data in the dataset
     # it's a mix of numerical measurements and observation-level metadata
     ifng = "IFNJ" if with_typo else "IFNG"
+    if gene_symbols_in_index:
+        var_ids = ["CD8A", "CD4", "CD14"]
+    else:
+        var_ids = ["ENSG00000153563", "ENSG00000010610", "ENSG00000170458"]
     dataset_dict = {
-        "CD8A": [1, 2, 3],
-        "CD4": [3, 4, 5],
-        "CD14": [5, 6, 7],
-        "cell_medium": ["DMSO", ifng, "DMSO"],
+        var_ids[0]: [1, 2, 3],
+        var_ids[1]: [3, 4, 5],
+        var_ids[2]: [5, 6, 7],
+        "cell_medium": pd.Categorical(["DMSO", ifng, "DMSO"]),
         "sample_note": ["was ok", "looks naah", "pretty! 🤩"],
-        "cell_type_by_expert": ["B cell", "T cell", "T cell"],
-        "cell_type_by_model": ["B cell", "T cell", "T cell"],
+        "cell_type_by_expert": pd.Categorical(["B cell", "T cell", "T cell"]),
+        "cell_type_by_model": pd.Categorical(["B cell", "T cell", "T cell"]),
     }
     # define the dataset-level metadata
     metadata = {
@@ -32,8 +37,10 @@ def small_dataset1(
     }
     # the dataset as DataFrame
     dataset_df = pd.DataFrame(dataset_dict, index=["sample1", "sample2", "sample3"])
-    if format == "df":
-        return dataset_df, metadata
+    if otype == "DataFrame":
+        for key, value in metadata.items():
+            dataset_df.attrs[key] = value
+        return dataset_df
     else:
         dataset_ad = ad.AnnData(
             dataset_df.iloc[:, :3], obs=dataset_df.iloc[:, 3:], uns=metadata
@@ -42,14 +49,19 @@ def small_dataset1(


 def small_dataset2(
-    format: Literal["df", "anndata"],
-) -> tuple[pd.DataFrame, dict[str, Any]] | ad.AnnData:
+    otype: Literal["DataFrame", "AnnData"],
+    gene_symbols_in_index: bool = False,
+) -> pd.DataFrame | ad.AnnData:
+    if gene_symbols_in_index:
+        var_ids = ["CD8A", "CD4", "CD38"]
+    else:
+        var_ids = ["ENSG00000153563", "ENSG00000010610", "ENSG00000004468"]
     dataset_dict = {
-        "CD8A": [2, 3, 3],
-        "CD4": [3, 4, 5],
-        "CD38": [4, 2, 3],
-        "cell_medium": ["DMSO", "IFNG", "IFNG"],
-        "cell_type_by_model": ["B cell", "T cell", "T cell"],
+        var_ids[0]: [2, 3, 3],
+        var_ids[1]: [3, 4, 5],
+        var_ids[2]: [4, 2, 3],
+        "cell_medium": pd.Categorical(["DMSO", "IFNG", "IFNG"]),
+        "cell_type_by_model": pd.Categorical(["B cell", "T cell", "T cell"]),
     }
     metadata = {
         "temperature": 22.6,
@@ -61,11 +73,13 @@ def small_dataset2(
         index=["sample4", "sample5", "sample6"],
     )
     ad.AnnData(
-        dataset_df[["CD8A", "CD4", "CD38"]],
+        dataset_df[var_ids],
         obs=dataset_df[["cell_medium", "cell_type_by_model"]],
     )
-    if format == "df":
-        return dataset_df, metadata
+    if otype == "DataFrame":
+        for key, value in metadata.items():
+            dataset_df.attrs[key] = value
+        return dataset_df
     else:
         dataset_ad = ad.AnnData(
             dataset_df.iloc[:, :3], obs=dataset_df.iloc[:, 3:], uns=metadata
@@ -73,6 +87,38 @@ def small_dataset2(
         return dataset_ad


+def small_dataset3_cellxgene(
+    otype: Literal["DataFrame", "AnnData"] = "AnnData",
+) -> tuple[pd.DataFrame, dict[str, Any]] | ad.AnnData:
+    # TODO: consider other ids for other organisms
+    # "ENSMUSG00002076988"
+    var_ids = ["invalid_ensembl_id", "ENSG00000000419", "ENSG00000139618"]
+    dataset_dict = {
+        var_ids[0]: [2, 3, 3],
+        var_ids[1]: [3, 4, 5],
+        var_ids[2]: [4, 2, 3],
+        "disease_ontology_term_id": ["MONDO:0004975", "MONDO:0004980", "MONDO:0004980"],
+        "organism": ["human", "human", "human"],
+        "sex": ["female", "male", "unknown"],
+        "tissue": ["lungg", "lungg", "heart"],
+        "donor": ["-1", "1", "2"],
+    }
+    dataset_df = pd.DataFrame(
+        dataset_dict,
+        index=["barcode1", "barcode2", "barcode3"],
+    )
+    dataset_df["tissue"] = dataset_df["tissue"].astype("category")
+    ad.AnnData(
+        dataset_df[var_ids],
+        obs=dataset_df[[key for key in dataset_dict if key not in var_ids]],
+    )
+    if otype == "DataFrame":
+        return dataset_df
+    else:
+        dataset_ad = ad.AnnData(dataset_df.iloc[:, :3], obs=dataset_df.iloc[:, 3:])
+        return dataset_ad
+
+
 def anndata_with_obs() -> ad.AnnData:
     """Create a mini anndata with cell_type, disease and tissue."""
     import anndata as ad
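For orientation (not part of the diff itself): the reworked generators take `otype` instead of `format`, return a bare DataFrame with the dataset-level metadata tucked into `.attrs`, and can optionally use gene symbols rather than Ensembl ids as variable names. A sketch, assuming the documented access path `ln.core.datasets`:

```python
import lamindb as ln

# DataFrame flavor: metadata now travels in df.attrs rather than a separate dict
df = ln.core.datasets.small_dataset1(otype="DataFrame")
print(df.attrs)        # dataset-level metadata as defined in the module
print(df.columns[:3])  # Ensembl gene ids by default

# AnnData flavor with gene symbols in the var index
adata = ln.core.datasets.small_dataset2(otype="AnnData", gene_symbols_in_index=True)
print(adata.var_names.tolist())  # ["CD8A", "CD4", "CD38"]

# the new CELLxGENE-flavored dataset carries intentionally invalid values
adata3 = ln.core.datasets.small_dataset3_cellxgene()
print(adata3.obs["tissue"].unique())  # includes the misspelled term "lungg"
```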
lamindb/core/exceptions.py CHANGED
@@ -1,90 +1 @@
-"""Exceptions.
-
-.. autosummary::
-   :toctree: .
-
-   InvalidArgument
-   DoesNotExist
-   ValidationError
-   NotebookNotSaved
-   MissingContextUID
-   UpdateContext
-   IntegrityError
-   RecordNameChangeIntegrityError
-
-"""
-
-# inheriting from SystemExit has the sole purpose of suppressing
-# the traceback - this isn't optimal but the current best solution
-# https://laminlabs.slack.com/archives/C04A0RMA0SC/p1726856875597489
-
-
-class InvalidArgument(SystemExit):
-    """Invalid method or function argument."""
-
-    pass
-
-
-class TrackNotCalled(SystemExit):
-    """`ln.track()` wasn't called."""
-
-    pass
-
-
-class NotebookNotSaved(SystemExit):
-    """Notebook wasn't saved."""
-
-    pass
-
-
-class ValidationError(SystemExit):
-    """Validation error: not mapped in registry."""
-
-    pass
-
-
-# inspired by Django's DoesNotExist
-# equivalent to SQLAlchemy's NoResultFound
-class DoesNotExist(SystemExit):
-    """No record found."""
-
-    pass
-
-
-class InconsistentKey(Exception):
-    """Inconsistent transform or artifact `key`."""
-
-    pass
-
-
-class RecordNameChangeIntegrityError(SystemExit):
-    """Custom exception for name change errors."""
-
-    pass
-
-
-# -------------------------------------------------------------------------------------
-# run context
-# -------------------------------------------------------------------------------------
-
-
-class IntegrityError(Exception):
-    """Integrity error.
-
-    For instance, it's not allowed to delete artifacts outside managed storage
-    locations.
-    """
-
-    pass
-
-
-class MissingContextUID(SystemExit):
-    """User didn't define transform settings."""
-
-    pass
-
-
-class UpdateContext(SystemExit):
-    """Transform settings require update."""
-
-    pass
+from ..errors import *  # noqa: F403 backward compat
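The net effect of this hunk: the exception classes now live in the new top-level `lamindb/errors.py` (added in the file list above), and `lamindb.core.exceptions` only star-re-exports them for backward compatibility. A sketch of the two import paths, assuming names such as `ValidationError` and `DoesNotExist` are defined unchanged in `lamindb.errors`:

```python
# new canonical location
from lamindb.errors import DoesNotExist, ValidationError

# old location keeps working through the star re-export shown above
from lamindb.core.exceptions import ValidationError as LegacyValidationError

# both names refer to the same class object
assert ValidationError is LegacyValidationError
```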
lamindb/core/loaders.py CHANGED
@@ -40,7 +40,7 @@ try:
 except ImportError:

     def load_anndata_zarr(storepath):  # type: ignore
-        raise ImportError("Please install zarr: pip install zarr")
+        raise ImportError("Please install zarr: pip install zarr<=2.18.4")


 is_run_from_ipython = getattr(builtins, "__IPYTHON__", False)
@@ -109,19 +109,13 @@ def load_json(path: UPathStr) -> dict:
     return data


-def load_yaml(path: UPathStr) -> dict | UPathStr:
+def load_yaml(path: UPathStr) -> dict:
     """Load `.yaml` to `dict`."""
-    try:
-        import yaml  # type: ignore
-
-        with open(path) as f:
-            data = yaml.safe_load(f)
-        return data
-    except ImportError:
-        logger.warning(
-            "Please install PyYAML (`pip install PyYAML`) to load `.yaml` files."
-        )
-        return path
+    import yaml  # type: ignore
+
+    with open(path) as f:
+        data = yaml.safe_load(f)
+    return data


 def load_image(path: UPathStr) -> None | UPathStr:
lamindb/core/relations.py CHANGED
@@ -8,7 +8,7 @@ from lamindb_setup._connect_instance import (
 )
 from lamindb_setup.core._settings_store import instance_settings_file

-from lamindb.models import LinkORM, Record, Schema
+from lamindb.models import LinkORM, Record, Registry, Schema


 def get_schema_modules(instance: str | None) -> set[str]:
@@ -35,9 +35,11 @@ def get_schema_modules(instance: str | None) -> set[str]:
     return shared_schema_modules


+# this function here should likely be renamed
+# it maps the __get_name_with_module__() onto the actual model
 def dict_module_name_to_model_name(
-    registry: type[Record], instance: str | None = None
-) -> dict[str, Record]:
+    registry: Registry, instance: str | None = None
+) -> dict[str, Registry]:
     schema_modules = get_schema_modules(instance)
     d: dict = {
         i.related_model.__get_name_with_module__(): i.related_model
@@ -92,7 +94,7 @@ def get_related_name(features_type: type[Record]) -> str:
         f"Can't create feature sets from {features_type.__name__} because it's not"
         " related to it!\nYou need to create a link model between Schema and"
         " your Record in your custom module.\nTo do so, add a"
-        " line:\n_schemas_m2m = models.ManyToMany(Schema,"
+        " line:\n_feature_sets = models.ManyToMany(Schema,"
         " related_name='mythings')\n"
     )
     return candidates[0]
lamindb/core/storage/_anndata_accessor.py CHANGED
@@ -19,6 +19,7 @@ from fsspec.implementations.local import LocalFileSystem
 from lamin_utils import logger
 from lamindb_setup.core.upath import create_mapper, infer_filesystem
 from packaging import version
+from upath import UPath

 if TYPE_CHECKING:
     from collections.abc import Mapping
@@ -741,3 +742,43 @@ class AnnDataAccessor(_AnnDataAttrsMixin):
         return AnnDataRawAccessor(
             self.storage["raw"], None, None, self._obs_names, None, self.shape[0]
         )
+
+
+# get the number of observations in an anndata object or file fast and safely
+def _anndata_n_observations(object: UPathStr | AnnData) -> int | None:
+    if isinstance(object, AnnData):
+        return object.n_obs
+
+    try:
+        objectpath = UPath(object)
+        suffix = objectpath.suffix
+        conn_module = {".h5ad": "h5py", ".zarr": "zarr"}.get(suffix, suffix[1:])
+        conn, storage = registry.open(conn_module, objectpath, mode="r")
+    except Exception as e:
+        logger.warning(f"Could not open {object} to read n_observations: {e}")
+        return None
+
+    n_observations: int | None = None
+    try:
+        obs = storage["obs"]
+        if isinstance(obs, GroupTypes):  # type: ignore
+            if "_index" in obs.attrs:
+                elem_key = _read_attr(obs.attrs, "_index")
+            else:
+                elem_key = next(iter(obs))
+            elem = obs[elem_key]
+            if isinstance(elem, ArrayTypes):  # type: ignore
+                n_observations = elem.shape[0]
+            else:
+                # assume standard obs group
+                n_observations = elem["codes"].shape[0]
+        else:
+            n_observations = obs.shape[0]
+    except Exception as e:
+        logger.warning(f"Could not read n_observations from anndata {object}: {e}")
+    finally:
+        if hasattr(storage, "close"):
+            storage.close()
+        if hasattr(conn, "close"):
+            conn.close()
+    return n_observations
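The helper added above reads only the `obs` group rather than loading the whole object: it takes the length of the index column, or of the `codes` array when that column is categorical. A rough standalone sketch of the same idea for a local `.h5ad` file using `h5py` directly (hypothetical file name; the real helper goes through the connection registry and therefore also covers zarr and remote stores):

```python
import h5py

def n_obs_h5ad(path: str) -> int:
    """Count observations without loading the AnnData object into memory."""
    with h5py.File(path, mode="r") as f:
        obs = f["obs"]
        # anndata records the name of the obs index column in the "_index" attribute
        index_key = obs.attrs.get("_index", next(iter(obs)))
        if isinstance(index_key, bytes):
            index_key = index_key.decode()
        elem = obs[index_key]
        if isinstance(elem, h5py.Group):
            # categorical columns are groups holding "codes" and "categories"
            return elem["codes"].shape[0]
        return elem.shape[0]

print(n_obs_h5ad("example.h5ad"))  # hypothetical path
```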
lamindb/core/storage/_backed_access.py CHANGED
@@ -94,8 +94,8 @@ def backed_access(
         return _open_pyarrow_dataset(objectpath)
     else:
         raise ValueError(
-            "object should have .h5, .hdf5, .h5ad, .zarr, .tiledbsoma suffix, not"
-            f" {suffix}."
+            "The object should have .h5, .hdf5, .h5ad, .zarr, .tiledbsoma suffix "
+            f"or be compatible with pyarrow.dataset.dataset, instead of being {suffix} object."
         )

     is_anndata = suffix == ".h5ad" or get_spec(storage).encoding_type == "anndata"
lamindb/core/storage/_pyarrow_dataset.py CHANGED
@@ -6,26 +6,36 @@ import pyarrow.dataset
 from lamindb_setup.core.upath import LocalPathClasses

 if TYPE_CHECKING:
+    from pyarrow.dataset import Dataset as PyArrowDataset
     from upath import UPath


-PYARROW_SUFFIXES = (".parquet", ".csv", ".json", ".orc", ".arrow", ".feather")
+PYARROW_SUFFIXES = (".parquet", ".csv", ".json", ".orc", ".arrow", ".feather", ".ipc")


-def _is_pyarrow_dataset(path: UPath) -> bool:
-    # it is assumed here that path exists
-    if path.is_file():
-        return path.suffix in PYARROW_SUFFIXES
+def _is_pyarrow_dataset(paths: UPath | list[UPath]) -> bool:
+    # it is assumed here that the paths exist
+    # we don't check here that the filesystem is the same
+    # but this is a requirement for pyarrow.dataset.dataset
+    if isinstance(paths, list):
+        suffixes = {path.suffix for path in paths}
+    elif paths.is_file():
+        suffixes = {paths.suffix}
     else:
-        objects = path.rglob("*")
-        suffixes = {object.suffix for object in objects if object.suffix != ""}
-        return len(suffixes) == 1 and suffixes.pop() in PYARROW_SUFFIXES
-
-
-def _open_pyarrow_dataset(path: UPath) -> pyarrow.dataset.Dataset:
-    if isinstance(path, LocalPathClasses):
-        path_str, filesystem = path.as_posix(), None
+        suffixes = {path.suffix for path in paths.rglob("*") if path.suffix != ""}
+    return len(suffixes) == 1 and suffixes.pop() in PYARROW_SUFFIXES
+
+
+def _open_pyarrow_dataset(paths: UPath | list[UPath]) -> PyArrowDataset:
+    if isinstance(paths, list):
+        path0 = paths[0]
+        if isinstance(path0, LocalPathClasses):
+            paths_str, filesystem = [path.as_posix() for path in paths], None
+        else:
+            paths_str, filesystem = [path.path for path in paths], path0.fs
+    elif isinstance(paths, LocalPathClasses):
+        paths_str, filesystem = paths.as_posix(), None
     else:
-        path_str, filesystem = path.path, path.fs
+        paths_str, filesystem = paths.path, paths.fs

-    return pyarrow.dataset.dataset(path_str, filesystem=filesystem)
+    return pyarrow.dataset.dataset(paths_str, filesystem=filesystem)
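The reworked helpers lean on the fact that `pyarrow.dataset.dataset` accepts a list of file paths on a single filesystem. A small sketch of that underlying behavior with local parquet files (hypothetical file names):

```python
import pyarrow as pa
import pyarrow.dataset
import pyarrow.parquet as pq

# write two small parquet shards (hypothetical files)
pq.write_table(pa.table({"x": [1, 2]}), "shard1.parquet")
pq.write_table(pa.table({"x": [3, 4]}), "shard2.parquet")

# one logical dataset over several files, as _open_pyarrow_dataset now builds for lists
ds = pyarrow.dataset.dataset(["shard1.parquet", "shard2.parquet"], format="parquet")
print(ds.to_table().to_pandas())  # four rows drawn from both shards
```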