scdataloader 1.8.0__tar.gz → 1.9.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: scdataloader
-Version: 1.8.0
+Version: 1.9.0
 Summary: a dataloader for single cell data in lamindb
 Project-URL: repository, https://github.com/jkobject/scDataLoader
 Author-email: jkobject <jkobject@gmail.com>
@@ -14,13 +14,14 @@ Requires-Dist: cellxgene-census>=0.1.0
 Requires-Dist: django>=4.0.0
 Requires-Dist: harmonypy>=0.0.10
 Requires-Dist: ipykernel>=6.20.0
+Requires-Dist: jupytext>=1.16.0
 Requires-Dist: lamindb[bionty,cellregistry,jupyter,ourprojects,zarr]<2,>=1.0.4
 Requires-Dist: leidenalg>=0.8.0
-Requires-Dist: lightning>=2.0.0
 Requires-Dist: matplotlib>=3.5.0
 Requires-Dist: numpy==1.26.0
 Requires-Dist: palantir>=1.3.3
 Requires-Dist: pandas>=2.0.0
+Requires-Dist: pytorch-lightning>=2.3.0
 Requires-Dist: scikit-misc>=0.5.0
 Requires-Dist: seaborn>=0.11.0
 Requires-Dist: torch==2.2.0
@@ -1,6 +1,6 @@
 [project]
 name = "scdataloader"
-version = "1.8.0"
+version = "1.9.0"
 description = "a dataloader for single cell data in lamindb"
 authors = [
     {name = "jkobject", email = "jkobject@gmail.com"}
@@ -14,7 +14,7 @@ dependencies = [
     "lamindb[bionty,ourprojects,jupyter,cellregistry,zarr]>=1.0.4,<2",
     "cellxgene-census>=0.1.0",
     "torch==2.2.0",
-    "lightning>=2.0.0",
+    "pytorch-lightning>=2.3.0",
     "anndata>=0.9.0",
     "zarr>=2.10.0",
     "matplotlib>=3.5.0",
@@ -28,6 +28,8 @@ dependencies = [
     "scikit-misc>=0.5.0",
     "palantir>=1.3.3",
     "harmonypy>=0.0.10",
+    "jupytext>=1.16.0",
+
 ]
 
 [project.optional-dependencies]
@@ -0,0 +1 @@
+1.9.0
@@ -2,5 +2,6 @@ from .collator import Collator
 from .data import Dataset, SimpleAnnDataset
 from .datamodule import DataModule
 from .preprocess import Preprocessor
+from importlib.metadata import version
 
-__version__ = "1.7.0"
+__version__ = version("scdataloader")
@@ -24,7 +24,6 @@ class Collator:
         genelist: list[str] = [],
         downsample: Optional[float] = None, # don't use it for training!
         save_output: Optional[str] = None,
-        metacell_mode: bool = False,
     ):
         """
         This class is responsible for collating data for the scPRINT model. It handles the
@@ -62,7 +61,6 @@ class Collator:
                 This is usually done by the scPRINT model during training but this option allows you to do it directly from the collator
             save_output (str, optional): If not None, saves the output to a file. Defaults to None.
                 This is mainly for debugging purposes
-            metacell_mode (bool, optional): Whether to sample a metacell. Defaults to False.
         """
         self.organisms = organisms
         self.genedf = load_genes(organisms)
@@ -82,7 +80,6 @@ class Collator:
         self.accepted_genes = {}
         self.downsample = downsample
         self.to_subset = {}
-        self.metacell_mode = metacell_mode
         self._setup(org_to_id, valid_genes, genelist)
 
     def _setup(self, org_to_id=None, valid_genes=[], genelist=[]):
@@ -135,6 +132,7 @@ class Collator:
         dataset = []
         nnz_loc = []
         is_meta = []
+        knn_cells = []
         for elem in batch:
             organism_id = elem[self.organism_name]
             if organism_id not in self.organism_ids:
@@ -145,10 +143,20 @@ class Collator:
             total_count.append(expr.sum())
             if len(self.accepted_genes) > 0:
                 expr = expr[self.accepted_genes[organism_id]]
+                if "knn_cells" in elem:
+                    elem["knn_cells"] = elem["knn_cells"][
+                        :, self.accepted_genes[organism_id]
+                    ]
             if self.how == "most expr":
-                nnz_loc = np.where(expr > 0)[0]
-                ma = self.max_len if self.max_len < len(nnz_loc) else len(nnz_loc)
-                loc = np.argsort(expr)[-(ma):][::-1]
+                if "knn_cells" in elem:
+                    nnz_loc = np.where(expr + elem["knn_cells"].sum(0) > 0)[0]
+                    ma = self.max_len if self.max_len < len(nnz_loc) else len(nnz_loc)
+                    loc = np.argsort(expr + elem["knn_cells"].mean(0))[-(ma):][::-1]
+                else:
+                    nnz_loc = np.where(expr > 0)[0]
+                    ma = self.max_len if self.max_len < len(nnz_loc) else len(nnz_loc)
+                    loc = np.argsort(expr)[-(ma):][::-1]
                 # nnz_loc = [1] * 30_000
                 # loc = np.argsort(expr)[-(self.max_len) :][::-1]
             elif self.how == "random expr":
@@ -171,33 +179,49 @@ class Collator:
                 "all",
                 "some",
             ]:
-                zero_loc = np.where(expr == 0)[0]
-                zero_loc = zero_loc[
-                    np.random.choice(
-                        len(zero_loc),
-                        self.add_zero_genes
-                        + (
-                            0
-                            if self.max_len < len(nnz_loc)
-                            else self.max_len - len(nnz_loc)
-                        ),
-                        replace=False,
-                    )
-                ]
+                if "knn_cells" in elem:
+                    # we complete with genes expressed in the knn
+                    nnz_loc = np.where(elem["knn_cells"].sum(0) > 0)[0]
+                    ma = self.max_len if self.max_len < len(nnz_loc) else len(nnz_loc)
+                    # which is not a zero_loc in this context
+                    zero_loc = np.argsort(elem["knn_cells"].sum(0))[-(ma):][::-1]
+                else:
+                    zero_loc = np.where(expr == 0)[0]
+                    zero_loc = zero_loc[
+                        np.random.choice(
+                            len(zero_loc),
+                            self.add_zero_genes
+                            + (
+                                0
+                                if self.max_len < len(nnz_loc)
+                                else self.max_len - len(nnz_loc)
+                            ),
+                            replace=False,
+                        )
+                    ]
                 loc = np.concatenate((loc, zero_loc), axis=None)
             expr = expr[loc]
-            loc = loc + self.start_idx[organism_id]
+            if "knn_cells" in elem:
+                elem["knn_cells"] = elem["knn_cells"][:, loc]
             if self.how == "some":
+                if "knn_cells" in elem:
+                    elem["knn_cells"] = elem["knn_cells"][
+                        :, self.to_subset[organism_id]
+                    ]
                 expr = expr[self.to_subset[organism_id]]
                 loc = loc[self.to_subset[organism_id]]
             exprs.append(expr)
-            gene_locs.append(loc)
+            if "knn_cells" in elem:
+                knn_cells.append(elem["knn_cells"])
+            # then we need to add the start_idx to the loc to give it the correct index
+            # according to the model
+            gene_locs.append(loc + self.start_idx[organism_id])
 
             if self.tp_name is not None:
                 tp.append(elem[self.tp_name])
             else:
                 tp.append(0)
-            if self.metacell_mode:
+            if "is_meta" in elem:
                 is_meta.append(elem["is_meta"])
             other_classes.append([elem[i] for i in self.class_names])
         expr = np.array(exprs)
@@ -207,6 +231,7 @@ class Collator:
         other_classes = np.array(other_classes)
         dataset = np.array(dataset)
         is_meta = np.array(is_meta)
+        knn_cells = np.array(knn_cells)
         # normalize counts
         if self.norm_to is not None:
             expr = (expr * self.norm_to) / total_count[:, None]
@@ -217,15 +242,6 @@ class Collator:
         if self.n_bins:
             pass
 
-        # find the associated gene ids (given the species)
-
-        # get the NN cells
-
-        # do encoding / selection a la scGPT
-
-        # do encoding of graph location
-        # encode all the edges in some sparse way
-        # normalizing total counts between 0,1
         ret = {
             "x": Tensor(expr),
             "genes": Tensor(gene_locs).int(),
@@ -233,8 +249,10 @@ class Collator:
             "tp": Tensor(tp),
             "depth": Tensor(total_count),
         }
-        if self.metacell_mode:
+        if len(is_meta) > 0:
             ret.update({"is_meta": Tensor(is_meta).int()})
+        if len(knn_cells) > 0:
+            ret.update({"knn_cells": Tensor(knn_cells)})
         if len(dataset) > 0:
             ret.update({"dataset": Tensor(dataset).to(long)})
         if self.downsample is not None:
@@ -242,6 +260,8 @@ class Collator:
         if self.save_output is not None:
             with open(self.save_output, "a") as f:
                 np.savetxt(f, ret["x"].numpy())
+            with open(self.save_output + "_loc", "a") as f:
+                np.savetxt(f, gene_locs)
         return ret
 
 
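Taken together, the collated batch keeps its previous keys and gains "knn_cells" when neighbors are loaded, while "is_meta" is now driven by the items themselves rather than a collator flag. A hedged usage sketch (it assumes `collator` is a configured Collator and `batch` a list of items produced by the mapped dataset):

    ret = collator(batch)
    print(ret["x"].shape, ret["genes"].shape)  # per the code above, one row per cell, max_len columns
    if "knn_cells" in ret:
        print(ret["knn_cells"].shape)          # one (neighbors x genes) block per cell
    if "is_meta" in ret:
        print(ret["is_meta"])                  # 1 where the item was sampled/flagged as a metacell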
@@ -118,7 +118,7 @@ MAIN_HUMAN_MOUSE_DEV_STAGE_MAP = {
     ],
     "HsapDv:0000258": [ # mature stage
         "MmusDv:0000110", # mature stage
-        "HsapDv:0000204",
+        "HsapDv:0000204", #
     ],
     "HsapDv:0000227": [ # late adult stage
         "MmusDv:0000091", # 20 month-old stage
@@ -58,6 +58,7 @@ class Dataset(torchDataset):
     hierarchical_clss: Optional[list[str]] = field(default_factory=list)
     join_vars: Literal["inner", "outer"] | None = None
     metacell_mode: float = 0.0
+    get_knn_cells: bool = False
 
     def __post_init__(self):
         self.mapped_dataset = mapped(
@@ -69,6 +70,7 @@ class Dataset(torchDataset):
             stream=True,
             parallel=True,
             metacell_mode=self.metacell_mode,
+            get_knn_cells=self.get_knn_cells,
         )
         print(
             "won't do any check but we recommend to have your dataset coming from local storage"
@@ -371,6 +373,7 @@ def mapped(
     is_run_input: bool | None = None,
     metacell_mode: bool = False,
     meta_assays: list[str] = ["EFO:0022857", "EFO:0010961"],
+    get_knn_cells: bool = False,
 ) -> MappedCollection:
     path_list = []
     for artifact in dataset.artifacts.all():
@@ -397,5 +400,6 @@ def mapped(
         dtype=dtype,
         meta_assays=meta_assays,
         metacell_mode=metacell_mode,
+        get_knn_cells=get_knn_cells,
     )
     return ds
@@ -52,6 +52,7 @@ class DataModule(L.LightningDataModule):
             # "EFO:0030062", # slide-seq
         ],
         metacell_mode: float = 0.0,
+        get_knn_cells: bool = False,
         modify_seed_on_requeue: bool = True,
         **kwargs,
     ):
@@ -88,6 +89,7 @@ class DataModule(L.LightningDataModule):
             metacell_mode (float, optional): The probability of using metacell mode. Defaults to 0.0.
             clss_to_predict (list, optional): List of classes to predict. Defaults to ["organism_ontology_term_id"].
             modify_seed_on_requeue (bool, optional): Whether to modify the seed on requeue. Defaults to True.
+            get_knn_cells (bool, optional): Whether to get the k-nearest neighbors of each queried cells. Defaults to False.
             **kwargs: Additional keyword arguments passed to the pytorch DataLoader.
                 see @file data.py and @file collator.py for more details about some of the parameters
         """
@@ -98,6 +100,7 @@ class DataModule(L.LightningDataModule):
             clss_to_predict=clss_to_predict,
             hierarchical_clss=hierarchical_clss,
             metacell_mode=metacell_mode,
+            get_knn_cells=get_knn_cells,
         )
         # and location
         self.metacell_mode = bool(metacell_mode)
@@ -157,7 +160,6 @@ class DataModule(L.LightningDataModule):
             tp_name=tp_name,
             organism_name=organism_name,
             class_names=clss_to_predict,
-            metacell_mode=bool(metacell_mode),
         )
         self.validation_split = validation_split
         self.test_split = test_split
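A hedged usage sketch of the new DataModule arguments; only the keyword names visible in this diff are certain, and whatever selects the lamindb collection (not shown here) still has to be supplied:

    from scdataloader import DataModule

    dm = DataModule(
        metacell_mode=0.2,   # sample a metacell (kNN average) for ~20% of queries
        get_knn_cells=True,  # also return each cell's nearest neighbors as "knn_cells"
        clss_to_predict=["organism_ontology_term_id"],
        # ... plus the collection / dataloader arguments this diff does not touch
    )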
@@ -96,8 +96,9 @@ class MappedCollection:
        cache_categories: Enable caching categories of ``obs_keys`` for faster access.
        parallel: Enable sampling with multiple processes.
        dtype: Convert numpy arrays from ``.X``, ``.layers`` and ``.obsm``
-       meta_assays: Assays to check for metacells.
-       metacell_mode: Mode for metacells.
+       meta_assays: Assays that are already defined as metacells.
+       metacell_mode: frequency at which to sample a metacell (an average of k-nearest neighbors).
+       get_knn_cells: Whether to also dataload the k-nearest neighbors of each queried cells.
    """
 
    def __init__(
@@ -114,6 +115,7 @@ class MappedCollection:
        parallel: bool = False,
        dtype: str | None = None,
        metacell_mode: float = 0.0,
+       get_knn_cells: bool = False,
        meta_assays: list[str] = ["EFO:0022857", "EFO:0010961"],
    ):
        if join not in {None, "inner", "outer"}: # pragma: nocover
@@ -166,6 +168,7 @@ class MappedCollection:
        self.metacell_mode = metacell_mode
        self.path_list = path_list
        self.meta_assays = meta_assays
+       self.get_knn_cells = get_knn_cells
        self._make_connections(path_list, parallel)
 
        self._cache_cats: dict = {}
@@ -396,12 +399,15 @@ class MappedCollection:
                label_idx = self.encoders[label][label_idx]
            out[label] = label_idx
 
-       out["is_meta"] = False
-       if len(self.meta_assays) > 0 and "assay_ontology_term_id" in self.obs_keys:
-           if out["assay_ontology_term_id"] in self.meta_assays:
-               out["is_meta"] = True
-               return out
        if self.metacell_mode > 0:
+           if (
+               len(self.meta_assays) > 0
+               and "assay_ontology_term_id" in self.obs_keys
+           ):
+               if out["assay_ontology_term_id"] in self.meta_assays:
+                   out["is_meta"] = True
+                   return out
+           out["is_meta"] = False
            if np.random.random() < self.metacell_mode:
                out["is_meta"] = True
                distances = self._get_data_idx(store["obsp"]["distances"], obs_idx)
@@ -410,6 +416,19 @@ class MappedCollection:
                    out[layers_key] += self._get_data_idx(
                        lazy_data, i, self.join_vars, var_idxs_join, self.n_vars
                    )
+       elif self.get_knn_cells:
+           distances = self._get_data_idx(store["obsp"]["distances"], obs_idx)
+           nn_idx = np.argsort(-1 / (distances - 1e-6))[:6]
+           out["knn_cells"] = np.array(
+               [
+                   self._get_data_idx(
+                       lazy_data, i, self.join_vars, var_idxs_join, self.n_vars
+                   )
+                   for i in nn_idx
+               ],
+               dtype=int,
+           )
+           out["distances"] = distances[nn_idx]
 
        return out
 
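The neighbor selection above relies on the sign trick in np.argsort(-1 / (distances - 1e-6))[:6]: in the densified obsp["distances"] row, missing entries are 0 and map to a large positive value, while real neighbors map to roughly -1/d, so ascending argsort lists the closest cells first. A small standalone check:

    import numpy as np

    distances = np.array([0.0, 0.2, 0.0, 0.05, 1.5])  # 0 = not a neighbor
    order = np.argsort(-1 / (distances - 1e-6))
    print(order[:3])  # [3 1 4]: closest neighbors first, zero entries pushed to the end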
@@ -9,7 +9,7 @@ import scanpy as sc
 from anndata import AnnData, read_h5ad
 from scipy.sparse import csr_matrix
 from upath import UPath
-
+import gc
 from scdataloader import utils as data_utils
 
 FULL_LENGTH_ASSAYS = [
@@ -18,7 +18,7 @@ FULL_LENGTH_ASSAYS = [
     "EFO:0008931",
 ]
 
-MAXFILESIZE = 10_000_000_000
+MAXFILESIZE = 5_000_000_000
 
 
 class Preprocessor:
@@ -64,6 +64,11 @@ class Preprocessor:
         """
         Initializes the preprocessor and configures the workflow steps.
 
+        Your dataset should contain at least the following obs:
+        - `organism_ontology_term_id` with the ontology id of the organism of your anndata
+        - gene names in the `var.index` field of your anndata that map to the ensembl_gene nomenclature
+            or the hugo gene symbols nomenclature (if the later, set `is_symbol` to True)
+
         Args:
             filter_gene_by_counts (int or bool, optional): Determines whether to filter genes by counts.
                 If int, filters genes with counts. Defaults to False.
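A minimal sketch of an input that satisfies the documented requirements (toy data and default settings; real datasets need the full set of obs fields expected downstream):

    import anndata as ad
    import numpy as np
    import pandas as pd
    from scdataloader import Preprocessor

    adata = ad.AnnData(
        X=np.random.poisson(1.0, size=(100, 3)).astype(np.float32),  # raw counts
        obs=pd.DataFrame({"organism_ontology_term_id": ["NCBITaxon:9606"] * 100}),
        var=pd.DataFrame(index=["ENSG00000000003", "ENSG00000000005", "ENSG00000000419"]),
    )
    adata = Preprocessor()(adata)  # per the docstring, set is_symbol=True if var.index holds HUGO symbols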
@@ -130,13 +135,21 @@ class Preprocessor:
         self.keepdata = keepdata
 
     def __call__(self, adata, dataset_id=None) -> AnnData:
-        if adata[0].obs.organism_ontology_term_id.iloc[0] not in self.organisms:
+        if self.additional_preprocess is not None:
+            adata = self.additional_preprocess(adata)
+        if "organism_ontology_term_id" not in adata[0].obs.columns:
+            raise ValueError(
+                "organism_ontology_term_id not found in adata.obs, you need to add an ontology term id for the organism of your anndata"
+            )
+        if not adata[0].var.index.str.contains("ENS").any() and not self.is_symbol:
+            raise ValueError(
+                "gene names in the `var.index` field of your anndata should map to the ensembl_gene nomenclature else set `is_symbol` to True if using hugo symbols"
+            )
+        if adata.obs["organism_ontology_term_id"].iloc[0] not in self.organisms:
             raise ValueError(
                 "we cannot work with this organism",
-                adata[0].obs.organism_ontology_term_id.iloc[0],
+                adata.obs["organism_ontology_term_id"],
             )
-        if self.additional_preprocess is not None:
-            adata = self.additional_preprocess(adata)
         if adata.raw is not None and self.use_raw:
             adata.X = adata.raw.X
             del adata.raw
@@ -152,11 +165,12 @@ class Preprocessor:
             del adata.layers
         if len(adata.varm.keys()) > 0 and not self.keepdata:
             del adata.varm
-        if len(adata.obsm.keys()) > 0 and self.do_postp and not self.keepdata:
+        if len(adata.obsm.keys()) > 0 and not self.keepdata:
            del adata.obsm
-        if len(adata.obsp.keys()) > 0 and self.do_postp and not self.keepdata:
+        if len(adata.obsp.keys()) > 0 and not self.keepdata:
            del adata.obsp
         # check that it is a count
+
         print("checking raw counts")
         if np.abs(
             adata[:50_000].X.astype(int) - adata[:50_000].X
@@ -217,23 +231,51 @@ class Preprocessor:
             )
         )
 
-        if self.is_symbol or not adata.var.index.str.contains("ENS").any():
-            if not adata.var.index.str.contains("ENS").any():
-                print("No ENS genes found, assuming gene symbols...")
-            genesdf["ensembl_gene_id"] = genesdf.index
-            var = (
-                adata.var.merge(
-                    genesdf.drop_duplicates("symbol").set_index("symbol", drop=False),
-                    left_index=True,
-                    right_index=True,
-                    how="inner",
-                )
-                .sort_values(by="ensembl_gene_id")
-                .set_index("ensembl_gene_id")
+        # Check if we have a mix of gene names and ensembl IDs
+        has_ens = adata.var.index.str.match(r"ENS.*\d{6,}$").any()
+        all_ens = adata.var.index.str.match(r"ENS.*\d{6,}$").all()
+
+        if not has_ens:
+            print("No ENS genes found, assuming gene symbols...")
+        elif not all_ens:
+            print("Mix of ENS and gene symbols found, converting all to ENS IDs...")
+
+        genesdf["ensembl_gene_id"] = genesdf.index
+
+        # For genes that are already ENS IDs, use them directly
+        ens_mask = adata.var.index.str.match(r"ENS.*\d{6,}$")
+        symbol_mask = ~ens_mask
+
+        # Handle symbol genes
+        if symbol_mask.any():
+            symbol_var = adata.var[symbol_mask].merge(
+                genesdf.drop_duplicates("symbol").set_index("symbol", drop=False),
+                left_index=True,
+                right_index=True,
+                how="inner",
+            )
+
+        # Handle ENS genes
+        if ens_mask.any():
+            ens_var = adata.var[ens_mask].merge(
+                genesdf, left_index=True, right_index=True, how="inner"
             )
-            adata = adata[:, var["symbol"]]
-            adata.var = var
-            genesdf = genesdf.set_index("ensembl_gene_id")
+
+        # Combine and sort
+        if symbol_mask.any() and ens_mask.any():
+            var = pd.concat([symbol_var, ens_var])
+        elif symbol_mask.any():
+            var = symbol_var
+        else:
+            var = ens_var
+
+        adata = adata[:, var.index]
+        var = var.sort_values(by="ensembl_gene_id").set_index("ensembl_gene_id")
+        # Update adata with combined genes
+        adata.var = var
+        genesdf = genesdf.set_index("ensembl_gene_id")
+        # Drop duplicate genes, keeping first occurrence
+        adata = adata[:, ~adata.var.index.duplicated(keep="first")]
 
         intersect_genes = set(adata.var.index).intersection(set(genesdf.index))
         print(f"Removed {len(adata.var.index) - len(intersect_genes)} genes.")
@@ -462,13 +504,17 @@ class LaminPreprocessor(Preprocessor):
            print(file)
 
            path = cache_path(file) if self.force_preloaded else file.cache()
-           backed = read_h5ad(path, backed="r")
-           if backed.obs.is_primary_data.sum() == 0:
-               print(f"{file.key} only contains non primary cells.. dropping")
-               # Save the stem_uid to a file to avoid loading it again
+           backed = file.open()
+           # backed = read_h5ad(path, backed="r")
+           if "is_primary_data" in backed.obs.columns:
+               if backed.obs.is_primary_data.sum() == 0:
+                   print(f"{file.key} only contains non primary cells.. dropping")
+                   # Save the stem_uid to a file to avoid loading it again
                    with open("nonprimary.txt", "a") as f:
                        f.write(f"{file.stem_uid}\n")
                    continue
+           else:
+               print("Warning: couldn't check unicity from is_primary_data column")
            if backed.shape[1] < 1000:
                print(
                    f"{file.key} only contains less than 1000 genes and is likely not scRNAseq... dropping"
@@ -489,16 +535,23 @@ class LaminPreprocessor(Preprocessor):
                block_size = int(
                    (np.ceil(badata.shape[0] / 30_000) * 30_000) // num_blocks
                )
-               print("num blocks ", num_blocks)
+               print(
+                   "num blocks ",
+                   num_blocks,
+                   "block size ",
+                   block_size,
+                   "total elements ",
+                   badata.shape[0],
+               )
                for j in range(num_blocks):
-                   if j == 0 and i == 390:
-                       continue
                    start_index = j * block_size
                    end_index = min((j + 1) * block_size, badata.shape[0])
-                   block = badata[start_index:end_index].to_memory()
+                   block = badata[start_index:end_index]
+                   block = block.to_memory()
                    print(block)
                    block = super().__call__(
-                       block, dataset_id=file.stem_uid + "_p" + str(j)
+                       block,
+                       dataset_id=file.stem_uid + "_p" + str(j),
                    )
                    myfile = ln.Artifact.from_anndata(
                        block,
@@ -508,16 +561,19 @@ class LaminPreprocessor(Preprocessor):
                        + " p"
                        + str(j)
                        + " ( revises file "
-                       + str(file.key)
+                       + str(file.stem_uid)
                        + " )",
                        version=version,
                    )
                    myfile.save()
+
                    if self.keep_files:
                        files.append(myfile)
+                       del block
                    else:
                        del myfile
                        del block
+                   gc.collect()
 
            else:
                adata = super().__call__(adata, dataset_id=file.stem_uid)
@@ -530,6 +586,7 @@ class LaminPreprocessor(Preprocessor):
                myfile.save()
                if self.keep_files:
                    files.append(myfile)
+                   del adata
                else:
                    del myfile
                    del adata
@@ -549,7 +606,12 @@ class LaminPreprocessor(Preprocessor):
 
        # issues with KLlggfw6I6lvmbqiZm46
        if self.keep_files:
-           dataset = ln.Collection(files, name=name, description=description)
+           # Reconstruct collection using keys
+           dataset = ln.Collection(
+               [ln.Artifact.filter(key=k).one() for k in files],
+               name=name,
+               description=description,
+           )
            dataset.save()
            return dataset
        else:
@@ -154,7 +154,7 @@ def getBiomartTable(
     return res
 
 
-def validate(adata: AnnData, organism: str, need_all=True):
+def validate(adata: AnnData, organism: str, need_all=False):
     """
     validate checks if the adata object is valid for lamindb
 
@@ -578,7 +578,6 @@ def load_genes(organisms: Union[str, list] = "NCBITaxon:9606"): # "NCBITaxon:10
 
 
 def populate_my_ontology(
-    organisms: List[str] = ["NCBITaxon:10090", "NCBITaxon:9606"],
     sex: List[str] = ["PATO:0000384", "PATO:0000383"],
     celltypes: List[str] = [],
     ethnicities: List[str] = [],
@@ -586,7 +585,7 @@ def populate_my_ontology(
     tissues: List[str] = [],
     diseases: List[str] = [],
     dev_stages: List[str] = [],
-    organism_clade: str = "vertebrates",
+    organisms_clade: List[str] = ["vertebrates", "plants"],
 ):
     """
     creates a local version of the lamin ontologies and add the required missing values in base ontologies
@@ -622,23 +621,27 @@ def populate_my_ontology(
        ln.save(records)
        bt.CellType(name="unknown", ontology_id="unknown").save()
    # Organism
-   if organisms is not None:
-       names = (
-           bt.Organism.public(organism=organism_clade).df().index
-           if not organisms
-           else organisms
-       )
-       source = bt.PublicSource.filter(name="ensembl", organism=organism_clade).last()
-       records = [
-           organism_or_organismlist
-           if isinstance(organism_or_organismlist, bt.Organism)
-           else organism_or_organismlist[0]
-           for organism_or_organismlist in [
-               bt.Organism.from_source(ontology_id=name, source=source)
-               for name in names
+   if organisms_clade is not None:
+       records = []
+       for organism_clade in organisms_clade:
+           names = bt.Organism.public(organism=organism_clade).df().index
+           source = bt.PublicSource.filter(
+               name="ensembl", organism=organism_clade
+           ).last()
+           records += [
+               bt.Organism.from_source(name=name, source=source) for name in names
            ]
-       ]
-       ln.save(records)
+       nrecords = []
+       prevrec = set()
+       for rec in records:
+           if rec is None:
+               continue
+           if not isinstance(rec, bt.Organism):
+               rec = rec[0]
+           if rec.uid not in prevrec:
+               nrecords.append(rec)
+               prevrec.add(rec.uid)
+       ln.save(nrecords)
        bt.Organism(name="unknown", ontology_id="unknown").save()
    # Phenotype
    if sex is not None:
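A hedged usage sketch of the new signature; the argument value is illustrative and the other ontology lists keep their defaults:

    from scdataloader import utils

    # one Ensembl clade per entry instead of the old single organism_clade string
    utils.populate_my_ontology(organisms_clade=["vertebrates"])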
@@ -1 +0,0 @@
-1.8.0
File without changes
File without changes
File without changes