scdataloader 0.0.2__py3-none-any.whl → 0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,8 +1,7 @@
1
- from typing import Callable, Optional, Union
1
+ from typing import Any, Callable, Optional, Union
2
2
  from uuid import uuid4
3
3
 
4
4
  import anndata as ad
5
- import bionty as bt
6
5
  import lamindb as ln
7
6
  import numpy as np
8
7
  import pandas as pd
@@ -10,6 +9,7 @@ import scanpy as sc
10
9
  from anndata import AnnData
11
10
  from django.db import IntegrityError
12
11
  from scipy.sparse import csr_matrix
12
+ import os
13
13
 
14
14
  from scdataloader import utils as data_utils
15
15
 
@@ -19,6 +19,8 @@ FULL_LENGTH_ASSAYS = [
19
19
  "EFO:0008931",
20
20
  ]
21
21
 
22
+ MAXFILESIZE = 10_000_000_000
23
+
22
24
 
23
25
  class Preprocessor:
24
26
  """
@@ -28,30 +30,31 @@ class Preprocessor:
28
30
 
29
31
  def __init__(
30
32
  self,
31
- lb,
32
33
  filter_gene_by_counts: Union[int, bool] = False,
33
34
  filter_cell_by_counts: Union[int, bool] = False,
34
- normalize_total: Union[float, bool] = False,
35
- log1p: bool = False,
36
- subset_hvg: Union[int, bool] = False,
35
+ normalize_sum: float = 1e4,
36
+ subset_hvg: int = 0,
37
+ use_layer: Optional[str] = None,
38
+ is_symbol: bool = False,
37
39
  hvg_flavor: str = "seurat_v3",
38
40
  binning: Optional[int] = None,
39
41
  result_binned_key: str = "X_binned",
40
42
  length_normalize: bool = False,
43
+ force_preprocess: bool = False,
44
+ min_dataset_size: int = 100,
45
+ min_valid_genes_id: int = 10_000,
46
+ min_nnz_genes: int = 200,
47
+ maxdropamount: int = 50,
48
+ madoutlier: int = 5,
49
+ pct_mt_outlier: int = 8,
50
+ batch_key: Optional[str] = None,
51
+ skip_validate: bool = False,
41
52
  additional_preprocess: Optional[Callable[[AnnData], AnnData]] = None,
42
53
  additional_postprocess: Optional[Callable[[AnnData], AnnData]] = None,
43
- force_preprocess=False,
44
- min_dataset_size=100,
45
- min_valid_genes_id=10_000,
46
- min_nnz_genes=200,
47
- maxdropamount=2,
48
- madoutlier=5,
49
- pct_mt_outlier=8,
50
- batch_key=None,
51
- erase_prev_dataset: bool = False,
52
- cache: bool = True,
53
- stream: bool = False,
54
- ):
54
+ do_postp: bool = True,
55
+ organisms: list[str] = ["NCBITaxon:9606", "NCBITaxon:10090"],
56
+ use_raw: bool = True,
57
+ ) -> None:
55
58
  """
56
59
  Initializes the preprocessor and configures the workflow steps.
57
60
 
@@ -60,7 +63,7 @@ class Preprocessor:
60
63
  If int, filters genes with counts. Defaults to False.
61
64
  filter_cell_by_counts (int or bool, optional): Determines whether to filter cells by counts.
62
65
  If int, filters cells with counts. Defaults to False.
63
- normalize_total (float or bool, optional): Determines whether to normalize the total counts of each cell to a specific value.
66
+ normalize_sum (float or bool, optional): Determines whether to normalize the total counts of each cell to a specific value.
64
67
  Defaults to 1e4.
65
68
  log1p (bool, optional): Determines whether to apply log1p transform to the normalized data.
66
69
  Defaults to True.
@@ -71,19 +74,38 @@ class Preprocessor:
71
74
  binning (int, optional): Determines whether to bin the data into discrete values of number of bins provided.
72
75
  result_binned_key (str, optional): Specifies the key of :class:`~anndata.AnnData` to store the binned data.
73
76
  Defaults to "X_binned".
77
+ length_normalize (bool, optional): Determines whether to length normalize the data.
78
+ Defaults to False.
79
+ force_preprocess (bool, optional): Determines whether to bypass the check of raw counts.
80
+ Defaults to False.
81
+ min_dataset_size (int, optional): The minimum size required for a dataset to be kept.
82
+ Defaults to 100.
83
+ min_valid_genes_id (int, optional): The minimum number of valid genes to keep a dataset.
84
+ Defaults to 10_000.
85
+ min_nnz_genes (int, optional): The minimum number of non-zero genes to keep a cell.
86
+ Defaults to 200.
87
+ maxdropamount (int, optional): The maximum amount of dropped cells per dataset. (2 for 50% drop, 3 for 33% drop, etc.)
88
+ Defaults to 2.
89
+ madoutlier (int, optional): The maximum absolute deviation of the outlier samples.
90
+ Defaults to 5.
91
+ pct_mt_outlier (int, optional): The maximum percentage of mitochondrial genes outlier.
92
+ Defaults to 8.
93
+ batch_key (str, optional): The key of :class:`~anndata.AnnData.obs` to use for batch information.
94
+ This arg is used in the highly variable gene selection step.
95
+ skip_validate (bool, optional): Determines whether to skip the validation step.
96
+ Defaults to False.
74
97
  """
75
98
  self.filter_gene_by_counts = filter_gene_by_counts
76
99
  self.filter_cell_by_counts = filter_cell_by_counts
77
- self.normalize_total = normalize_total
78
- self.log1p = log1p
100
+ self.normalize_sum = normalize_sum
79
101
  self.subset_hvg = subset_hvg
80
102
  self.hvg_flavor = hvg_flavor
81
103
  self.binning = binning
104
+ self.organisms = organisms
82
105
  self.result_binned_key = result_binned_key
83
106
  self.additional_preprocess = additional_preprocess
84
107
  self.additional_postprocess = additional_postprocess
85
108
  self.force_preprocess = force_preprocess
86
- self.lb = lb
87
109
  self.min_dataset_size = min_dataset_size
88
110
  self.min_valid_genes_id = min_valid_genes_id
89
111
  self.min_nnz_genes = min_nnz_genes
@@ -91,124 +113,80 @@ class Preprocessor:
91
113
  self.madoutlier = madoutlier
92
114
  self.pct_mt_outlier = pct_mt_outlier
93
115
  self.batch_key = batch_key
94
- self.erase_prev_dataset = erase_prev_dataset
95
116
  self.length_normalize = length_normalize
96
- self.cache = cache
97
- self.stream = stream
98
-
99
- def __call__(
100
- self,
101
- data: Union[ln.Dataset, AnnData] = None,
102
- name="preprocessed dataset",
103
- description="preprocessed dataset using scprint",
104
- start_at=0,
105
- ):
106
- """
107
- format controls the different input value wrapping, including categorical
108
- binned style, fixed-sum normalized counts, log1p fixed-sum normalized counts, etc.
109
-
110
- Args:
111
- adata (AnnData): The AnnData object to preprocess.
112
- batch_key (str, optional): The key of AnnData.obs to use for batch information. This arg
113
- is used in the highly variable gene selection step.
114
- """
115
- files = []
116
- all_ready_processed_keys = set()
117
- if self.cache:
118
- for i in ln.Artifact.filter(description="preprocessed by scprint"):
119
- all_ready_processed_keys.add(i.initial_version.key)
120
- if isinstance(data, AnnData):
121
- return self.preprocess(data)
122
- elif isinstance(data, ln.Dataset):
123
- for i, file in enumerate(data.artifacts.all()[start_at:]):
124
- # use the counts matrix
125
- print(i)
126
- if file.key in all_ready_processed_keys:
127
- print(f"{file.key} is already processed")
128
- continue
129
- print(file)
130
- if file.backed().obs.is_primary_data.sum() == 0:
131
- print(f"{file.key} only contains non primary cells")
132
- continue
133
- adata = file.load(stream=self.stream)
134
-
135
- print(adata)
136
- try:
137
- adata = self.preprocess(adata)
138
-
139
- except ValueError as v:
140
- if v.args[0].startswith(
141
- "Dataset dropped because contains too many secondary"
142
- ):
143
- print(v)
144
- continue
145
- else:
146
- raise v
147
- try:
148
- file.save()
149
- except IntegrityError as e:
150
- # UNIQUE constraint failed: lnschema_bionty_organism.ontology_id
151
- print(f"seeing {e}... continuing")
152
- myfile = ln.Artifact(
153
- adata,
154
- is_new_version_of=file,
155
- description="preprocessed by scprint",
156
- )
157
- # issues with KLlggfw6I6lvmbqiZm46
158
- myfile.save()
159
- files.append(myfile)
160
- dataset = ln.Dataset(files, name=name, description=description)
161
- dataset.save()
162
- return dataset
163
- else:
164
- raise ValueError("Please provide either anndata or ln.Dataset")
165
-
166
- def preprocess(self, adata: AnnData):
117
+ self.skip_validate = skip_validate
118
+ self.use_layer = use_layer
119
+ self.is_symbol = is_symbol
120
+ self.do_postp = do_postp
121
+ self.use_raw = use_raw
122
+
123
+ def __call__(self, adata) -> AnnData:
124
+ if adata[0].obs.organism_ontology_term_id.iloc[0] not in self.organisms:
125
+ raise ValueError(
126
+ "we cannot work with this organism",
127
+ adata[0].obs.organism_ontology_term_id.iloc[0],
128
+ )
167
129
  if self.additional_preprocess is not None:
168
130
  adata = self.additional_preprocess(adata)
169
- if adata.raw is not None:
131
+ if adata.raw is not None and self.use_raw:
170
132
  adata.X = adata.raw.X
171
133
  del adata.raw
134
+ if self.use_layer is not None:
135
+ adata.X = adata.layers[self.use_layer]
172
136
  if adata.layers is not None:
137
+ if "counts" in adata.layers.keys():
138
+ if np.abs(adata[:50_000].X.astype(int) - adata[:50_000].X).sum():
139
+ print("X was not raw counts, using 'counts' layer")
140
+ adata.X = adata.layers["counts"].copy()
141
+ print("Dropping layers: ", adata.layers.keys())
173
142
  del adata.layers
174
143
  if len(adata.varm.keys()) > 0:
175
144
  del adata.varm
176
- if len(adata.obsm.keys()) > 0:
145
+ if len(adata.obsm.keys()) > 0 and self.do_postp:
177
146
  del adata.obsm
178
- if len(adata.obsp.keys()) > 0:
147
+ if len(adata.obsp.keys()) > 0 and self.do_postp:
179
148
  del adata.obsp
180
149
  if len(adata.uns.keys()) > 0:
181
150
  del adata.uns
182
151
  if len(adata.varp.keys()) > 0:
183
152
  del adata.varp
184
153
  # check that it is a count
185
- if (
186
- int(adata.X[:100].max()) != adata.X[:100].max()
187
- and not self.force_preprocess
188
- ): # check if likely raw data
189
- raise ValueError(
190
- "Data is not raw counts, please check layers, find raw data, or bypass with force_preprocess"
191
- )
154
+ print("checking raw counts")
155
+ if np.abs(
156
+ adata[:50_000].X.astype(int) - adata[:50_000].X
157
+ ).sum(): # check if likely raw data
158
+ if not self.force_preprocess:
159
+ raise ValueError(
160
+ "Data is not raw counts, please check layers, find raw data, or bypass with force_preprocess"
161
+ )
162
+ else:
163
+ print(
164
+ "Data is not raw counts, please check layers, find raw data, or bypass with force_preprocess"
165
+ )
192
166
  # please check layers
193
167
  # if not available count drop
168
+ prevsize = adata.shape[0]
169
+ # dropping non primary
170
+ if "is_primary_data" in adata.obs.columns:
171
+ adata = adata[adata.obs.is_primary_data]
172
+ if adata.shape[0] < self.min_dataset_size:
173
+ raise Exception("Dataset dropped due to too many secondary cells")
174
+ print(
175
+ "removed {} non primary cells, {} renamining".format(
176
+ prevsize - adata.shape[0], adata.shape[0]
177
+ )
178
+ )
194
179
  # # cleanup and dropping low expressed genes and unexpressed cells
195
180
  prevsize = adata.shape[0]
196
181
  adata.obs["nnz"] = np.array(np.sum(adata.X != 0, axis=1).flatten())[0]
197
- adata = adata[
198
- (adata.obs["nnz"] > self.min_nnz_genes)
199
- # or if slide-seq
200
- | (
201
- (adata.obs.assay_ontology_term_id == "EFO:0030062")
202
- & (adata.obs["nnz"] > (self.min_nnz_genes / 3))
203
- )
204
- ]
182
+ adata = adata[(adata.obs["nnz"] > self.min_nnz_genes)]
205
183
  if self.filter_gene_by_counts:
206
184
  sc.pp.filter_genes(adata, min_counts=self.filter_gene_by_counts)
207
185
  if self.filter_cell_by_counts:
208
186
  sc.pp.filter_cells(adata, min_counts=self.filter_cell_by_counts)
209
187
  # if lost > 50% of the dataset, drop dataset
210
188
  # load the genes
211
- genesdf = self.load_genes(adata.obs.organism_ontology_term_id[0])
189
+ genesdf = data_utils.load_genes(adata.obs.organism_ontology_term_id.iloc[0])
212
190
 
213
191
  if prevsize / adata.shape[0] > self.maxdropamount:
214
192
  raise Exception(
@@ -220,19 +198,32 @@ class Preprocessor:
220
198
  "Dataset dropped due to low expressed genes and unexpressed cells: current size: "
221
199
  + str(adata.shape[0])
222
200
  )
223
- # dropping non primary
224
- adata = adata[adata.obs.is_primary_data]
225
- if adata.shape[0] < self.min_dataset_size:
226
- raise ValueError(
227
- "Dataset dropped because contains too many secondary cells"
201
+ print(
202
+ "filtered out {} cells, {} renamining".format(
203
+ prevsize - adata.shape[0], adata.shape[0]
228
204
  )
205
+ )
206
+
207
+ if self.is_symbol:
208
+ genesdf["ensembl_gene_id"] = genesdf.index
209
+ var = (
210
+ adata.var.merge(
211
+ genesdf.drop_duplicates("symbol").set_index("symbol", drop=False),
212
+ left_index=True,
213
+ right_index=True,
214
+ how="inner",
215
+ )
216
+ .sort_values(by="ensembl_gene_id")
217
+ .set_index("ensembl_gene_id")
218
+ )
219
+ adata = adata[:, var["symbol"]]
220
+ adata.var = var
221
+ genesdf = genesdf.set_index("ensembl_gene_id")
229
222
 
230
223
  intersect_genes = set(adata.var.index).intersection(set(genesdf.index))
231
224
  print(f"Removed {len(adata.var.index) - len(intersect_genes)} genes.")
232
225
  if len(intersect_genes) < self.min_valid_genes_id:
233
- raise Exception(
234
- "Dataset dropped due to too many genes not mapping to it"
235
- )
226
+ raise Exception("Dataset dropped due to too many genes not mapping to it")
236
227
  adata = adata[:, list(intersect_genes)]
237
228
  # marking unseen genes
238
229
  unseen = set(genesdf.index) - set(adata.var.index)
@@ -245,60 +236,55 @@ class Preprocessor:
245
236
  adata = ad.concat([adata, emptyda], axis=1, join="outer", merge="only")
246
237
  # do a validation function
247
238
  adata.uns["unseen_genes"] = list(unseen)
248
- data_utils.validate(
249
- adata, self.lb, organism=adata.obs.organism_ontology_term_id[0]
250
- )
251
- # length normalization
252
- if (
253
- adata.obs["assay_ontology_term_id"].isin(FULL_LENGTH_ASSAYS).any()
254
- and self.length_normalize
255
- ):
256
- subadata = data_utils.length_normalize(
257
- adata[
258
- adata.obs["assay_ontology_term_id"].isin(
259
- FULL_LENGTH_ASSAYS
260
- )
261
- ],
262
- )
239
+ if not self.skip_validate:
240
+ print("validating")
241
+ data_utils.validate(adata, organism=adata.obs.organism_ontology_term_id[0])
242
+ # length normalization
243
+ if (
244
+ adata.obs["assay_ontology_term_id"].isin(FULL_LENGTH_ASSAYS).any()
245
+ and self.length_normalize
246
+ ):
247
+ print("doing length norm")
248
+ subadata = data_utils.length_normalize(
249
+ adata[adata.obs["assay_ontology_term_id"].isin(FULL_LENGTH_ASSAYS)],
250
+ )
263
251
 
264
- adata = ad.concat(
265
- [
266
- adata[
267
- ~adata.obs["assay_ontology_term_id"].isin(
268
- FULL_LENGTH_ASSAYS
269
- )
252
+ adata = ad.concat(
253
+ [
254
+ adata[
255
+ ~adata.obs["assay_ontology_term_id"].isin(
256
+ FULL_LENGTH_ASSAYS
257
+ )
258
+ ],
259
+ subadata,
270
260
  ],
271
- subadata,
272
- ],
273
- axis=0,
274
- join="outer",
275
- merge="only",
276
- )
277
- # step 3: normalize total
278
- if self.normalize_total:
279
- sc.pp.normalize_total(adata, target_sum=self.normalize_total)
280
- if self.log1p and not is_log1p(adata):
281
- sc.pp.log1p(adata)
261
+ axis=0,
262
+ join="outer",
263
+ merge="only",
264
+ )
282
265
 
283
266
  # QC
267
+
284
268
  adata.var[genesdf.columns] = genesdf.loc[adata.var.index]
269
+ for name in ["stable_id", "created_at", "updated_at"]:
270
+ if name in adata.var.columns:
271
+ adata.var = adata.var.drop(columns=name)
272
+ print("startin QC")
285
273
  sc.pp.calculate_qc_metrics(
286
274
  adata, qc_vars=["mt", "ribo", "hb"], inplace=True, percent_top=[20]
287
275
  )
288
276
 
289
277
  adata.obs["outlier"] = (
290
278
  data_utils.is_outlier(adata, "total_counts", self.madoutlier)
291
- | data_utils.is_outlier(
292
- adata, "n_genes_by_counts", self.madoutlier
293
- )
279
+ | data_utils.is_outlier(adata, "n_genes_by_counts", self.madoutlier)
294
280
  | data_utils.is_outlier(
295
281
  adata, "pct_counts_in_top_20_genes", self.madoutlier
296
282
  )
297
283
  )
298
284
 
299
- adata.obs["mt_outlier"] = data_utils.is_outlier(
300
- adata, "pct_counts_mt", 3
301
- ) | (adata.obs["pct_counts_mt"] > self.pct_mt_outlier)
285
+ adata.obs["mt_outlier"] = data_utils.is_outlier(adata, "pct_counts_mt", 3) | (
286
+ adata.obs["pct_counts_mt"] > self.pct_mt_outlier
287
+ )
302
288
  total_outliers = (adata.obs["outlier"] | adata.obs["mt_outlier"]).sum()
303
289
  total_cells = adata.shape[0]
304
290
  percentage_outliers = (total_outliers / total_cells) * 100
@@ -309,27 +295,38 @@ class Preprocessor:
309
295
  # raise Exception("More than 50% of the dataset has been dropped due to outliers.")
310
296
  # adata = adata[(~adata.obs.outlier) & (~adata.obs.mt_outlier)].copy()
311
297
  # remaining
312
- # step 5: subset hvg
313
- if self.subset_hvg:
314
- sc.pp.highly_variable_genes(
315
- adata,
316
- n_top_genes=self.subset_hvg,
317
- batch_key=self.batch_key,
318
- flavor=self.hvg_flavor,
319
- subset=True,
320
- )
298
+
321
299
  # based on the topometry paper https://www.biorxiv.org/content/10.1101/2022.03.14.484134v2
322
300
  # https://rapids-singlecell.readthedocs.io/en/latest/api/generated/rapids_singlecell.pp.pca.html#rapids_singlecell.pp.pca
323
- sc.pp.neighbors(
324
- adata, n_pcs=500 if adata.shape[0] > 500 else adata.shape[0] - 2
325
- )
326
- sc.tl.leiden(adata, key_added="leiden_3", resolution=3.0)
327
- sc.tl.leiden(adata, key_added="leiden_2", resolution=2.0)
328
- sc.tl.leiden(adata, key_added="leiden_1", resolution=1.0)
329
- sc.tl.umap(adata)
330
- # additional
331
- if self.additional_postprocess is not None:
332
- adata = self.additional_postprocess(adata)
301
+ if self.do_postp:
302
+ print("normalize")
303
+ adata.layers["clean"] = sc.pp.log1p(
304
+ sc.pp.normalize_total(
305
+ adata, target_sum=self.normalize_sum, inplace=False
306
+ )["X"]
307
+ )
308
+ # step 5: subset hvg
309
+ if self.subset_hvg:
310
+ sc.pp.highly_variable_genes(
311
+ adata,
312
+ layer="clean",
313
+ n_top_genes=self.subset_hvg,
314
+ batch_key=self.batch_key,
315
+ flavor=self.hvg_flavor,
316
+ subset=False,
317
+ )
318
+ adata.obsm["clean_pca"] = sc.pp.pca(
319
+ adata.layers["clean"],
320
+ n_comps=300 if adata.shape[0] > 300 else adata.shape[0] - 2,
321
+ )
322
+ sc.pp.neighbors(adata, use_rep="clean_pca")
323
+ sc.tl.leiden(adata, key_added="leiden_3", resolution=3.0)
324
+ sc.tl.leiden(adata, key_added="leiden_2", resolution=2.0)
325
+ sc.tl.leiden(adata, key_added="leiden_1", resolution=1.0)
326
+ sc.tl.umap(adata)
327
+ # additional
328
+ if self.additional_postprocess is not None:
329
+ adata = self.additional_postprocess(adata)
333
330
  adata = adata[:, adata.var.sort_index().index]
334
331
  # create random ids for all cells
335
332
  adata.obs.index = [str(uuid4()) for _ in range(adata.shape[0])]
@@ -341,9 +338,7 @@ class Preprocessor:
341
338
  print("Binning data ...")
342
339
  if not isinstance(self.binning, int):
343
340
  raise ValueError(
344
- "Binning arg must be an integer, but got {}.".format(
345
- self.binning
346
- )
341
+ "Binning arg must be an integer, but got {}.".format(self.binning)
347
342
  )
348
343
  # NOTE: the first bin is always a spectial for zero
349
344
  n_bins = self.binning
@@ -379,23 +374,140 @@ class Preprocessor:
379
374
  bin_edges.append(np.concatenate([[0], bins]))
380
375
  adata.layers[self.result_binned_key] = np.stack(binned_rows)
381
376
  adata.obsm["bin_edges"] = np.stack(bin_edges)
377
+ print("done")
382
378
  return adata
383
379
 
384
- def load_genes(self, organism):
385
- genesdf = bt.Gene(
386
- organism=self.lb.Organism.filter(ontology_id=organism).first().name
387
- ).df()
388
- genesdf = genesdf.drop_duplicates(subset="ensembl_gene_id")
389
- genesdf = genesdf.set_index("ensembl_gene_id")
390
- # mitochondrial genes
391
- genesdf["mt"] = genesdf.symbol.astype(str).str.startswith("MT-")
392
- # ribosomal genes
393
- genesdf["ribo"] = genesdf.symbol.astype(str).str.startswith(
394
- ("RPS", "RPL")
395
- )
396
- # hemoglobin genes.
397
- genesdf["hb"] = genesdf.symbol.astype(str).str.contains(("^HB[^(P)]"))
398
- return genesdf
380
+
381
+ class LaminPreprocessor(Preprocessor):
382
+ def __init__(
383
+ self,
384
+ *args,
385
+ erase_prev_dataset: bool = False,
386
+ cache: bool = True,
387
+ stream: bool = False,
388
+ keep_files: bool = True,
389
+ **kwargs,
390
+ ):
391
+ super().__init__(*args, **kwargs)
392
+ self.erase_prev_dataset = erase_prev_dataset
393
+ self.cache = cache
394
+ self.stream = stream
395
+ self.keep_files = keep_files
396
+
397
+ def __call__(
398
+ self,
399
+ data: Union[ln.Collection, AnnData] = None,
400
+ name="preprocessed dataset",
401
+ description="preprocessed dataset using scprint",
402
+ start_at=0,
403
+ version=2,
404
+ ):
405
+ """
406
+ format controls the different input value wrapping, including categorical
407
+ binned style, fixed-sum normalized counts, log1p fixed-sum normalized counts, etc.
408
+
409
+ Args:
410
+ adata (AnnData): The AnnData object to preprocess.
411
+ batch_key (str, optional): The key of AnnData.obs to use for batch information. This arg
412
+ is used in the highly variable gene selection step.
413
+ """
414
+ files = []
415
+ all_ready_processed_keys = set()
416
+ if self.cache:
417
+ for i in ln.Artifact.filter(description=description):
418
+ all_ready_processed_keys.add(i.stem_uid)
419
+ if isinstance(data, AnnData):
420
+ return super().__call__(data)
421
+ elif isinstance(data, ln.Collection):
422
+ for i, file in enumerate(data.artifacts.all()[start_at:]):
423
+ # use the counts matrix
424
+ print(i)
425
+ if file.stem_uid in all_ready_processed_keys:
426
+ print(f"{file.stem_uid} is already processed... not preprocessing")
427
+ continue
428
+ print(file)
429
+ backed = file.backed()
430
+ if backed.obs.is_primary_data.sum() == 0:
431
+ print(f"{file.key} only contains non primary cells.. dropping")
432
+ continue
433
+ if backed.shape[1] < 1000:
434
+ print(
435
+ f"{file.key} only contains less than 1000 genes and is likely not scRNAseq... dropping"
436
+ )
437
+ continue
438
+ if file.size <= MAXFILESIZE:
439
+ adata = file.load(stream=self.stream)
440
+ print(adata)
441
+ else:
442
+ badata = backed
443
+ print(badata)
444
+
445
+ try:
446
+ if file.size > MAXFILESIZE:
447
+ print(
448
+ f"dividing the dataset as it is too large: {file.size//1_000_000_000}Gb"
449
+ )
450
+ num_blocks = int(np.ceil(file.size / (MAXFILESIZE / 2)))
451
+ block_size = int(
452
+ (np.ceil(badata.shape[0] / 30_000) * 30_000) // num_blocks
453
+ )
454
+ print("num blocks ", num_blocks)
455
+ for i in range(num_blocks):
456
+ start_index = i * block_size
457
+ end_index = min((i + 1) * block_size, badata.shape[0])
458
+ block = badata[start_index:end_index].to_memory()
459
+ print(block)
460
+ block = super().__call__(block)
461
+ myfile = ln.Artifact(
462
+ block,
463
+ is_new_version_of=file,
464
+ description=description,
465
+ version=str(version) + "_s" + str(i),
466
+ )
467
+ myfile.save()
468
+ if self.keep_files:
469
+ files.append(myfile)
470
+ else:
471
+ del myfile
472
+ del block
473
+
474
+ else:
475
+ adata = super().__call__(adata)
476
+ myfile = ln.Artifact(
477
+ adata,
478
+ is_new_version_of=file,
479
+ description=description,
480
+ version=str(version),
481
+ )
482
+ myfile.save()
483
+ if self.keep_files:
484
+ files.append(myfile)
485
+ else:
486
+ del myfile
487
+ del adata
488
+
489
+ except ValueError as v:
490
+ if v.args[0].startswith("we cannot work with this organism"):
491
+ print(v)
492
+ continue
493
+ else:
494
+ raise v
495
+ except Exception as e:
496
+ if e.args[0].startswith("Dataset dropped due to"):
497
+ print(e)
498
+ continue
499
+ else:
500
+ raise e
501
+
502
+ # issues with KLlggfw6I6lvmbqiZm46
503
+ if self.keep_files:
504
+ dataset = ln.Collection(files, name=name, description=description)
505
+ dataset.save()
506
+ return dataset
507
+ else:
508
+ return
509
+ else:
510
+ raise ValueError("Please provide either anndata or ln.Collection")
399
511
 
400
512
 
401
513
  def is_log1p(adata: AnnData) -> bool:
@@ -494,7 +606,7 @@ def additional_preprocess(adata):
494
606
  adata.obs["cell_culture"] = False
495
607
  # if cell_type contains the word "(cell culture)" then it is a cell culture and we mark it as so and remove this from the cell type
496
608
  loc = adata.obs["cell_type_ontology_term_id"].str.contains(
497
- "(cell culture)"
609
+ "(cell culture)", regex=False
498
610
  )
499
611
  if loc.sum() > 0:
500
612
  adata.obs["cell_type_ontology_term_id"] = adata.obs[
@@ -505,7 +617,9 @@ def additional_preprocess(adata):
505
617
  loc, "cell_type_ontology_term_id"
506
618
  ].str.replace(" (cell culture)", "")
507
619
 
508
- loc = adata.obs["tissue_ontology_term_id"].str.contains("(cell culture)")
620
+ loc = adata.obs["tissue_ontology_term_id"].str.contains(
621
+ "(cell culture)", regex=False
622
+ )
509
623
  if loc.sum() > 0:
510
624
  adata.obs.loc[loc, "cell_culture"] = True
511
625
  adata.obs["tissue_ontology_term_id"] = adata.obs[
@@ -513,9 +627,9 @@ def additional_preprocess(adata):
513
627
  ].astype(str)
514
628
  adata.obs.loc[loc, "tissue_ontology_term_id"] = adata.obs.loc[
515
629
  loc, "tissue_ontology_term_id"
516
- ].str.replace(r" \(cell culture\)", "")
630
+ ].str.replace(" (cell culture)", "")
517
631
 
518
- loc = adata.obs["tissue_ontology_term_id"].str.contains("(organoid)")
632
+ loc = adata.obs["tissue_ontology_term_id"].str.contains("(organoid)", regex=False)
519
633
  if loc.sum() > 0:
520
634
  adata.obs.loc[loc, "cell_culture"] = True
521
635
  adata.obs["tissue_ontology_term_id"] = adata.obs[
@@ -523,9 +637,9 @@ def additional_preprocess(adata):
523
637
  ].astype(str)
524
638
  adata.obs.loc[loc, "tissue_ontology_term_id"] = adata.obs.loc[
525
639
  loc, "tissue_ontology_term_id"
526
- ].str.replace(r" \(organoid\)", "")
640
+ ].str.replace(" (organoid)", "")
527
641
 
528
- loc = adata.obs["tissue_ontology_term_id"].str.contains("CL:")
642
+ loc = adata.obs["tissue_ontology_term_id"].str.contains("CL:", regex=False)
529
643
  if loc.sum() > 0:
530
644
  adata.obs["tissue_ontology_term_id"] = adata.obs[
531
645
  "tissue_ontology_term_id"
@@ -553,12 +667,8 @@ def additional_postprocess(adata):
553
667
  ) # + "_" + adata.obs['dataset_id'].astype(str)
554
668
 
555
669
  # if group is too small
556
- okgroup = [
557
- i for i, j in adata.obs["dpt_group"].value_counts().items() if j >= 10
558
- ]
559
- not_okgroup = [
560
- i for i, j in adata.obs["dpt_group"].value_counts().items() if j < 3
561
- ]
670
+ okgroup = [i for i, j in adata.obs["dpt_group"].value_counts().items() if j >= 10]
671
+ not_okgroup = [i for i, j in adata.obs["dpt_group"].value_counts().items() if j < 3]
562
672
  # set the group to empty
563
673
  adata.obs.loc[adata.obs["dpt_group"].isin(not_okgroup), "dpt_group"] = ""
564
674
  adata.obs["heat_diff"] = np.nan
@@ -582,3 +692,209 @@ def additional_postprocess(adata):
582
692
  # to query N next time points we just get the N elements below and check they are in the group
583
693
  # to query the N nearest neighbors we just get the N elements above and N below and check they are in the group
584
694
  return adata
695
+
696
+
697
+ """
698
+ sexr = {
699
+ "Male": "PATO:0000384",
700
+ "Female": "PATO:0000383",
701
+ }
702
+ tissuer = {
703
+ "Kidney": "UBERON:0002113",
704
+ "Lung": "UBERON:0002048",
705
+ "Heart": "UBERON:0000948",
706
+ "Liver": "UBERON:0002107",
707
+ "Brain": "UBERON:0000955",
708
+ "BAT": "UBERON:0001348",
709
+ "Jejunum": "UBERON:0002115",
710
+ "Colon": "UBERON:0001155",
711
+ "Ileum": "UBERON:0002116",
712
+ "Stomach": "UBERON:0000945",
713
+ "gWAT": "UBERON:0001347",
714
+ "Duodenum": "UBERON:0002114",
715
+ "iWAT": "UBERON:0001347",
716
+ "Muscle": "UBERON:0001630",
717
+ }
718
+ ager = {
719
+ "03_months": "MmusDv:0000063",
720
+ "16_months": "MmusDv:0000087",
721
+ "06_months": "MmusDv:0000077",
722
+ "23_months": "MmusDv:0000127",
723
+ "12_months": "MmusDv:0000083",
724
+ "21_months": "MmusDv:0000125",
725
+ }
726
+
727
+ celltyper = {
728
+ "Proximal tubule cells": "epithelial cell of proximal tubule",
729
+ "Vascular endothelial cells": "endothelial cell of vascular tree",
730
+ "Intestinal epithelial cells": "intestinal epithelial cell",
731
+ "Hepatocytes": "hepatocyte",
732
+ "Fibroblasts": "fibroblast",
733
+ "Lymphoid cells_T cells": "T cell",
734
+ "Myeloid cells": "myeloid cell",
735
+ "Brown adipocytes": "brown fat cell",
736
+ "Lymphoid cells_B cells": "B cell",
737
+ "Adipocytes": "fat cell",
738
+ "Type II alveolar epithelial cells": "type II pneumocyte",
739
+ "Colonic epithelial cells": "colon epithelial cell",
740
+ "Mural cells": "mural cell",
741
+ "Cerebellum granule neurons": "cerebellar neuron",
742
+ "Goblet cells": "goblet cell",
743
+ "Vascular endothelial cells_General capillary cells": "endothelial cell of vascular tree",
744
+ "Ventricular cardiomyocytes": "regular ventricular cardiac myocyte",
745
+ "Type II myonuclei": "type II muscle cell",
746
+ "Thick ascending limb of LOH cells": "vasa recta ascending limb cell",
747
+ "Gastric mucous cells": "mucous cell of stomach",
748
+ "Distal convoluted tubule cells": "kidney distal convoluted tubule epithelial cell",
749
+ "Adipoce stem and progenitor cells": "hepatic oval stem cell",
750
+ "Chief cells": "chief cell of parathyroid gland",
751
+ "Paneth cells": "paneth cell",
752
+ "Myeloid cells_Alveolar macrophages": "alveolar macrophage",
753
+ "Lymphoid cells_Plasma cells": "plasma cell",
754
+ "Secretory cells": "secretory cell",
755
+ "Lymphoid cells_Resting B cells": "B cell",
756
+ "Cortical projection neurons 1": "corticothalamic-projecting glutamatergic cortical neuron",
757
+ "Endocardial endothelial cells": "endocardial cell",
758
+ "Type I alveolar epithelial cells": "type I pneumocyte",
759
+ "Interbrain and midbrain neurons 1": "midbrain dopaminergic neuron",
760
+ "Interbrain and midbrain neurons 2": "midbrain dopaminergic neuron",
761
+ "Myeloid cells_Monocytes": "monocyte",
762
+ "Myeloid cells_Dendritic cells": "myeloid dendritic cell",
763
+ "Oligodendrocytes": "oligodendrocyte",
764
+ "Lymphatic endothelial cells": "endothelial cell of lymphatic vessel",
765
+ "Enteroendocrine cells": "enteroendocrine cell",
766
+ "Vascular endothelial cells_Aerocytes": "endothelial cell of vascular tree",
767
+ "Gastric epithelial cells": "epithelial cell of stomach",
768
+ "Fibro–adipogenic progenitors": "fibro/adipogenic progenitor cell",
769
+ "Parietal cells": "parietal cell",
770
+ "Astrocytes": "astrocyte",
771
+ "Connecting tubule cells": "kidney connecting tubule beta-intercalated cell",
772
+ "Hepatic stellate cells": "hepatic stellate cell",
773
+ "Striatal neurons 1": "striatum neuron",
774
+ "Mesothelial cells": "mesothelial cell",
775
+ "Lymphoid cells_Cycling B cells": "germinal center B cell",
776
+ "Type B intercalated cells": "renal beta-intercalated cell",
777
+ "Type A intercalated cells": "renal alpha-intercalated cell",
778
+ "Myeloid cells_Neutrophils": "neutrophil",
779
+ "Principal cells": "renal principal cell",
780
+ "Cortical projection neurons 2": "corticothalamic-projecting glutamatergic cortical neuron",
781
+ "Muc2-producing goblet cells": "intestine goblet cell",
782
+ "OB neurons 1": "olfactory bulb interneuron",
783
+ "Atrial cardiomyocytes": "regular atrial cardiac myocyte",
784
+ "Lymphoid cells": "leukocyte",
785
+ "Skeletal muscle cells": "cell of skeletal muscle",
786
+ "Neural cells": "neural cell",
787
+ "Cerebellum interneurons": "cerebellar neuron",
788
+ "Interneurons 1": "interneuron",
789
+ "Descending thin limb of LOH cells": "vasa recta descending limb cell",
790
+ "Tuft cells": "intestinal tuft cell",
791
+ "Oligodendrocyte progenitor cells": "oligodendrocyte precursor cell",
792
+ "Enteric glia": "enteroglial cell",
793
+ "Endothelial cells": "endothelial cell",
794
+ "Dentate gyrus neurons": "dentate gyrus neuron",
795
+ "Myeloid cells_Interstitial macrophages": "tissue-resident macrophage",
796
+ "Ciliated cells": "ciliated cell",
797
+ "Microglia": "microglial cell",
798
+ "Interneurons 2": "interneuron",
799
+ "Ncam1 positive cells": "parafollicular cell",
800
+ "Rdh16 positive cells": "unknown",
801
+ "Circulating hepatoblasts": "hepatoblast",
802
+ "Enteric neurons": "enteric neuron",
803
+ "Ascending thin limb of LOH cells": "vasa recta ascending limb cell",
804
+ "Mfge8 positive cells": "unknown",
805
+ "Cholangiocytes": "cholangiocyte",
806
+ "Podocytes": "podocyte",
807
+ "Muscle satellite cells": "skeletal muscle satellite cell",
808
+ "Purkinje neurons": "Purkinje cell",
809
+ "Juxtaglomerular cells": "juxtaglomerular complex cell",
810
+ "Ngf positive cells": "neurogliaform cell",
811
+ "Bergmann glia": "Bergmann glial cell",
812
+ "Megf11 positive cells": "unknown",
813
+ "Myotendinous junction myonuclei": "unknown",
814
+ "Vascular leptomeningeal cells": "vascular leptomeningeal cell",
815
+ "Urothelial cells": "urothelial cell",
816
+ "Tenocytes": "tendon cell",
817
+ "Myelinating Schwann cells": "myelinating Schwann cell",
818
+ "Epididymal cells": "epididymis glandular cell",
819
+ "Muc6-producing goblet cells": "lung goblet cell",
820
+ "Type I myonuclei": "type I muscle cell",
821
+ "OB neurons 2": "olfactory bulb interneuron",
822
+ "Sis positive cells": "unknown",
823
+ "Lgr5 positive cells": "unknown",
824
+ "Macula densa cells": "macula densa epithelial cell",
825
+ "Choroid plexus epithelial cells": "choroid plexus epithelial cell",
826
+ "Cortical projection neurons 3": "corticothalamic-projecting glutamatergic cortical neuron",
827
+ "Interstitial cells of Cajal": "interstitial cell of Cajal",
828
+ "Cacna1b positive cells": "unknown",
829
+ "Hindbrain neurons 2": "neuron",
830
+ "Myeloid cells_Basophils": "basophil",
831
+ "Ependymal cells": "ependymal cell",
832
+ "Muc5ac-producing goblet cells": "lung goblet cell",
833
+ "Myeloid cells_Mast cells": "mast cell",
834
+ "Pulmonary neuroendocrine cells": "lung neuroendocrine cell",
835
+ "Basal cells": "basal cell",
836
+ "OB neurons 3": "olfactory bulb interneuron",
837
+ "Non-myelinating Schwann cells": "non-myelinating Schwann cell",
838
+ "Asic2 positive cells": "unknown",
839
+ "Striatal neurons 2": "striatum neuron",
840
+ "Erythroblasts": "erythroblast",
841
+ "Hindbrain neurons 1": "neuron",
842
+ "Neuromuscular junction myonuclei": "unknown",
843
+ "Habenula neurons": "unknown",
844
+ "Pituitary cells": "pituitary gland cell",
845
+ "Unipolar brush cells": "unipolar brush cell",
846
+ "Pde4c positive cells": "unknown",
847
+ "Pancreatic acinar cells": "pancreatic acinar cell",
848
+ "Inferior olivary nucleus neurons": "bushy cell",
849
+ "Colec10 positive cells": "unknown",
850
+ "Fcgbp positive cells": "unknown",
851
+ "Fut9 positive cells": "unknown",
852
+ "Mirg positive cells": "unknown",
853
+ "Alox15 positive cells": "unknown",
854
+ "Osteoblasts": "osteoblast",
855
+ }
856
+ genesdf = utils.load_genes("NCBITaxon:10090")
857
+ {k: v if v =="unknown" else bt.CellType.filter(name=v).one().ontology_id for k, v in celltyper.items()}
858
+
859
+ adata.obs["organism_ontology_term_id"] = "NCBITaxon:10090"
860
+ adata.obs["tissue_ontology_term_id"] = adata.obs["Organ_name"].replace(tissuer)
861
+ adata.obs["cell_type_ontology_term_id"] = adata.obs["Main_cell_type"].replace(
862
+ celltyper
863
+ )
864
+ adata.obs["disease_ontology_term_id"] = "PATO:0000461"
865
+ adata.obs["assay_ontology_term_id"] = "unknown"
866
+ adata.obs["self_reported_ethnicity_ontology_term_id"] = "unknown"
867
+ adata.obs["development_stage_ontology_term_id"] = adata.obs["Age_group"].replace(
868
+ ager
869
+ )
870
+ adata.obs["sex_ontology_term_id"] = adata.obs["Gender"].replace(sexr)
871
+
872
+ for i in range(num_blocks):
873
+ start_index = i * block_size
874
+ end_index = min((i + 1) * block_size, len(adata))
875
+ block = adata[start_index:end_index].to_memory()
876
+ # process block here
877
+
878
+ block = block[(block.obs["Gene_count"] > 400)]
879
+
880
+ intersect_genes = set(block.var.index).intersection(set(genesdf.index))
881
+ print(f"Removed {len(block.var.index) - len(intersect_genes)} genes.")
882
+ block = block[:, list(intersect_genes)]
883
+ # marking unseen genes
884
+ unseen = set(genesdf.index) - set(block.var.index)
885
+ # adding them to adata
886
+ emptyda = ad.AnnData(
887
+ csr_matrix((block.shape[0], len(unseen)), dtype=np.float32),
888
+ var=pd.DataFrame(index=list(unseen)),
889
+ obs=pd.DataFrame(index=block.obs.index),
890
+ )
891
+ block = ad.concat([block, emptyda], axis=1, join="outer", merge="only")
892
+ # do a validation function
893
+ block.uns["unseen_genes"] = list(unseen)
894
+ block = block[:, block.var.sort_index().index]
895
+ block.var[genesdf.columns] = genesdf.loc[block.var.index]
896
+ for name in ["stable_id", "created_at", "updated_at"]:
897
+ if name in block.var.columns:
898
+ block.var = block.var.drop(columns=name)
899
+ block.write_h5ad('zhang2024_adata_'+str(i)+".h5ad")
900
+ """