scdataloader 0.0.3__py3-none-any.whl → 0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,6 +9,7 @@ import scanpy as sc
9
9
  from anndata import AnnData
10
10
  from django.db import IntegrityError
11
11
  from scipy.sparse import csr_matrix
12
+ import os
12
13
 
13
14
  from scdataloader import utils as data_utils
14
15
 
@@ -18,6 +19,8 @@ FULL_LENGTH_ASSAYS = [
18
19
  "EFO:0008931",
19
20
  ]
20
21
 
22
+ MAXFILESIZE = 10_000_000_000
23
+
21
24
 
22
25
  class Preprocessor:
23
26
  """
@@ -30,23 +33,27 @@ class Preprocessor:
30
33
  filter_gene_by_counts: Union[int, bool] = False,
31
34
  filter_cell_by_counts: Union[int, bool] = False,
32
35
  normalize_sum: float = 1e4,
33
- keep_norm_layer: bool = False,
34
36
  subset_hvg: int = 0,
37
+ use_layer: Optional[str] = None,
38
+ is_symbol: bool = False,
35
39
  hvg_flavor: str = "seurat_v3",
36
40
  binning: Optional[int] = None,
37
41
  result_binned_key: str = "X_binned",
38
42
  length_normalize: bool = False,
39
- force_preprocess=False,
40
- min_dataset_size=100,
41
- min_valid_genes_id=10_000,
42
- min_nnz_genes=200,
43
- maxdropamount=2,
44
- madoutlier=5,
45
- pct_mt_outlier=8,
46
- batch_key=None,
47
- skip_validate=False,
43
+ force_preprocess: bool = False,
44
+ min_dataset_size: int = 100,
45
+ min_valid_genes_id: int = 10_000,
46
+ min_nnz_genes: int = 200,
47
+ maxdropamount: int = 50,
48
+ madoutlier: int = 5,
49
+ pct_mt_outlier: int = 8,
50
+ batch_key: Optional[str] = None,
51
+ skip_validate: bool = False,
48
52
  additional_preprocess: Optional[Callable[[AnnData], AnnData]] = None,
49
53
  additional_postprocess: Optional[Callable[[AnnData], AnnData]] = None,
54
+ do_postp: bool = True,
55
+ organisms: list[str] = ["NCBITaxon:9606", "NCBITaxon:10090"],
56
+ use_raw: bool = True,
50
57
  ) -> None:
51
58
  """
52
59
  Initializes the preprocessor and configures the workflow steps.
@@ -67,14 +74,34 @@ class Preprocessor:
67
74
  binning (int, optional): Determines whether to bin the data into discrete values of number of bins provided.
68
75
  result_binned_key (str, optional): Specifies the key of :class:`~anndata.AnnData` to store the binned data.
69
76
  Defaults to "X_binned".
77
+ length_normalize (bool, optional): Determines whether to length normalize the data.
78
+ Defaults to False.
79
+ force_preprocess (bool, optional): Determines whether to bypass the check of raw counts.
80
+ Defaults to False.
81
+ min_dataset_size (int, optional): The minimum size required for a dataset to be kept.
82
+ Defaults to 100.
83
+ min_valid_genes_id (int, optional): The minimum number of valid genes to keep a dataset.
84
+ Defaults to 10_000.
85
+ min_nnz_genes (int, optional): The minimum number of non-zero genes to keep a cell.
86
+ Defaults to 200.
87
+ maxdropamount (int, optional): The maximum amount of dropped cells per dataset. (2 for 50% drop, 3 for 33% drop, etc.)
88
+ Defaults to 50.
89
+ madoutlier (int, optional): The maximum absolute deviation of the outlier samples.
90
+ Defaults to 5.
91
+ pct_mt_outlier (int, optional): The maximum percentage of mitochondrial genes outlier.
92
+ Defaults to 8.
93
+ batch_key (str, optional): The key of :class:`~anndata.AnnData.obs` to use for batch information.
94
+ This arg is used in the highly variable gene selection step.
95
+ skip_validate (bool, optional): Determines whether to skip the validation step.
96
+ Defaults to False.
70
97
  """
71
98
  self.filter_gene_by_counts = filter_gene_by_counts
72
99
  self.filter_cell_by_counts = filter_cell_by_counts
73
100
  self.normalize_sum = normalize_sum
74
- self.keep_norm_layer = keep_norm_layer
75
101
  self.subset_hvg = subset_hvg
76
102
  self.hvg_flavor = hvg_flavor
77
103
  self.binning = binning
104
+ self.organisms = organisms
78
105
  self.result_binned_key = result_binned_key
79
106
  self.additional_preprocess = additional_preprocess
80
107
  self.additional_postprocess = additional_postprocess
@@ -88,45 +115,71 @@ class Preprocessor:
88
115
  self.batch_key = batch_key
89
116
  self.length_normalize = length_normalize
90
117
  self.skip_validate = skip_validate
118
+ self.use_layer = use_layer
119
+ self.is_symbol = is_symbol
120
+ self.do_postp = do_postp
121
+ self.use_raw = use_raw
91
122
 
92
123
  def __call__(self, adata) -> AnnData:
124
+ if adata[0].obs.organism_ontology_term_id.iloc[0] not in self.organisms:
125
+ raise ValueError(
126
+ "we cannot work with this organism",
127
+ adata[0].obs.organism_ontology_term_id.iloc[0],
128
+ )
93
129
  if self.additional_preprocess is not None:
94
130
  adata = self.additional_preprocess(adata)
95
- if adata.raw is not None:
131
+ if adata.raw is not None and self.use_raw:
96
132
  adata.X = adata.raw.X
97
133
  del adata.raw
134
+ if self.use_layer is not None:
135
+ adata.X = adata.layers[self.use_layer]
98
136
  if adata.layers is not None:
137
+ if "counts" in adata.layers.keys():
138
+ if np.abs(adata[:50_000].X.astype(int) - adata[:50_000].X).sum():
139
+ print("X was not raw counts, using 'counts' layer")
140
+ adata.X = adata.layers["counts"].copy()
141
+ print("Dropping layers: ", adata.layers.keys())
99
142
  del adata.layers
100
143
  if len(adata.varm.keys()) > 0:
101
144
  del adata.varm
102
- if len(adata.obsm.keys()) > 0:
145
+ if len(adata.obsm.keys()) > 0 and self.do_postp:
103
146
  del adata.obsm
104
- if len(adata.obsp.keys()) > 0:
147
+ if len(adata.obsp.keys()) > 0 and self.do_postp:
105
148
  del adata.obsp
106
149
  if len(adata.uns.keys()) > 0:
107
150
  del adata.uns
108
151
  if len(adata.varp.keys()) > 0:
109
152
  del adata.varp
110
153
  # check that it is a count
111
- if (
112
- np.abs(adata.X.astype(int) - adata.X).sum() and not self.force_preprocess
113
- ): # check if likely raw data
114
- raise ValueError(
115
- "Data is not raw counts, please check layers, find raw data, or bypass with force_preprocess"
116
- )
154
+ print("checking raw counts")
155
+ if np.abs(
156
+ adata[:50_000].X.astype(int) - adata[:50_000].X
157
+ ).sum(): # check if likely raw data
158
+ if not self.force_preprocess:
159
+ raise ValueError(
160
+ "Data is not raw counts, please check layers, find raw data, or bypass with force_preprocess"
161
+ )
162
+ else:
163
+ print(
164
+ "Data is not raw counts, please check layers, find raw data, or bypass with force_preprocess"
165
+ )
117
166
  # please check layers
118
167
  # if not available count drop
168
+ prevsize = adata.shape[0]
169
+ # dropping non primary
170
+ if "is_primary_data" in adata.obs.columns:
171
+ adata = adata[adata.obs.is_primary_data]
172
+ if adata.shape[0] < self.min_dataset_size:
173
+ raise Exception("Dataset dropped due to too many secondary cells")
174
+ print(
175
+ "removed {} non primary cells, {} renamining".format(
176
+ prevsize - adata.shape[0], adata.shape[0]
177
+ )
178
+ )
119
179
  # # cleanup and dropping low expressed genes and unexpressed cells
120
180
  prevsize = adata.shape[0]
121
181
  adata.obs["nnz"] = np.array(np.sum(adata.X != 0, axis=1).flatten())[0]
122
- adata = adata[
123
- (adata.obs["nnz"] > self.min_nnz_genes)
124
- # or if slide-seq
125
- | (
126
- (adata.obs.assay_ontology_term_id == "EFO:0030062")
127
- & (adata.obs["nnz"] > (self.min_nnz_genes / 3))
128
- )
129
- ]
182
+ adata = adata[(adata.obs["nnz"] > self.min_nnz_genes)]
130
183
  if self.filter_gene_by_counts:
131
184
  sc.pp.filter_genes(adata, min_counts=self.filter_gene_by_counts)
132
185
  if self.filter_cell_by_counts:
@@ -145,12 +198,27 @@ class Preprocessor:
145
198
  "Dataset dropped due to low expressed genes and unexpressed cells: current size: "
146
199
  + str(adata.shape[0])
147
200
  )
148
- # dropping non primary
149
- adata = adata[adata.obs.is_primary_data]
150
- if adata.shape[0] < self.min_dataset_size:
151
- raise ValueError(
152
- "Dataset dropped because contains too many secondary cells"
201
+ print(
202
+ "filtered out {} cells, {} renamining".format(
203
+ prevsize - adata.shape[0], adata.shape[0]
204
+ )
205
+ )
206
+
207
+ if self.is_symbol:
208
+ genesdf["ensembl_gene_id"] = genesdf.index
209
+ var = (
210
+ adata.var.merge(
211
+ genesdf.drop_duplicates("symbol").set_index("symbol", drop=False),
212
+ left_index=True,
213
+ right_index=True,
214
+ how="inner",
215
+ )
216
+ .sort_values(by="ensembl_gene_id")
217
+ .set_index("ensembl_gene_id")
153
218
  )
219
+ adata = adata[:, var["symbol"]]
220
+ adata.var = var
221
+ genesdf = genesdf.set_index("ensembl_gene_id")
154
222
 
155
223
  intersect_genes = set(adata.var.index).intersection(set(genesdf.index))
156
224
  print(f"Removed {len(adata.var.index) - len(intersect_genes)} genes.")
@@ -169,36 +237,39 @@ class Preprocessor:
169
237
  # do a validation function
170
238
  adata.uns["unseen_genes"] = list(unseen)
171
239
  if not self.skip_validate:
240
+ print("validating")
172
241
  data_utils.validate(adata, organism=adata.obs.organism_ontology_term_id[0])
173
- # length normalization
174
- if (
175
- adata.obs["assay_ontology_term_id"].isin(FULL_LENGTH_ASSAYS).any()
176
- and self.length_normalize
177
- ):
178
- subadata = data_utils.length_normalize(
179
- adata[adata.obs["assay_ontology_term_id"].isin(FULL_LENGTH_ASSAYS)],
180
- )
242
+ # length normalization
243
+ if (
244
+ adata.obs["assay_ontology_term_id"].isin(FULL_LENGTH_ASSAYS).any()
245
+ and self.length_normalize
246
+ ):
247
+ print("doing length norm")
248
+ subadata = data_utils.length_normalize(
249
+ adata[adata.obs["assay_ontology_term_id"].isin(FULL_LENGTH_ASSAYS)],
250
+ )
181
251
 
182
- adata = ad.concat(
183
- [
184
- adata[
185
- ~adata.obs["assay_ontology_term_id"].isin(FULL_LENGTH_ASSAYS)
252
+ adata = ad.concat(
253
+ [
254
+ adata[
255
+ ~adata.obs["assay_ontology_term_id"].isin(
256
+ FULL_LENGTH_ASSAYS
257
+ )
258
+ ],
259
+ subadata,
186
260
  ],
187
- subadata,
188
- ],
189
- axis=0,
190
- join="outer",
191
- merge="only",
192
- )
193
- # step 3: normalize total
194
- adata.layers["clean"] = sc.pp.log1p(
195
- sc.pp.normalize_total(adata, target_sum=self.normalize_sum, inplace=False)[
196
- "X"
197
- ]
198
- )
261
+ axis=0,
262
+ join="outer",
263
+ merge="only",
264
+ )
199
265
 
200
266
  # QC
267
+
201
268
  adata.var[genesdf.columns] = genesdf.loc[adata.var.index]
269
+ for name in ["stable_id", "created_at", "updated_at"]:
270
+ if name in adata.var.columns:
271
+ adata.var = adata.var.drop(columns=name)
272
+ print("startin QC")
202
273
  sc.pp.calculate_qc_metrics(
203
274
  adata, qc_vars=["mt", "ribo", "hb"], inplace=True, percent_top=[20]
204
275
  )
@@ -224,31 +295,38 @@ class Preprocessor:
224
295
  # raise Exception("More than 50% of the dataset has been dropped due to outliers.")
225
296
  # adata = adata[(~adata.obs.outlier) & (~adata.obs.mt_outlier)].copy()
226
297
  # remaining
227
- # step 5: subset hvg
228
- if self.subset_hvg:
229
- sc.pp.highly_variable_genes(
230
- adata,
231
- layer="clean",
232
- n_top_genes=self.subset_hvg,
233
- batch_key=self.batch_key,
234
- flavor=self.hvg_flavor,
235
- subset=False,
236
- )
298
+
237
299
  # based on the topometry paper https://www.biorxiv.org/content/10.1101/2022.03.14.484134v2
238
300
  # https://rapids-singlecell.readthedocs.io/en/latest/api/generated/rapids_singlecell.pp.pca.html#rapids_singlecell.pp.pca
239
-
240
- adata.obsm["clean_pca"] = sc.pp.pca(
241
- adata.layers["clean"],
242
- n_comps=300 if adata.shape[0] > 300 else adata.shape[0] - 2,
243
- )
244
- sc.pp.neighbors(adata, use_rep="clean_pca")
245
- sc.tl.leiden(adata, key_added="leiden_3", resolution=3.0)
246
- sc.tl.leiden(adata, key_added="leiden_2", resolution=2.0)
247
- sc.tl.leiden(adata, key_added="leiden_1", resolution=1.0)
248
- sc.tl.umap(adata)
249
- # additional
250
- if self.additional_postprocess is not None:
251
- adata = self.additional_postprocess(adata)
301
+ if self.do_postp:
302
+ print("normalize")
303
+ adata.layers["clean"] = sc.pp.log1p(
304
+ sc.pp.normalize_total(
305
+ adata, target_sum=self.normalize_sum, inplace=False
306
+ )["X"]
307
+ )
308
+ # step 5: subset hvg
309
+ if self.subset_hvg:
310
+ sc.pp.highly_variable_genes(
311
+ adata,
312
+ layer="clean",
313
+ n_top_genes=self.subset_hvg,
314
+ batch_key=self.batch_key,
315
+ flavor=self.hvg_flavor,
316
+ subset=False,
317
+ )
318
+ adata.obsm["clean_pca"] = sc.pp.pca(
319
+ adata.layers["clean"],
320
+ n_comps=300 if adata.shape[0] > 300 else adata.shape[0] - 2,
321
+ )
322
+ sc.pp.neighbors(adata, use_rep="clean_pca")
323
+ sc.tl.leiden(adata, key_added="leiden_3", resolution=3.0)
324
+ sc.tl.leiden(adata, key_added="leiden_2", resolution=2.0)
325
+ sc.tl.leiden(adata, key_added="leiden_1", resolution=1.0)
326
+ sc.tl.umap(adata)
327
+ # additional
328
+ if self.additional_postprocess is not None:
329
+ adata = self.additional_postprocess(adata)
252
330
  adata = adata[:, adata.var.sort_index().index]
253
331
  # create random ids for all cells
254
332
  adata.obs.index = [str(uuid4()) for _ in range(adata.shape[0])]
@@ -296,6 +374,7 @@ class Preprocessor:
296
374
  bin_edges.append(np.concatenate([[0], bins]))
297
375
  adata.layers[self.result_binned_key] = np.stack(binned_rows)
298
376
  adata.obsm["bin_edges"] = np.stack(bin_edges)
377
+ print("done")
299
378
  return adata
300
379
 
301
380
 
@@ -306,12 +385,14 @@ class LaminPreprocessor(Preprocessor):
306
385
  erase_prev_dataset: bool = False,
307
386
  cache: bool = True,
308
387
  stream: bool = False,
388
+ keep_files: bool = True,
309
389
  **kwargs,
310
390
  ):
311
391
  super().__init__(*args, **kwargs)
312
392
  self.erase_prev_dataset = erase_prev_dataset
313
393
  self.cache = cache
314
394
  self.stream = stream
395
+ self.keep_files = keep_files
315
396
 
316
397
  def __call__(
317
398
  self,
@@ -319,7 +400,7 @@ class LaminPreprocessor(Preprocessor):
319
400
  name="preprocessed dataset",
320
401
  description="preprocessed dataset using scprint",
321
402
  start_at=0,
322
- version="2",
403
+ version=2,
323
404
  ):
324
405
  """
325
406
  format controls the different input value wrapping, including categorical
@@ -334,49 +415,97 @@ class LaminPreprocessor(Preprocessor):
334
415
  all_ready_processed_keys = set()
335
416
  if self.cache:
336
417
  for i in ln.Artifact.filter(description=description):
337
- all_ready_processed_keys.add(i.initial_version.key)
418
+ all_ready_processed_keys.add(i.stem_uid)
338
419
  if isinstance(data, AnnData):
339
- return self.preprocess(data)
420
+ return super().__call__(data)
340
421
  elif isinstance(data, ln.Collection):
341
422
  for i, file in enumerate(data.artifacts.all()[start_at:]):
342
423
  # use the counts matrix
343
424
  print(i)
344
- if file.key in all_ready_processed_keys:
345
- print(f"{file.key} is already processed")
425
+ if file.stem_uid in all_ready_processed_keys:
426
+ print(f"{file.stem_uid} is already processed... not preprocessing")
346
427
  continue
347
428
  print(file)
348
- if file.backed().obs.is_primary_data.sum() == 0:
349
- print(f"{file.key} only contains non primary cells")
429
+ backed = file.backed()
430
+ if backed.obs.is_primary_data.sum() == 0:
431
+ print(f"{file.key} only contains non primary cells.. dropping")
350
432
  continue
351
- adata = file.load(stream=self.stream)
433
+ if backed.shape[1] < 1000:
434
+ print(
435
+ f"{file.key} only contains less than 1000 genes and is likely not scRNAseq... dropping"
436
+ )
437
+ continue
438
+ if file.size <= MAXFILESIZE:
439
+ adata = file.load(stream=self.stream)
440
+ print(adata)
441
+ else:
442
+ badata = backed
443
+ print(badata)
352
444
 
353
- print(adata)
354
445
  try:
355
- adata = super().__call__(adata)
446
+ if file.size > MAXFILESIZE:
447
+ print(
448
+ f"dividing the dataset as it is too large: {file.size//1_000_000_000}Gb"
449
+ )
450
+ num_blocks = int(np.ceil(file.size / (MAXFILESIZE / 2)))
451
+ block_size = int(
452
+ (np.ceil(badata.shape[0] / 30_000) * 30_000) // num_blocks
453
+ )
454
+ print("num blocks ", num_blocks)
455
+ for i in range(num_blocks):
456
+ start_index = i * block_size
457
+ end_index = min((i + 1) * block_size, badata.shape[0])
458
+ block = badata[start_index:end_index].to_memory()
459
+ print(block)
460
+ block = super().__call__(block)
461
+ myfile = ln.Artifact(
462
+ block,
463
+ is_new_version_of=file,
464
+ description=description,
465
+ version=str(version) + "_s" + str(i),
466
+ )
467
+ myfile.save()
468
+ if self.keep_files:
469
+ files.append(myfile)
470
+ else:
471
+ del myfile
472
+ del block
473
+
474
+ else:
475
+ adata = super().__call__(adata)
476
+ myfile = ln.Artifact(
477
+ adata,
478
+ is_new_version_of=file,
479
+ description=description,
480
+ version=str(version),
481
+ )
482
+ myfile.save()
483
+ if self.keep_files:
484
+ files.append(myfile)
485
+ else:
486
+ del myfile
487
+ del adata
356
488
 
357
489
  except ValueError as v:
358
- if v.args[0].startswith(
359
- "Dataset dropped because contains too many secondary"
360
- ):
490
+ if v.args[0].startswith("we cannot work with this organism"):
361
491
  print(v)
362
492
  continue
363
493
  else:
364
494
  raise v
365
- for name in ["stable_id", "created_at", "updated_at"]:
366
- if name in adata.var.columns:
367
- adata.var = adata.var.drop(columns=name)
368
- myfile = ln.Artifact(
369
- adata,
370
- is_new_version_of=file,
371
- description=description,
372
- version=version,
373
- )
495
+ except Exception as e:
496
+ if e.args[0].startswith("Dataset dropped due to"):
497
+ print(e)
498
+ continue
499
+ else:
500
+ raise e
501
+
374
502
  # issues with KLlggfw6I6lvmbqiZm46
375
- myfile.save()
376
- files.append(myfile)
377
- dataset = ln.Collection(files, name=name, description=description)
378
- dataset.save()
379
- return dataset
503
+ if self.keep_files:
504
+ dataset = ln.Collection(files, name=name, description=description)
505
+ dataset.save()
506
+ return dataset
507
+ else:
508
+ return
380
509
  else:
381
510
  raise ValueError("Please provide either anndata or ln.Collection")
382
511
 
@@ -498,7 +627,7 @@ def additional_preprocess(adata):
498
627
  ].astype(str)
499
628
  adata.obs.loc[loc, "tissue_ontology_term_id"] = adata.obs.loc[
500
629
  loc, "tissue_ontology_term_id"
501
- ].str.replace(r" \(cell culture\)", "")
630
+ ].str.replace(" (cell culture)", "")
502
631
 
503
632
  loc = adata.obs["tissue_ontology_term_id"].str.contains("(organoid)", regex=False)
504
633
  if loc.sum() > 0:
@@ -508,7 +637,7 @@ def additional_preprocess(adata):
508
637
  ].astype(str)
509
638
  adata.obs.loc[loc, "tissue_ontology_term_id"] = adata.obs.loc[
510
639
  loc, "tissue_ontology_term_id"
511
- ].str.replace(r" \(organoid\)", "")
640
+ ].str.replace(" (organoid)", "")
512
641
 
513
642
  loc = adata.obs["tissue_ontology_term_id"].str.contains("CL:", regex=False)
514
643
  if loc.sum() > 0:
@@ -563,3 +692,209 @@ def additional_postprocess(adata):
563
692
  # to query N next time points we just get the N elements below and check they are in the group
564
693
  # to query the N nearest neighbors we just get the N elements above and N below and check they are in the group
565
694
  return adata
695
+
696
+
697
+ """
698
+ sexr = {
699
+ "Male": "PATO:0000384",
700
+ "Female": "PATO:0000383",
701
+ }
702
+ tissuer = {
703
+ "Kidney": "UBERON:0002113",
704
+ "Lung": "UBERON:0002048",
705
+ "Heart": "UBERON:0000948",
706
+ "Liver": "UBERON:0002107",
707
+ "Brain": "UBERON:0000955",
708
+ "BAT": "UBERON:0001348",
709
+ "Jejunum": "UBERON:0002115",
710
+ "Colon": "UBERON:0001155",
711
+ "Ileum": "UBERON:0002116",
712
+ "Stomach": "UBERON:0000945",
713
+ "gWAT": "UBERON:0001347",
714
+ "Duodenum": "UBERON:0002114",
715
+ "iWAT": "UBERON:0001347",
716
+ "Muscle": "UBERON:0001630",
717
+ }
718
+ ager = {
719
+ "03_months": "MmusDv:0000063",
720
+ "16_months": "MmusDv:0000087",
721
+ "06_months": "MmusDv:0000077",
722
+ "23_months": "MmusDv:0000127",
723
+ "12_months": "MmusDv:0000083",
724
+ "21_months": "MmusDv:0000125",
725
+ }
726
+
727
+ celltyper = {
728
+ "Proximal tubule cells": "epithelial cell of proximal tubule",
729
+ "Vascular endothelial cells": "endothelial cell of vascular tree",
730
+ "Intestinal epithelial cells": "intestinal epithelial cell",
731
+ "Hepatocytes": "hepatocyte",
732
+ "Fibroblasts": "fibroblast",
733
+ "Lymphoid cells_T cells": "T cell",
734
+ "Myeloid cells": "myeloid cell",
735
+ "Brown adipocytes": "brown fat cell",
736
+ "Lymphoid cells_B cells": "B cell",
737
+ "Adipocytes": "fat cell",
738
+ "Type II alveolar epithelial cells": "type II pneumocyte",
739
+ "Colonic epithelial cells": "colon epithelial cell",
740
+ "Mural cells": "mural cell",
741
+ "Cerebellum granule neurons": "cerebellar neuron",
742
+ "Goblet cells": "goblet cell",
743
+ "Vascular endothelial cells_General capillary cells": "endothelial cell of vascular tree",
744
+ "Ventricular cardiomyocytes": "regular ventricular cardiac myocyte",
745
+ "Type II myonuclei": "type II muscle cell",
746
+ "Thick ascending limb of LOH cells": "vasa recta ascending limb cell",
747
+ "Gastric mucous cells": "mucous cell of stomach",
748
+ "Distal convoluted tubule cells": "kidney distal convoluted tubule epithelial cell",
749
+ "Adipoce stem and progenitor cells": "hepatic oval stem cell",
750
+ "Chief cells": "chief cell of parathyroid gland",
751
+ "Paneth cells": "paneth cell",
752
+ "Myeloid cells_Alveolar macrophages": "alveolar macrophage",
753
+ "Lymphoid cells_Plasma cells": "plasma cell",
754
+ "Secretory cells": "secretory cell",
755
+ "Lymphoid cells_Resting B cells": "B cell",
756
+ "Cortical projection neurons 1": "corticothalamic-projecting glutamatergic cortical neuron",
757
+ "Endocardial endothelial cells": "endocardial cell",
758
+ "Type I alveolar epithelial cells": "type I pneumocyte",
759
+ "Interbrain and midbrain neurons 1": "midbrain dopaminergic neuron",
760
+ "Interbrain and midbrain neurons 2": "midbrain dopaminergic neuron",
761
+ "Myeloid cells_Monocytes": "monocyte",
762
+ "Myeloid cells_Dendritic cells": "myeloid dendritic cell",
763
+ "Oligodendrocytes": "oligodendrocyte",
764
+ "Lymphatic endothelial cells": "endothelial cell of lymphatic vessel",
765
+ "Enteroendocrine cells": "enteroendocrine cell",
766
+ "Vascular endothelial cells_Aerocytes": "endothelial cell of vascular tree",
767
+ "Gastric epithelial cells": "epithelial cell of stomach",
768
+ "Fibro–adipogenic progenitors": "fibro/adipogenic progenitor cell",
769
+ "Parietal cells": "parietal cell",
770
+ "Astrocytes": "astrocyte",
771
+ "Connecting tubule cells": "kidney connecting tubule beta-intercalated cell",
772
+ "Hepatic stellate cells": "hepatic stellate cell",
773
+ "Striatal neurons 1": "striatum neuron",
774
+ "Mesothelial cells": "mesothelial cell",
775
+ "Lymphoid cells_Cycling B cells": "germinal center B cell",
776
+ "Type B intercalated cells": "renal beta-intercalated cell",
777
+ "Type A intercalated cells": "renal alpha-intercalated cell",
778
+ "Myeloid cells_Neutrophils": "neutrophil",
779
+ "Principal cells": "renal principal cell",
780
+ "Cortical projection neurons 2": "corticothalamic-projecting glutamatergic cortical neuron",
781
+ "Muc2-producing goblet cells": "intestine goblet cell",
782
+ "OB neurons 1": "olfactory bulb interneuron",
783
+ "Atrial cardiomyocytes": "regular atrial cardiac myocyte",
784
+ "Lymphoid cells": "leukocyte",
785
+ "Skeletal muscle cells": "cell of skeletal muscle",
786
+ "Neural cells": "neural cell",
787
+ "Cerebellum interneurons": "cerebellar neuron",
788
+ "Interneurons 1": "interneuron",
789
+ "Descending thin limb of LOH cells": "vasa recta descending limb cell",
790
+ "Tuft cells": "intestinal tuft cell",
791
+ "Oligodendrocyte progenitor cells": "oligodendrocyte precursor cell",
792
+ "Enteric glia": "enteroglial cell",
793
+ "Endothelial cells": "endothelial cell",
794
+ "Dentate gyrus neurons": "dentate gyrus neuron",
795
+ "Myeloid cells_Interstitial macrophages": "tissue-resident macrophage",
796
+ "Ciliated cells": "ciliated cell",
797
+ "Microglia": "microglial cell",
798
+ "Interneurons 2": "interneuron",
799
+ "Ncam1 positive cells": "parafollicular cell",
800
+ "Rdh16 positive cells": "unknown",
801
+ "Circulating hepatoblasts": "hepatoblast",
802
+ "Enteric neurons": "enteric neuron",
803
+ "Ascending thin limb of LOH cells": "vasa recta ascending limb cell",
804
+ "Mfge8 positive cells": "unknown",
805
+ "Cholangiocytes": "cholangiocyte",
806
+ "Podocytes": "podocyte",
807
+ "Muscle satellite cells": "skeletal muscle satellite cell",
808
+ "Purkinje neurons": "Purkinje cell",
809
+ "Juxtaglomerular cells": "juxtaglomerular complex cell",
810
+ "Ngf positive cells": "neurogliaform cell",
811
+ "Bergmann glia": "Bergmann glial cell",
812
+ "Megf11 positive cells": "unknown",
813
+ "Myotendinous junction myonuclei": "unknown",
814
+ "Vascular leptomeningeal cells": "vascular leptomeningeal cell",
815
+ "Urothelial cells": "urothelial cell",
816
+ "Tenocytes": "tendon cell",
817
+ "Myelinating Schwann cells": "myelinating Schwann cell",
818
+ "Epididymal cells": "epididymis glandular cell",
819
+ "Muc6-producing goblet cells": "lung goblet cell",
820
+ "Type I myonuclei": "type I muscle cell",
821
+ "OB neurons 2": "olfactory bulb interneuron",
822
+ "Sis positive cells": "unknown",
823
+ "Lgr5 positive cells": "unknown",
824
+ "Macula densa cells": "macula densa epithelial cell",
825
+ "Choroid plexus epithelial cells": "choroid plexus epithelial cell",
826
+ "Cortical projection neurons 3": "corticothalamic-projecting glutamatergic cortical neuron",
827
+ "Interstitial cells of Cajal": "interstitial cell of Cajal",
828
+ "Cacna1b positive cells": "unknown",
829
+ "Hindbrain neurons 2": "neuron",
830
+ "Myeloid cells_Basophils": "basophil",
831
+ "Ependymal cells": "ependymal cell",
832
+ "Muc5ac-producing goblet cells": "lung goblet cell",
833
+ "Myeloid cells_Mast cells": "mast cell",
834
+ "Pulmonary neuroendocrine cells": "lung neuroendocrine cell",
835
+ "Basal cells": "basal cell",
836
+ "OB neurons 3": "olfactory bulb interneuron",
837
+ "Non-myelinating Schwann cells": "non-myelinating Schwann cell",
838
+ "Asic2 positive cells": "unknown",
839
+ "Striatal neurons 2": "striatum neuron",
840
+ "Erythroblasts": "erythroblast",
841
+ "Hindbrain neurons 1": "neuron",
842
+ "Neuromuscular junction myonuclei": "unknown",
843
+ "Habenula neurons": "unknown",
844
+ "Pituitary cells": "pituitary gland cell",
845
+ "Unipolar brush cells": "unipolar brush cell",
846
+ "Pde4c positive cells": "unknown",
847
+ "Pancreatic acinar cells": "pancreatic acinar cell",
848
+ "Inferior olivary nucleus neurons": "bushy cell",
849
+ "Colec10 positive cells": "unknown",
850
+ "Fcgbp positive cells": "unknown",
851
+ "Fut9 positive cells": "unknown",
852
+ "Mirg positive cells": "unknown",
853
+ "Alox15 positive cells": "unknown",
854
+ "Osteoblasts": "osteoblast",
855
+ }
856
+ genesdf = utils.load_genes("NCBITaxon:10090")
857
+ {k: v if v =="unknown" else bt.CellType.filter(name=v).one().ontology_id for k, v in celltyper.items()}
858
+
859
+ adata.obs["organism_ontology_term_id"] = "NCBITaxon:10090"
860
+ adata.obs["tissue_ontology_term_id"] = adata.obs["Organ_name"].replace(tissuer)
861
+ adata.obs["cell_type_ontology_term_id"] = adata.obs["Main_cell_type"].replace(
862
+ celltyper
863
+ )
864
+ adata.obs["disease_ontology_term_id"] = "PATO:0000461"
865
+ adata.obs["assay_ontology_term_id"] = "unknown"
866
+ adata.obs["self_reported_ethnicity_ontology_term_id"] = "unknown"
867
+ adata.obs["development_stage_ontology_term_id"] = adata.obs["Age_group"].replace(
868
+ ager
869
+ )
870
+ adata.obs["sex_ontology_term_id"] = adata.obs["Gender"].replace(sexr)
871
+
872
+ for i in range(num_blocks):
873
+ start_index = i * block_size
874
+ end_index = min((i + 1) * block_size, len(adata))
875
+ block = adata[start_index:end_index].to_memory()
876
+ # process block here
877
+
878
+ block = block[(block.obs["Gene_count"] > 400)]
879
+
880
+ intersect_genes = set(block.var.index).intersection(set(genesdf.index))
881
+ print(f"Removed {len(block.var.index) - len(intersect_genes)} genes.")
882
+ block = block[:, list(intersect_genes)]
883
+ # marking unseen genes
884
+ unseen = set(genesdf.index) - set(block.var.index)
885
+ # adding them to adata
886
+ emptyda = ad.AnnData(
887
+ csr_matrix((block.shape[0], len(unseen)), dtype=np.float32),
888
+ var=pd.DataFrame(index=list(unseen)),
889
+ obs=pd.DataFrame(index=block.obs.index),
890
+ )
891
+ block = ad.concat([block, emptyda], axis=1, join="outer", merge="only")
892
+ # do a validation function
893
+ block.uns["unseen_genes"] = list(unseen)
894
+ block = block[:, block.var.sort_index().index]
895
+ block.var[genesdf.columns] = genesdf.loc[block.var.index]
896
+ for name in ["stable_id", "created_at", "updated_at"]:
897
+ if name in block.var.columns:
898
+ block.var = block.var.drop(columns=name)
899
+ block.write_h5ad('zhang2024_adata_'+str(i)+".h5ad")
900
+ """