scdataloader 1.9.2__py3-none-any.whl → 2.0.2__py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
@@ -1,5 +1,6 @@
  import gc
- from typing import Callable, Optional, Union
+ import time
+ from typing import Callable, List, Optional, Union
  from uuid import uuid4

  import anndata as ad
@@ -8,13 +9,14 @@ import numpy as np
  import pandas as pd
  import scanpy as sc
  from anndata import AnnData, read_h5ad
+ from django.db.utils import OperationalError
  from scipy.sparse import csr_matrix
  from upath import UPath

  from scdataloader import utils as data_utils

  FULL_LENGTH_ASSAYS = [
- "EFO: 0700016",
+ "EFO:0700016",
  "EFO:0008930",
  "EFO:0008931",
  ]
@@ -47,20 +49,21 @@ class Preprocessor:
  maxdropamount: int = 50,
  madoutlier: int = 5,
  pct_mt_outlier: int = 8,
- batch_keys: list[str] = [
+ batch_keys: List[str] = [
  "assay_ontology_term_id",
  "self_reported_ethnicity_ontology_term_id",
  "sex_ontology_term_id",
  "donor_id",
  "suspension_type",
  ],
- skip_validate: bool = False,
+ skip_validate: bool = True,
  additional_preprocess: Optional[Callable[[AnnData], AnnData]] = None,
  additional_postprocess: Optional[Callable[[AnnData], AnnData]] = None,
  do_postp: bool = True,
- organisms: list[str] = ["NCBITaxon:9606", "NCBITaxon:10090"],
+ organisms: List[str] = ["NCBITaxon:9606", "NCBITaxon:10090"],
  use_raw: bool = True,
  keepdata: bool = False,
+ drop_non_primary: bool = False,
  ) -> None:
  """
  Initializes the preprocessor and configures the workflow steps.
@@ -108,6 +111,8 @@ class Preprocessor:
  Defaults to False.
  keepdata (bool, optional): Determines whether to keep the data in the AnnData object.
  Defaults to False.
+ drop_non_primary (bool, optional): Determines whether to drop non-primary cells.
+ Defaults to False.
  """
  self.filter_gene_by_counts = filter_gene_by_counts
  self.filter_cell_by_counts = filter_cell_by_counts
@@ -123,6 +128,7 @@ class Preprocessor:
  self.min_valid_genes_id = min_valid_genes_id
  self.min_nnz_genes = min_nnz_genes
  self.maxdropamount = maxdropamount
+ self.drop_non_primary = drop_non_primary
  self.madoutlier = madoutlier
  self.n_hvg_for_postp = n_hvg_for_postp
  self.pct_mt_outlier = pct_mt_outlier
@@ -139,13 +145,14 @@ class Preprocessor:
  if self.additional_preprocess is not None:
  adata = self.additional_preprocess(adata)
  if "organism_ontology_term_id" not in adata[0].obs.columns:
- raise ValueError(
- "organism_ontology_term_id not found in adata.obs, you need to add an ontology term id for the organism of your anndata"
- )
- if not adata[0].var.index.str.contains("ENS").any() and not self.is_symbol:
- raise ValueError(
- "gene names in the `var.index` field of your anndata should map to the ensembl_gene nomenclature else set `is_symbol` to True if using hugo symbols"
- )
+ if "organism_ontology_term_id" in adata.uns:
+ adata.obs["organism_ontology_term_id"] = adata.uns[
+ "organism_ontology_term_id"
+ ]
+ else:
+ raise ValueError(
+ "organism_ontology_term_id not found in adata.obs, you need to add an ontology term id for the organism of your anndata"
+ )
  if adata.obs["organism_ontology_term_id"].iloc[0] not in self.organisms:
  raise ValueError(
  "we cannot work with this organism",
@@ -161,8 +168,8 @@ class Preprocessor:
  if np.abs(adata[:50_000].X.astype(int) - adata[:50_000].X).sum():
  print("X was not raw counts, using 'counts' layer")
  adata.X = adata.layers["counts"].copy()
- print("Dropping layers: ", adata.layers.keys())
  if not self.keepdata:
+ print("Dropping layers: ", adata.layers.keys())
  del adata.layers
  if len(adata.varm.keys()) > 0 and not self.keepdata:
  del adata.varm
@@ -170,6 +177,8 @@ class Preprocessor:
  del adata.obsm
  if len(adata.obsp.keys()) > 0 and not self.keepdata:
  del adata.obsp
+ if len(adata.varp.keys()) > 0 and not self.keepdata:
+ del adata.varp
  # check that it is a count

  print("checking raw counts")
@@ -188,7 +197,7 @@ class Preprocessor:
  # if not available count drop
  prevsize = adata.shape[0]
  # dropping non primary
- if "is_primary_data" in adata.obs.columns:
+ if "is_primary_data" in adata.obs.columns and self.drop_non_primary:
  adata = adata[adata.obs.is_primary_data]
  if adata.shape[0] < self.min_dataset_size:
  raise Exception("Dataset dropped due to too many secondary cells")
@@ -213,13 +222,10 @@ class Preprocessor:
  min_genes=self.min_nnz_genes,
  )
  # if lost > 50% of the dataset, drop dataset
- # load the genes
- genesdf = data_utils.load_genes(adata.obs.organism_ontology_term_id.iloc[0])
-
- if prevsize / adata.shape[0] > self.maxdropamount:
+ if prevsize / (adata.shape[0] + 1) > self.maxdropamount:
  raise Exception(
  "Dataset dropped due to low expressed genes and unexpressed cells: factor of "
- + str(prevsize / adata.shape[0])
+ + str(prevsize / (adata.shape[0] + 1))
  )
  if adata.shape[0] < self.min_dataset_size:
  raise Exception(
@@ -232,60 +238,39 @@ class Preprocessor:
  )
  )

- # Check if we have a mix of gene names and ensembl IDs
- has_ens = adata.var.index.str.match(r"ENS.*\d{6,}$").any()
- all_ens = adata.var.index.str.match(r"ENS.*\d{6,}$").all()
-
- if not has_ens:
- print("No ENS genes found, assuming gene symbols...")
- elif not all_ens:
- print("Mix of ENS and gene symbols found, converting all to ENS IDs...")
-
+ # load the genes
+ genesdf = data_utils.load_genes(adata.obs.organism_ontology_term_id.iloc[0])
  genesdf["ensembl_gene_id"] = genesdf.index

  # For genes that are already ENS IDs, use them directly
- ens_mask = adata.var.index.str.match(r"ENS.*\d{6,}$")
- symbol_mask = ~ens_mask
-
+ prev_size = adata.shape[1]
  # Handle symbol genes
- if symbol_mask.any():
- symbol_var = adata.var[symbol_mask].merge(
+ if self.is_symbol:
+ new_var = adata.var.merge(
  genesdf.drop_duplicates("symbol").set_index("symbol", drop=False),
  left_index=True,
  right_index=True,
  how="inner",
  )
-
- # Handle ENS genes
- if ens_mask.any():
- ens_var = adata.var[ens_mask].merge(
+ new_var["symbol"] = new_var.index
+ adata = adata[:, new_var.index]
+ new_var.index = new_var["ensembl_gene_id"]
+ else:
+ new_var = adata.var.merge(
  genesdf, left_index=True, right_index=True, how="inner"
  )
+ adata = adata[:, new_var.index]
+ print(f"Removed {prev_size - adata.shape[1]} genes not known to the ontology")
+ prev_size = adata.shape[1]

- # Combine and sort
- if symbol_mask.any() and ens_mask.any():
- var = pd.concat([symbol_var, ens_var])
- elif symbol_mask.any():
- var = symbol_var
- else:
- var = ens_var
-
- adata = adata[:, var.index]
- # var = var.sort_values(by="ensembl_gene_id").set_index("ensembl_gene_id")
- # Update adata with combined genes
- if "ensembl_gene_id" in var.columns:
- adata.var = var.set_index("ensembl_gene_id")
- else:
- adata.var = var
+ adata.var = new_var
  # Drop duplicate genes, keeping first occurrence
  adata = adata[:, ~adata.var.index.duplicated(keep="first")]
+ print(f"Removed {prev_size - adata.shape[1]} duplicate genes")

- intersect_genes = set(adata.var.index).intersection(set(genesdf.index))
- print(f"Removed {len(adata.var.index) - len(intersect_genes)} genes.")
- if len(intersect_genes) < self.min_valid_genes_id:
+ if adata.shape[1] < self.min_valid_genes_id:
  raise Exception("Dataset dropped due to too many genes not mapping to it")
- adata = adata[:, list(intersect_genes)]
- # marking unseen genes
+
  unseen = set(genesdf.index) - set(adata.var.index)
  # adding them to adata
  emptyda = ad.AnnData(
@@ -293,6 +278,9 @@ class Preprocessor:
  var=pd.DataFrame(index=list(unseen)),
  obs=pd.DataFrame(index=adata.obs.index),
  )
+ print(
+ f"Added {len(unseen)} genes in the ontology but not present in the dataset"
+ )
  adata = ad.concat([adata, emptyda], axis=1, join="outer", merge="only")
  # do a validation function
  adata.uns["unseen_genes"] = list(unseen)
@@ -330,7 +318,7 @@ class Preprocessor:
  # QC

  adata.var[genesdf.columns] = genesdf.loc[adata.var.index]
- print("startin QC")
+ print("starting QC")
  sc.pp.calculate_qc_metrics(
  adata, qc_vars=["mt", "ribo", "hb"], inplace=True, percent_top=[20]
  )
@@ -348,7 +336,7 @@ class Preprocessor:
  )
  total_outliers = (adata.obs["outlier"] | adata.obs["mt_outlier"]).sum()
  total_cells = adata.shape[0]
- percentage_outliers = (total_outliers / total_cells) * 100
+ percentage_outliers = (total_outliers / (total_cells + 1)) * 100
  print(
  f"Seeing {total_outliers} outliers ({percentage_outliers:.2f}% of total dataset):"
  )
@@ -374,6 +362,8 @@ class Preprocessor:
  adata.obs["batches"] = adata.obs[batches].apply(
  lambda x: ",".join(x.dropna().astype(str)), axis=1
  )
+ if "highly_variable" in adata.var.columns:
+ adata.var = adata.var.drop(columns=["highly_variable"])
  if self.n_hvg_for_postp:
  try:
  sc.pp.highly_variable_genes(
@@ -395,12 +385,15 @@ class Preprocessor:
  subset=False,
  layer="norm",
  )
-
+ print("starting PCA")
  adata.obsm["X_pca"] = sc.pp.pca(
- adata.layers["norm"][:, adata.var.highly_variable]
- if "highly_variable" in adata.var.columns
- else adata.layers["norm"],
- n_comps=200 if adata.shape[0] > 200 else adata.shape[0] - 2,
+ (
+ adata.layers["norm"][:, adata.var["highly_variable"]]
+ if "highly_variable" in adata.var.columns
+ else adata.layers["norm"]
+ ),
+ n_comps=50 if adata.shape[0] > 1000 else adata.shape[0] // 20,
+ chunked=adata.shape[0] > 100_000,
  )

  # additional
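The PCA step now caps the embedding at 50 components (down from 200), shrinks the component count for very small datasets, and switches to scanpy's chunked (incremental) solver above 100,000 cells to bound memory. A minimal sketch of the same call on random data (shape and seed are illustrative):

    import numpy as np
    import scanpy as sc

    X = np.random.default_rng(0).random((2_000, 500)).astype(np.float32)

    # Mirrors the new settings: 50 comps for normally sized data, a reduced
    # count for tiny ones, incremental PCA for very large ones.
    n_obs = X.shape[0]
    pca = sc.pp.pca(
        X,
        n_comps=50 if n_obs > 1000 else n_obs // 20,
        chunked=n_obs > 100_000,  # IncrementalPCA: lower memory, slower
    )
    print(pca.shape)  # (2000, 50)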
@@ -464,13 +457,15 @@ class LaminPreprocessor(Preprocessor):
  *args,
  cache: bool = True,
  keep_files: bool = True,
- force_preloaded: bool = False,
+ force_lamin_cache: bool = False,
+ assays_to_drop: List[str] = ["EFO:0008939"],
  **kwargs,
  ):
  super().__init__(*args, **kwargs)
  self.cache = cache
  self.keep_files = keep_files
- self.force_preloaded = force_preloaded
+ self.force_lamin_cache = force_lamin_cache
+ self.assays_to_drop = assays_to_drop

  def __call__(
  self,
@@ -505,19 +500,25 @@ class LaminPreprocessor(Preprocessor):
  print(f"{file.stem_uid} is already processed... not preprocessing")
  continue
  print(file)
+ if self.force_lamin_cache:
+ path = cache_path(file)
+ backed = read_h5ad(path, backed="r")
+ else:
+ # file.cache()
+ backed = file.open()

- _ = cache_path(file) if self.force_preloaded else file.cache()
- backed = file.open()
- # backed = read_h5ad(path, backed="r")
  if "is_primary_data" in backed.obs.columns:
  if backed.obs.is_primary_data.sum() == 0:
  print(f"{file.key} only contains non primary cells.. dropping")
  # Save the stem_uid to a file to avoid loading it again
- with open("nonprimary.txt", "a") as f:
- f.write(f"{file.stem_uid}\n")
- continue
+ with open("nonprimary.txt", "a") as f:
+ f.write(f"{file.stem_uid}\n")
+ continue
  else:
  print("Warning: couldn't check unicity from is_primary_data column")
+ if backed.obs.assay_ontology_term_id[0] in self.assays_to_drop:
+ print(f"{file.key} is in the assay drop list.. dropping")
+ continue
  if backed.shape[1] < 1000:
  print(
  f"{file.key} only contains less than 1000 genes and is likely not scRNAseq... dropping"
@@ -556,37 +557,52 @@ class LaminPreprocessor(Preprocessor):
  block,
  dataset_id=file.stem_uid + "_p" + str(j),
  )
- myfile = ln.Artifact.from_anndata(
- block,
- description=description
- + " n"
- + str(i)
- + " p"
- + str(j)
- + " ( revises file "
- + str(file.stem_uid)
- + " )",
- version=version,
- )
- myfile.save()
-
+ saved = False
+ while not saved:
+ try:
+ myfile = ln.Artifact.from_anndata(
+ block,
+ description=description
+ + " n"
+ + str(i)
+ + " p"
+ + str(j)
+ + " ( revises file "
+ + str(file.stem_uid)
+ + " )",
+ version=version,
+ )
+ myfile.save()
+ saved = True
+ except OperationalError:
+ print(
+ "Database locked, waiting 10 seconds and retrying..."
+ )
+ time.sleep(10)
  if self.keep_files:
  files.append(myfile)
  del block
  else:
  del myfile
  del block
- gc.collect()
-
  else:
  adata = super().__call__(adata, dataset_id=file.stem_uid)
- myfile = ln.Artifact.from_anndata(
- adata,
- revises=file,
- description=description + " p" + str(i),
- version=version,
- )
- myfile.save()
+ saved = False
+ while not saved:
+ try:
+ myfile = ln.Artifact.from_anndata(
+ adata,
+ # revises=file,
+ description=description + " p" + str(i),
+ version=version,
+ )
+ myfile.save()
+ saved = True
+ except OperationalError:
+ print(
+ "Database locked, waiting 10 seconds and retrying..."
+ )
+ time.sleep(10)
  if self.keep_files:
  files.append(myfile)
  del adata
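Both save paths now wrap `Artifact.save()` in a retry loop: SQLite-backed Lamin instances raise `django.db.utils.OperationalError` ("database is locked") under concurrent writers, and waiting before retrying is the standard remedy. The inline loops retry forever; a reusable sketch with a capped attempt count (the helper name and cap are my own additions):

    import time

    from django.db.utils import OperationalError


    def save_with_retry(artifact, max_attempts: int = 10, wait_s: float = 10.0):
        """Retry artifact.save() while the SQLite database is locked.

        Hypothetical helper: the diff inlines this logic without a cap.
        """
        for attempt in range(1, max_attempts + 1):
            try:
                artifact.save()
                return artifact
            except OperationalError:
                if attempt == max_attempts:
                    raise
                print(f"Database locked, retrying in {wait_s}s ({attempt}/{max_attempts})")
                time.sleep(wait_s)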
@@ -606,7 +622,7 @@ class LaminPreprocessor(Preprocessor):
  continue
  else:
  raise e
-
+ gc.collect()
  # issues with KLlggfw6I6lvmbqiZm46
  if self.keep_files:
  # Reconstruct collection using keys
@@ -716,7 +732,7 @@ def additional_preprocess(adata):
  }
  }
  ) # multi ethnic will have to get renamed
- adata.obs["cell_culture"] = False
+ adata.obs["cell_culture"] = "False"
  # if cell_type contains the word "(cell culture)" then it is a cell culture and we mark it as so and remove this from the cell type
  loc = adata.obs["cell_type_ontology_term_id"].str.contains(
  "(cell culture)", regex=False
@@ -725,7 +741,7 @@ def additional_preprocess(adata):
  adata.obs["cell_type_ontology_term_id"] = adata.obs[
  "cell_type_ontology_term_id"
  ].astype(str)
- adata.obs.loc[loc, "cell_culture"] = True
+ adata.obs.loc[loc, "cell_culture"] = "True"
  adata.obs.loc[loc, "cell_type_ontology_term_id"] = adata.obs.loc[
  loc, "cell_type_ontology_term_id"
  ].str.replace(" (cell culture)", "")
@@ -734,7 +750,7 @@ def additional_preprocess(adata):
  "(cell culture)", regex=False
  )
  if loc.sum() > 0:
- adata.obs.loc[loc, "cell_culture"] = True
+ adata.obs.loc[loc, "cell_culture"] = "True"
  adata.obs["tissue_ontology_term_id"] = adata.obs[
  "tissue_ontology_term_id"
  ].astype(str)
@@ -744,7 +760,7 @@ def additional_preprocess(adata):

  loc = adata.obs["tissue_ontology_term_id"].str.contains("(organoid)", regex=False)
  if loc.sum() > 0:
- adata.obs.loc[loc, "cell_culture"] = True
+ adata.obs.loc[loc, "cell_culture"] = "True"
  adata.obs["tissue_ontology_term_id"] = adata.obs[
  "tissue_ontology_term_id"
  ].astype(str)
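Every `cell_culture` assignment switches from Python booleans to the strings "True"/"False". Presumably this keeps the column uniformly string-typed: the flag is set on subsets via `.loc`, and mixing booleans and strings in one obs column yields an object-dtype column that is awkward to store as a categorical in `.h5ad`. A tiny illustration:

    import pandas as pd

    # Mixed bool/str assignments produce an object column ...
    mixed = pd.Series([False, "True", False])
    print(mixed.dtype)  # object

    # ... while all-string flags make a clean categorical.
    flags = pd.Series(["False", "True", "False"], dtype="category")
    print(flags.dtype)  # category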
@@ -773,6 +789,7 @@ def additional_postprocess(adata):
  # sc.external.pp.harmony_integrate(adata, key="batches")
  # sc.pp.neighbors(adata, use_rep="X_pca_harmony")
  # else:
+ print("starting post processing")
  sc.pp.neighbors(adata, use_rep="X_pca")
  sc.tl.leiden(adata, key_added="leiden_2", resolution=2.0)
  sc.tl.leiden(adata, key_added="leiden_1", resolution=1.0)
@@ -791,8 +808,12 @@ def additional_postprocess(adata):
  MAXSIM = 0.94
  from collections import Counter

+ import bionty as bt
+
  from .config import MAIN_HUMAN_MOUSE_DEV_STAGE_MAP

+ remap_stages = {u: k for k, v in MAIN_HUMAN_MOUSE_DEV_STAGE_MAP.items() for u in v}
+
  adata.obs[NEWOBS] = (
  adata.obs[COL].astype(str) + "_" + adata.obs["leiden_1"].astype(str)
  )
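`remap_stages` inverts the one-to-many `MAIN_HUMAN_MOUSE_DEV_STAGE_MAP` (group id mapped to its member stage ids) into a flat member-to-group lookup. The nested-comprehension pattern, shown on a stand-in mapping:

    # Stand-in for MAIN_HUMAN_MOUSE_DEV_STAGE_MAP: group -> member stages.
    stage_map = {
        "group_a": ["stage_1", "stage_2"],
        "group_b": ["stage_3"],
    }

    # Flatten to member -> group so individual stages resolve directly.
    remap_stages = {u: k for k, v in stage_map.items() for u in v}
    print(remap_stages["stage_2"])  # group_a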
@@ -860,18 +881,17 @@ def additional_postprocess(adata):
  num += 1
  adata.obs[NEWOBS] = adata.obs[NEWOBS].map(merge_mapping).fillna(adata.obs[NEWOBS])

- import bionty as bt
-
  stages = adata.obs["development_stage_ontology_term_id"].unique()
  if adata.obs.organism_ontology_term_id.unique() == ["NCBITaxon:9606"]:
  relabel = {i: i for i in stages}
  for stage in stages:
+ if stage in MAIN_HUMAN_MOUSE_DEV_STAGE_MAP.keys():
+ continue
  stage_obj = bt.DevelopmentalStage.filter(ontology_id=stage).first()
  parents = set([i.ontology_id for i in stage_obj.parents.filter()])
  parents = parents - set(
  [
  "HsapDv:0010000",
- "HsapDv:0000204",
  "HsapDv:0000227",
  ]
  )
@@ -879,9 +899,14 @@ def additional_postprocess(adata):
  for p in parents:
  if p in MAIN_HUMAN_MOUSE_DEV_STAGE_MAP:
  relabel[stage] = p
- adata.obs["simplified_dev_stage"] = adata.obs[
- "development_stage_ontology_term_id"
- ].map(relabel)
+ adata.obs["age_group"] = adata.obs["development_stage_ontology_term_id"].map(
+ relabel
+ )
+ for stage in adata.obs["age_group"].unique():
+ if stage in remap_stages.keys():
+ adata.obs["age_group"] = adata.obs["age_group"].map(
+ lambda x: remap_stages[x] if x == stage else x
+ )
  elif adata.obs.organism_ontology_term_id.unique() == ["NCBITaxon:10090"]:
  rename_mapping = {
  k: v for v, j in MAIN_HUMAN_MOUSE_DEV_STAGE_MAP.items() for k in j
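The per-stage loop above reapplies `.map` once for each unique `age_group` value; a single pass with a dictionary default produces the same result. A self-contained sketch of the equivalent one-liner (names are illustrative):

    import pandas as pd

    remap_stages = {"stage_old": "stage_new"}  # illustrative lookup
    age_group = pd.Series(["stage_old", "other", "stage_old"])

    # One .map with a default replaces the per-stage loop.
    print(age_group.map(lambda x: remap_stages.get(x, x)).tolist())
    # ['stage_new', 'other', 'stage_new']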
@@ -890,11 +915,12 @@ def additional_postprocess(adata):
  for stage in stages:
  if stage in rename_mapping:
  relabel[stage] = rename_mapping[stage]
- adata.obs["simplified_dev_stage"] = adata.obs[
- "development_stage_ontology_term_id"
- ].map(relabel)
+ adata.obs["age_group"] = adata.obs["development_stage_ontology_term_id"].map(
+ relabel
+ )
  else:
- raise ValueError("organism not supported")
+ # raise ValueError("organism not supported")
+ print("organism not supported for age labels")
  # palantir.utils.run_diffusion_maps(adata, n_components=20)
  # palantir.utils.determine_multiscale_space(adata)
  # terminal_states = palantir.utils.find_terminal_states(