scdataloader 0.0.4__py3-none-any.whl → 1.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,4 @@
1
- from typing import Any, Callable, Optional, Union
1
+ from typing import Callable, Optional, Union
2
2
  from uuid import uuid4
3
3
 
4
4
  import anndata as ad
@@ -7,9 +7,7 @@ import numpy as np
7
7
  import pandas as pd
8
8
  import scanpy as sc
9
9
  from anndata import AnnData
10
- from django.db import IntegrityError
11
10
  from scipy.sparse import csr_matrix
12
- import os
13
11
 
14
12
  from scdataloader import utils as data_utils
15
13
 
@@ -204,7 +202,9 @@ class Preprocessor:
204
202
  )
205
203
  )
206
204
 
207
- if self.is_symbol:
205
+ if self.is_symbol or not adata.var.index.str.contains("ENSG").any():
206
+ if not adata.var.index.str.contains("ENSG").any():
207
+ print("No ENSG genes found, assuming gene symbols...")
208
208
  genesdf["ensembl_gene_id"] = genesdf.index
209
209
  var = (
210
210
  adata.var.merge(
@@ -266,9 +266,6 @@ class Preprocessor:
266
266
  # QC
267
267
 
268
268
  adata.var[genesdf.columns] = genesdf.loc[adata.var.index]
269
- for name in ["stable_id", "created_at", "updated_at"]:
270
- if name in adata.var.columns:
271
- adata.var = adata.var.drop(columns=name)
272
269
  print("startin QC")
273
270
  sc.pp.calculate_qc_metrics(
274
271
  adata, qc_vars=["mt", "ribo", "hb"], inplace=True, percent_top=[20]
@@ -692,209 +689,3 @@ def additional_postprocess(adata):
692
689
  # to query N next time points we just get the N elements below and check they are in the group
693
690
  # to query the N nearest neighbors we just get the N elements above and N below and check they are in the group
694
691
  return adata
695
-
696
-
697
- """
698
- sexr = {
699
- "Male": "PATO:0000384",
700
- "Female": "PATO:0000383",
701
- }
702
- tissuer = {
703
- "Kidney": "UBERON:0002113",
704
- "Lung": "UBERON:0002048",
705
- "Heart": "UBERON:0000948",
706
- "Liver": "UBERON:0002107",
707
- "Brain": "UBERON:0000955",
708
- "BAT": "UBERON:0001348",
709
- "Jejunum": "UBERON:0002115",
710
- "Colon": "UBERON:0001155",
711
- "Ileum": "UBERON:0002116",
712
- "Stomach": "UBERON:0000945",
713
- "gWAT": "UBERON:0001347",
714
- "Duodenum": "UBERON:0002114",
715
- "iWAT": "UBERON:0001347",
716
- "Muscle": "UBERON:0001630",
717
- }
718
- ager = {
719
- "03_months": "MmusDv:0000063",
720
- "16_months": "MmusDv:0000087",
721
- "06_months": "MmusDv:0000077",
722
- "23_months": "MmusDv:0000127",
723
- "12_months": "MmusDv:0000083",
724
- "21_months": "MmusDv:0000125",
725
- }
726
-
727
- celltyper = {
728
- "Proximal tubule cells": "epithelial cell of proximal tubule",
729
- "Vascular endothelial cells": "endothelial cell of vascular tree",
730
- "Intestinal epithelial cells": "intestinal epithelial cell",
731
- "Hepatocytes": "hepatocyte",
732
- "Fibroblasts": "fibroblast",
733
- "Lymphoid cells_T cells": "T cell",
734
- "Myeloid cells": "myeloid cell",
735
- "Brown adipocytes": "brown fat cell",
736
- "Lymphoid cells_B cells": "B cell",
737
- "Adipocytes": "fat cell",
738
- "Type II alveolar epithelial cells": "type II pneumocyte",
739
- "Colonic epithelial cells": "colon epithelial cell",
740
- "Mural cells": "mural cell",
741
- "Cerebellum granule neurons": "cerebellar neuron",
742
- "Goblet cells": "goblet cell",
743
- "Vascular endothelial cells_General capillary cells": "endothelial cell of vascular tree",
744
- "Ventricular cardiomyocytes": "regular ventricular cardiac myocyte",
745
- "Type II myonuclei": "type II muscle cell",
746
- "Thick ascending limb of LOH cells": "vasa recta ascending limb cell",
747
- "Gastric mucous cells": "mucous cell of stomach",
748
- "Distal convoluted tubule cells": "kidney distal convoluted tubule epithelial cell",
749
- "Adipoce stem and progenitor cells": "hepatic oval stem cell",
750
- "Chief cells": "chief cell of parathyroid gland",
751
- "Paneth cells": "paneth cell",
752
- "Myeloid cells_Alveolar macrophages": "alveolar macrophage",
753
- "Lymphoid cells_Plasma cells": "plasma cell",
754
- "Secretory cells": "secretory cell",
755
- "Lymphoid cells_Resting B cells": "B cell",
756
- "Cortical projection neurons 1": "corticothalamic-projecting glutamatergic cortical neuron",
757
- "Endocardial endothelial cells": "endocardial cell",
758
- "Type I alveolar epithelial cells": "type I pneumocyte",
759
- "Interbrain and midbrain neurons 1": "midbrain dopaminergic neuron",
760
- "Interbrain and midbrain neurons 2": "midbrain dopaminergic neuron",
761
- "Myeloid cells_Monocytes": "monocyte",
762
- "Myeloid cells_Dendritic cells": "myeloid dendritic cell",
763
- "Oligodendrocytes": "oligodendrocyte",
764
- "Lymphatic endothelial cells": "endothelial cell of lymphatic vessel",
765
- "Enteroendocrine cells": "enteroendocrine cell",
766
- "Vascular endothelial cells_Aerocytes": "endothelial cell of vascular tree",
767
- "Gastric epithelial cells": "epithelial cell of stomach",
768
- "Fibro–adipogenic progenitors": "fibro/adipogenic progenitor cell",
769
- "Parietal cells": "parietal cell",
770
- "Astrocytes": "astrocyte",
771
- "Connecting tubule cells": "kidney connecting tubule beta-intercalated cell",
772
- "Hepatic stellate cells": "hepatic stellate cell",
773
- "Striatal neurons 1": "striatum neuron",
774
- "Mesothelial cells": "mesothelial cell",
775
- "Lymphoid cells_Cycling B cells": "germinal center B cell",
776
- "Type B intercalated cells": "renal beta-intercalated cell",
777
- "Type A intercalated cells": "renal alpha-intercalated cell",
778
- "Myeloid cells_Neutrophils": "neutrophil",
779
- "Principal cells": "renal principal cell",
780
- "Cortical projection neurons 2": "corticothalamic-projecting glutamatergic cortical neuron",
781
- "Muc2-producing goblet cells": "intestine goblet cell",
782
- "OB neurons 1": "olfactory bulb interneuron",
783
- "Atrial cardiomyocytes": "regular atrial cardiac myocyte",
784
- "Lymphoid cells": "leukocyte",
785
- "Skeletal muscle cells": "cell of skeletal muscle",
786
- "Neural cells": "neural cell",
787
- "Cerebellum interneurons": "cerebellar neuron",
788
- "Interneurons 1": "interneuron",
789
- "Descending thin limb of LOH cells": "vasa recta descending limb cell",
790
- "Tuft cells": "intestinal tuft cell",
791
- "Oligodendrocyte progenitor cells": "oligodendrocyte precursor cell",
792
- "Enteric glia": "enteroglial cell",
793
- "Endothelial cells": "endothelial cell",
794
- "Dentate gyrus neurons": "dentate gyrus neuron",
795
- "Myeloid cells_Interstitial macrophages": "tissue-resident macrophage",
796
- "Ciliated cells": "ciliated cell",
797
- "Microglia": "microglial cell",
798
- "Interneurons 2": "interneuron",
799
- "Ncam1 positive cells": "parafollicular cell",
800
- "Rdh16 positive cells": "unknown",
801
- "Circulating hepatoblasts": "hepatoblast",
802
- "Enteric neurons": "enteric neuron",
803
- "Ascending thin limb of LOH cells": "vasa recta ascending limb cell",
804
- "Mfge8 positive cells": "unknown",
805
- "Cholangiocytes": "cholangiocyte",
806
- "Podocytes": "podocyte",
807
- "Muscle satellite cells": "skeletal muscle satellite cell",
808
- "Purkinje neurons": "Purkinje cell",
809
- "Juxtaglomerular cells": "juxtaglomerular complex cell",
810
- "Ngf positive cells": "neurogliaform cell",
811
- "Bergmann glia": "Bergmann glial cell",
812
- "Megf11 positive cells": "unknown",
813
- "Myotendinous junction myonuclei": "unknown",
814
- "Vascular leptomeningeal cells": "vascular leptomeningeal cell",
815
- "Urothelial cells": "urothelial cell",
816
- "Tenocytes": "tendon cell",
817
- "Myelinating Schwann cells": "myelinating Schwann cell",
818
- "Epididymal cells": "epididymis glandular cell",
819
- "Muc6-producing goblet cells": "lung goblet cell",
820
- "Type I myonuclei": "type I muscle cell",
821
- "OB neurons 2": "olfactory bulb interneuron",
822
- "Sis positive cells": "unknown",
823
- "Lgr5 positive cells": "unknown",
824
- "Macula densa cells": "macula densa epithelial cell",
825
- "Choroid plexus epithelial cells": "choroid plexus epithelial cell",
826
- "Cortical projection neurons 3": "corticothalamic-projecting glutamatergic cortical neuron",
827
- "Interstitial cells of Cajal": "interstitial cell of Cajal",
828
- "Cacna1b positive cells": "unknown",
829
- "Hindbrain neurons 2": "neuron",
830
- "Myeloid cells_Basophils": "basophil",
831
- "Ependymal cells": "ependymal cell",
832
- "Muc5ac-producing goblet cells": "lung goblet cell",
833
- "Myeloid cells_Mast cells": "mast cell",
834
- "Pulmonary neuroendocrine cells": "lung neuroendocrine cell",
835
- "Basal cells": "basal cell",
836
- "OB neurons 3": "olfactory bulb interneuron",
837
- "Non-myelinating Schwann cells": "non-myelinating Schwann cell",
838
- "Asic2 positive cells": "unknown",
839
- "Striatal neurons 2": "striatum neuron",
840
- "Erythroblasts": "erythroblast",
841
- "Hindbrain neurons 1": "neuron",
842
- "Neuromuscular junction myonuclei": "unknown",
843
- "Habenula neurons": "unknown",
844
- "Pituitary cells": "pituitary gland cell",
845
- "Unipolar brush cells": "unipolar brush cell",
846
- "Pde4c positive cells": "unknown",
847
- "Pancreatic acinar cells": "pancreatic acinar cell",
848
- "Inferior olivary nucleus neurons": "bushy cell",
849
- "Colec10 positive cells": "unknown",
850
- "Fcgbp positive cells": "unknown",
851
- "Fut9 positive cells": "unknown",
852
- "Mirg positive cells": "unknown",
853
- "Alox15 positive cells": "unknown",
854
- "Osteoblasts": "osteoblast",
855
- }
856
- genesdf = utils.load_genes("NCBITaxon:10090")
857
- {k: v if v =="unknown" else bt.CellType.filter(name=v).one().ontology_id for k, v in celltyper.items()}
858
-
859
- adata.obs["organism_ontology_term_id"] = "NCBITaxon:10090"
860
- adata.obs["tissue_ontology_term_id"] = adata.obs["Organ_name"].replace(tissuer)
861
- adata.obs["cell_type_ontology_term_id"] = adata.obs["Main_cell_type"].replace(
862
- celltyper
863
- )
864
- adata.obs["disease_ontology_term_id"] = "PATO:0000461"
865
- adata.obs["assay_ontology_term_id"] = "unknown"
866
- adata.obs["self_reported_ethnicity_ontology_term_id"] = "unknown"
867
- adata.obs["development_stage_ontology_term_id"] = adata.obs["Age_group"].replace(
868
- ager
869
- )
870
- adata.obs["sex_ontology_term_id"] = adata.obs["Gender"].replace(sexr)
871
-
872
- for i in range(num_blocks):
873
- start_index = i * block_size
874
- end_index = min((i + 1) * block_size, len(adata))
875
- block = adata[start_index:end_index].to_memory()
876
- # process block here
877
-
878
- block = block[(block.obs["Gene_count"] > 400)]
879
-
880
- intersect_genes = set(block.var.index).intersection(set(genesdf.index))
881
- print(f"Removed {len(block.var.index) - len(intersect_genes)} genes.")
882
- block = block[:, list(intersect_genes)]
883
- # marking unseen genes
884
- unseen = set(genesdf.index) - set(block.var.index)
885
- # adding them to adata
886
- emptyda = ad.AnnData(
887
- csr_matrix((block.shape[0], len(unseen)), dtype=np.float32),
888
- var=pd.DataFrame(index=list(unseen)),
889
- obs=pd.DataFrame(index=block.obs.index),
890
- )
891
- block = ad.concat([block, emptyda], axis=1, join="outer", merge="only")
892
- # do a validation function
893
- block.uns["unseen_genes"] = list(unseen)
894
- block = block[:, block.var.sort_index().index]
895
- block.var[genesdf.columns] = genesdf.loc[block.var.index]
896
- for name in ["stable_id", "created_at", "updated_at"]:
897
- if name in block.var.columns:
898
- block.var = block.var.drop(columns=name)
899
- block.write_h5ad('zhang2024_adata_'+str(i)+".h5ad")
900
- """
scdataloader/utils.py CHANGED
@@ -12,12 +12,48 @@ from scipy.sparse import csr_matrix
12
12
  from scipy.stats import median_abs_deviation
13
13
  from functools import lru_cache
14
14
  from collections import Counter
15
+ from torch import Tensor
16
+ import torch
15
17
 
16
18
  from typing import Union, List, Optional
17
19
 
18
20
  from anndata import AnnData
19
21
 
20
22
 
23
def downsample_profile(mat: Tensor, dropout: float):
    """
    Randomly downsample an expression count tensor.

    Two independent sources of noise are applied element-wise, both driven by
    the same rate ``rate = dropout * 1.1 / 2``:

    1. Sampling zeros: a Poisson draw with mean ``mat * rate`` is subtracted
       from every entry (a fraction of the reads is removed).
    2. Technical zeros: every entry is independently set to zero with
       probability ``rate`` (whole entries are dropped).

    The result is finally clamped at zero, because the Poisson draw can exceed
    the original count.

    Args:
        mat (torch.Tensor): The expression counts. Any shape is accepted; the
            noise is element-wise. Values must be non-negative, since they are
            used as Poisson rates.
        dropout (float): The downsampling strength. 0 returns the counts
            unchanged; the effective per-step rate is ``dropout * 1.1 / 2``.

    Returns:
        torch.Tensor: The downsampled counts, same shape as ``mat``.
    """
    rate = dropout * 1.1 / 2
    # sampling zeros: number of reads removed from each entry
    removed = torch.poisson(mat * rate).int()
    # technical zeros: 1 where the entry is kept, 0 where it is dropped.
    # torch.rand is used with an explicit shape (not rand_like) so that
    # integer count tensors are supported as well.
    kept = (torch.rand(mat.shape, device=mat.device) >= rate).int()
    noisy = (mat - removed) * kept
    # clamp instead of maximum() against a (1, 1) tensor so that the output
    # keeps the input's shape whatever its number of dimensions
    return torch.clamp(noisy, min=0)
55
+
56
+
21
57
  def createFoldersFor(filepath: str):
22
58
  """
23
59
  will recursively create folders if needed until having all the folders required to save the file in this filepath
@@ -38,6 +74,7 @@ def _fetchFromServer(
38
74
  Args:
39
75
  ensemble_server (str): The URL of the ensemble server to fetch data from.
40
76
  attributes (list): The list of attributes to fetch from the server.
77
+ database (str): The database to fetch data from.
41
78
 
42
79
  Returns:
43
80
  pd.DataFrame: A pandas DataFrame containing the fetched data.
@@ -68,6 +105,9 @@ def getBiomartTable(
68
105
  ensemble_server (str, optional): the biomart server. Defaults to "http://jul2023.archive.ensembl.org/biomart".
69
106
  useCache (bool, optional): whether to use the cache or not. Defaults to False.
70
107
  cache_folder (str, optional): the cache folder. Defaults to "/tmp/biomart/".
108
+ attributes (List[str], optional): the attributes to fetch. Defaults to [].
109
+ bypass_attributes (bool, optional): whether to bypass the attributes or not. Defaults to False.
110
+ database (str, optional): the database to fetch from. Defaults to "hsapiens_gene_ensembl".
71
111
 
72
112
  Raises:
73
113
  ValueError: should be a dataframe (when the result from the server is something else)
@@ -98,15 +138,15 @@ def getBiomartTable(
98
138
 
99
139
  res = _fetchFromServer(ensemble_server, attr + attributes, database=database)
100
140
  res.to_csv(cachefile, index=False)
101
-
102
141
  res.columns = attr + attributes
103
142
  if type(res) is not type(pd.DataFrame()):
104
143
  raise ValueError("should be a dataframe")
105
- res = res[~(res["ensembl_gene_id"].isna() & res["hgnc_symbol"].isna())]
106
- res.loc[res[res.hgnc_symbol.isna()].index, "hgnc_symbol"] = res[
107
- res.hgnc_symbol.isna()
108
- ]["ensembl_gene_id"]
109
-
144
+ res = res[~(res["ensembl_gene_id"].isna())]
145
+ if "hgnc_symbol" in res.columns:
146
+ res = res[res["hgnc_symbol"].isna()]
147
+ res.loc[res[res.hgnc_symbol.isna()].index, "hgnc_symbol"] = res[
148
+ res.hgnc_symbol.isna()
149
+ ]["ensembl_gene_id"]
110
150
  return res
111
151
 
112
152
 
@@ -206,6 +246,16 @@ def get_all_ancestors(val: str, df: pd.DataFrame):
206
246
  return set.union(set(parents), *[get_all_ancestors(val, df) for val in parents])
207
247
 
208
248
 
249
# NOTE: functools.lru_cache cannot be applied here: it requires hashable
# arguments and a pandas DataFrame is not hashable.
def get_descendants(val, df):
    """
    Collect every descendant of an ontology term.

    A row of ``df`` is considered a direct child of a term when the term's id
    occurs (as a literal substring) in the row's ``parents__ontology_id``
    value. Children are then expanded transitively.

    Args:
        val (str): The ontology id whose descendants are wanted.
        df (pd.DataFrame): The ontology table, indexed by ontology id, with a
            string column ``parents__ontology_id`` listing the parents of each
            term. Missing values (terms without parents) are allowed.
            NOTE(review): the exact separator used inside that column is not
            visible from here; matching is done by substring.

    Returns:
        set: The index labels of all descendants of ``val`` (empty when it
        has none).
    """
    parents = df.parents__ontology_id
    descendants = set()
    pending = [val]
    # iterative walk with a visited set: each term is expanded once, so shared
    # sub-trees are not re-scanned and a cyclic table cannot loop forever
    while pending:
        current = pending.pop()
        # regex=False: ontology ids are literal text, not patterns
        # na=False: rows without parents simply do not match
        is_child = parents.str.contains(current, regex=False, na=False)
        for child in df.index[is_child.to_numpy(dtype=bool)].tolist():
            if child not in descendants:
                descendants.add(child)
                pending.append(child)
    return descendants
257
+
258
+
209
259
  def get_ancestry_mapping(all_elem: list, onto_df: pd.DataFrame):
210
260
  """
211
261
  This function generates a mapping of all elements to their ancestors in the ontology dataframe.
@@ -304,13 +354,12 @@ def load_dataset_local(
304
354
 
305
355
  def load_genes(organisms: Union[str, list] = "NCBITaxon:9606"): # "NCBITaxon:10090",
306
356
  organismdf = []
307
- if type(organisms) == str:
357
+ if type(organisms) is str:
308
358
  organisms = [organisms]
309
359
  for organism in organisms:
310
360
  genesdf = bt.Gene.filter(
311
361
  organism_id=bt.Organism.filter(ontology_id=organism).first().id
312
362
  ).df()
313
- genesdf = genesdf[~genesdf["public_source_id"].isna()]
314
363
  genesdf = genesdf.drop_duplicates(subset="ensembl_gene_id")
315
364
  genesdf = genesdf.set_index("ensembl_gene_id").sort_index()
316
365
  # mitochondrial genes
@@ -321,7 +370,12 @@ def load_genes(organisms: Union[str, list] = "NCBITaxon:9606"): # "NCBITaxon:10
321
370
  genesdf["hb"] = genesdf.symbol.astype(str).str.contains(("^HB[^(P)]"))
322
371
  genesdf["organism"] = organism
323
372
  organismdf.append(genesdf)
324
- return pd.concat(organismdf)
373
+ organismdf = pd.concat(organismdf)
374
+ organismdf.drop(
375
+ columns=["source_id", "run_id", "created_by_id", "updated_at", "stable_id"],
376
+ inplace=True,
377
+ )
378
+ return organismdf
325
379
 
326
380
 
327
381
  def populate_my_ontology(
@@ -358,77 +412,79 @@ def populate_my_ontology(
358
412
  diseases (list, optional): List of diseases. Defaults to [].
359
413
  dev_stages (list, optional): List of developmental stages. Defaults to [].
360
414
  """
361
-
362
- names = bt.CellType.public().df().index if not celltypes else celltypes
363
- records = bt.CellType.from_values(names, field="ontology_id")
364
- ln.save(records, parents=bool(celltypes))
365
- bt.CellType(name="unknown", ontology_id="unknown").save()
415
+ # cell type
416
+ if celltypes is not None:
417
+ names = bt.CellType.public().df().index if not celltypes else celltypes
418
+ records = bt.CellType.from_values(names, field="ontology_id")
419
+ ln.save(records)
420
+ bt.CellType(name="unknown", ontology_id="unknown").save()
366
421
  # Organism
367
- names = bt.Organism.public().df().index if not organisms else organisms
368
- records = [
369
- i[0] if type(i) is list else i
370
- for i in [bt.Organism.from_public(ontology_id=i) for i in names]
371
- ]
372
- ln.save(records, parents=bool(organisms))
373
- bt.Organism(name="unknown", ontology_id="unknown").save()
422
+ if organisms is not None:
423
+ names = bt.Organism.public().df().index if not organisms else organisms
424
+ records = [
425
+ i[0] if type(i) is list else i
426
+ for i in [bt.Organism.from_source(ontology_id=i) for i in names]
427
+ ]
428
+ ln.save(records)
429
+ bt.Organism(name="unknown", ontology_id="unknown").save()
430
+ organism_names = names
374
431
  # Phenotype
375
- names = bt.Phenotype.public().df().index if not sex else sex
376
- records = [
377
- bt.Phenotype.from_public(
378
- ontology_id=i,
379
- public_source=bt.PublicSource.filter(
380
- entity="Phenotype", source="pato"
381
- ).one(),
382
- )
383
- for i in names
384
- ]
385
- ln.save(records, parents=bool(sex))
386
- bt.Phenotype(name="unknown", ontology_id="unknown").save()
432
+ if sex is not None:
433
+ names = bt.Phenotype.public().df().index if not sex else sex
434
+ records = [
435
+ bt.Phenotype.from_source(
436
+ ontology_id=i,
437
+ )
438
+ for i in names
439
+ ]
440
+ ln.save(records)
441
+ bt.Phenotype(name="unknown", ontology_id="unknown").save()
387
442
  # ethnicity
388
- names = bt.Ethnicity.public().df().index if not ethnicities else ethnicities
389
- records = bt.Ethnicity.from_values(names, field="ontology_id")
390
- ln.save(records, parents=bool(ethnicities))
391
- bt.Ethnicity(
392
- name="unknown", ontology_id="unknown"
393
- ).save() # multi ethnic will have to get renamed
443
+ if ethnicities is not None:
444
+ names = bt.Ethnicity.public().df().index if not ethnicities else ethnicities
445
+ records = bt.Ethnicity.from_values(names, field="ontology_id")
446
+ ln.save(records)
447
+ bt.Ethnicity(
448
+ name="unknown", ontology_id="unknown"
449
+ ).save() # multi ethnic will have to get renamed
394
450
  # ExperimentalFactor
395
- names = bt.ExperimentalFactor.public().df().index if not assays else assays
396
- records = bt.ExperimentalFactor.from_values(names, field="ontology_id")
397
- ln.save(records, parents=bool(assays))
398
- bt.ExperimentalFactor(name="unknown", ontology_id="unknown").save()
399
- # lookup = bt.ExperimentalFactor.lookup()
400
- # lookup.smart_seq_v4.parents.add(lookup.smart_like)
451
+ if assays is not None:
452
+ names = bt.ExperimentalFactor.public().df().index if not assays else assays
453
+ records = bt.ExperimentalFactor.from_values(names, field="ontology_id")
454
+ ln.save(records)
455
+ bt.ExperimentalFactor(name="unknown", ontology_id="unknown").save()
456
+ # lookup = bt.ExperimentalFactor.lookup()
457
+ # lookup.smart_seq_v4.parents.add(lookup.smart_like)
401
458
  # Tissue
402
- names = bt.Tissue.public().df().index if not tissues else tissues
403
- records = bt.Tissue.from_values(names, field="ontology_id")
404
- ln.save(records, parents=bool(tissues))
405
- bt.Tissue(name="unknown", ontology_id="unknown").save()
459
+ if tissues is not None:
460
+ names = bt.Tissue.public().df().index if not tissues else tissues
461
+ records = bt.Tissue.from_values(names, field="ontology_id")
462
+ ln.save(records)
463
+ bt.Tissue(name="unknown", ontology_id="unknown").save()
406
464
  # DevelopmentalStage
407
- names = (
408
- bt.DevelopmentalStage.public().df().index if not dev_stages else dev_stages
409
- )
410
- records = bt.DevelopmentalStage.from_values(names, field="ontology_id")
411
- ln.save(records, parents=bool(dev_stages))
412
- bt.DevelopmentalStage(name="unknown", ontology_id="unknown").save()
413
-
414
- names = bt.DevelopmentalStage.public(organism="mouse").df().name
415
- bionty_source = bt.PublicSource.filter(
416
- entity="DevelopmentalStage", organism="mouse"
417
- ).one()
418
- records = [
419
- bt.DevelopmentalStage.from_public(name=i, public_source=bionty_source)
420
- for i in names.tolist()
421
- ]
422
- records[-4] = records[-4][0]
423
- ln.save(records)
465
+ if dev_stages is not None:
466
+ names = (
467
+ bt.DevelopmentalStage.public().df().index if not dev_stages else dev_stages
468
+ )
469
+ records = bt.DevelopmentalStage.from_values(names, field="ontology_id")
470
+ ln.save(records)
471
+ bt.DevelopmentalStage(name="unknown", ontology_id="unknown").save()
472
+
473
+ names = bt.DevelopmentalStage.public(organism="mouse").df().index
474
+ records = [
475
+ bt.DevelopmentalStage.from_source(ontology_id=i) for i in names.tolist()
476
+ ]
477
+ records[-4] = records[-4][0]
478
+ ln.save(records)
424
479
  # Disease
425
- names = bt.Disease.public().df().index if not diseases else diseases
426
- records = bt.Disease.from_values(names, field="ontology_id")
427
- ln.save(records, parents=bool(diseases))
428
- bt.Disease(name="normal", ontology_id="PATO:0000461").save()
429
- bt.Disease(name="unknown", ontology_id="unknown").save()
480
+ if diseases is not None:
481
+ names = bt.Disease.public().df().index if not diseases else diseases
482
+ records = bt.Disease.from_values(names, field="ontology_id")
483
+ ln.save(records)
484
+ bt.Disease(name="normal", ontology_id="PATO:0000461").save()
485
+ bt.Disease(name="unknown", ontology_id="unknown").save()
430
486
  # genes
431
- for organism in ["NCBITaxon:10090", "NCBITaxon:9606"]:
487
+ for organism in organism_names:
432
488
  # convert onto to name
433
489
  organism = bt.Organism.filter(ontology_id=organism).one().name
434
490
  names = bt.Gene.public(organism=organism).df()["ensembl_gene_id"]
@@ -474,26 +530,6 @@ def length_normalize(adata: AnnData, gene_lengths: list):
474
530
  return adata
475
531
 
476
532
 
477
- def pd_load_cached(url: str, loc: str = "/tmp/", cache: bool = True, **kwargs):
478
- """
479
- pd_load_cached downloads a file from a url and loads it as a pandas dataframe
480
-
481
- Args:
482
- url (str): the url to download the file from
483
- loc (str, optional): the location to save the file to. Defaults to "/tmp/".
484
- cache (bool, optional): whether to use the cached file or not. Defaults to True.
485
-
486
- Returns:
487
- pd.DataFrame: the dataframe
488
- """
489
- # Check if the file exists, if not, download it
490
- loc += url.split("/")[-1]
491
- if not os.path.isfile(loc) or not cache:
492
- urllib.request.urlretrieve(url, loc)
493
- # Load the data from the file
494
- return pd.read_csv(loc, **kwargs)
495
-
496
-
497
533
  def translate(
498
534
  val: Union[str, list, set, Counter, dict], t: str = "cell_type_ontology_term_id"
499
535
  ):