scdataloader 0.0.3__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scdataloader/VERSION +1 -1
- scdataloader/__init__.py +1 -1
- scdataloader/__main__.py +66 -42
- scdataloader/collator.py +136 -67
- scdataloader/config.py +112 -0
- scdataloader/data.py +160 -169
- scdataloader/datamodule.py +403 -0
- scdataloader/mapped.py +285 -109
- scdataloader/preprocess.py +240 -109
- scdataloader/utils.py +162 -70
- {scdataloader-0.0.3.dist-info → scdataloader-1.0.1.dist-info}/METADATA +87 -18
- scdataloader-1.0.1.dist-info/RECORD +16 -0
- scdataloader/dataloader.py +0 -318
- scdataloader-0.0.3.dist-info/RECORD +0 -15
- {scdataloader-0.0.3.dist-info → scdataloader-1.0.1.dist-info}/LICENSE +0 -0
- {scdataloader-0.0.3.dist-info → scdataloader-1.0.1.dist-info}/WHEEL +0 -0
- {scdataloader-0.0.3.dist-info → scdataloader-1.0.1.dist-info}/entry_points.txt +0 -0
scdataloader/utils.py
CHANGED
@@ -11,9 +11,50 @@ from django.db import IntegrityError
 from scipy.sparse import csr_matrix
 from scipy.stats import median_abs_deviation
 from functools import lru_cache
+from collections import Counter
+from torch import Tensor
+import torch
 
+from typing import Union, List, Optional
 
-def createFoldersFor(filepath):
+from anndata import AnnData
+
+
+def downsample_profile(mat: Tensor, dropout: float):
+    """
+    This function downsamples the expression profile of a given single cell RNA matrix.
+
+    The noise is applied based on the renoise parameter,
+    the total counts of the matrix, and the number of genes. The function first calculates the noise
+    threshold (scaler) based on the renoise parameter. It then generates an initial matrix count by
+    applying a Poisson distribution to a random tensor scaled by the total counts and the number of genes.
+    The function then models the sampling zeros by applying a Poisson distribution to a random tensor
+    scaled by the noise threshold, the total counts, and the number of genes. The function also models
+    the technical zeros by generating a random tensor and comparing it to the noise threshold. The final
+    matrix count is calculated by subtracting the sampling zeros from the initial matrix count and
+    multiplying by the technical zeros. The function ensures that the final matrix count is not less
+    than zero by taking the maximum of the final matrix count and a tensor of zeros. The function
+    returns the final matrix count.
+
+    Args:
+        mat (torch.Tensor): The input matrix.
+        dropout (float): The renoise parameter.
+
+    Returns:
+        torch.Tensor: The matrix count after applying noise.
+    """
+    batch = mat.shape[0]
+    ngenes = mat.shape[1]
+    dropout = dropout * 1.1
+    # we model the sampling zeros (dropping 30% of the reads)
+    res = torch.poisson((mat * (dropout / 2))).int()
+    # we model the technical zeros (dropping 50% of the genes)
+    notdrop = (torch.rand((batch, ngenes), device=mat.device) >= (dropout / 2)).int()
+    mat = (mat - res) * notdrop
+    return torch.maximum(mat, torch.zeros((1, 1), device=mat.device, dtype=torch.int))
+
+
+def createFoldersFor(filepath: str):
     """
     will recursively create folders if needed until having all the folders required to save the file in this filepath
     """
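For orientation, here is a minimal usage sketch (not part of the package) of the new `downsample_profile` helper added above, assuming it is imported from `scdataloader.utils`; it only illustrates the expected shapes and the fact that the downsampling can only reduce counts.

```python
import torch

from scdataloader.utils import downsample_profile

# hypothetical toy input: a batch of 8 cells x 300 genes of float counts
counts = torch.poisson(torch.full((8, 300), 2.0))

# drop roughly 30% of the signal as modelled by the function above
noisy = downsample_profile(counts, dropout=0.3)

assert noisy.shape == counts.shape
assert bool((noisy <= counts).all())  # downsampling never adds counts
```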
@@ -24,19 +65,22 @@ def createFoldersFor(filepath):
             os.mkdir(prevval)
 
 
-def _fetchFromServer(ensemble_server, attributes):
+def _fetchFromServer(
+    ensemble_server: str, attributes: list, database: str = "hsapiens_gene_ensembl"
+):
     """
     Fetches data from the specified ensemble server.
 
     Args:
         ensemble_server (str): The URL of the ensemble server to fetch data from.
         attributes (list): The list of attributes to fetch from the server.
+        database (str): The database to fetch data from.
 
     Returns:
         pd.DataFrame: A pandas DataFrame containing the fetched data.
     """
     server = BiomartServer(ensemble_server)
-    ensmbl = server.datasets[
+    ensmbl = server.datasets[database]
     print(attributes)
     res = pd.read_csv(
         io.StringIO(
@@ -48,11 +92,12 @@ def _fetchFromServer(ensemble_server, attributes):
 
 
 def getBiomartTable(
-    ensemble_server="http://jul2023.archive.ensembl.org/biomart",
-    useCache=False,
-    cache_folder="/tmp/biomart/",
-    attributes=[],
-    bypass_attributes=False,
+    ensemble_server: str = "http://jul2023.archive.ensembl.org/biomart",
+    useCache: bool = False,
+    cache_folder: str = "/tmp/biomart/",
+    attributes: List[str] = [],
+    bypass_attributes: bool = False,
+    database: str = "hsapiens_gene_ensembl",
 ):
     """generate a genelist dataframe from ensembl's biomart
 
@@ -60,6 +105,9 @@ def getBiomartTable(
         ensemble_server (str, optional): the biomart server. Defaults to "http://jul2023.archive.ensembl.org/biomart".
         useCache (bool, optional): whether to use the cache or not. Defaults to False.
         cache_folder (str, optional): the cache folder. Defaults to "/tmp/biomart/".
+        attributes (List[str], optional): the attributes to fetch. Defaults to [].
+        bypass_attributes (bool, optional): whether to bypass the attributes or not. Defaults to False.
+        database (str, optional): the database to fetch from. Defaults to "hsapiens_gene_ensembl".
 
     Raises:
         ValueError: should be a dataframe (when the result from the server is something else)
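As an aside, a hedged sketch of how the widened `getBiomartTable` signature might be called now that it accepts a `database` argument. The mouse dataset name below is an assumption, and the call needs network access to the Ensembl biomart server.

```python
from scdataloader.utils import getBiomartTable

# hypothetical call: fetch a mouse gene table instead of the default
# "hsapiens_gene_ensembl", requesting two extra biomart attributes
genedf = getBiomartTable(
    attributes=["start_position", "chromosome_name"],
    database="mmusculus_gene_ensembl",  # assumption: valid Ensembl dataset name
    useCache=False,
)
print(genedf.head())
```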
@@ -88,21 +136,22 @@ def getBiomartTable(
     else:
         print("downloading gene names from biomart")
 
-        res = _fetchFromServer(ensemble_server, attr + attributes)
+        res = _fetchFromServer(ensemble_server, attr + attributes, database=database)
         res.to_csv(cachefile, index=False)
 
     res.columns = attr + attributes
     if type(res) is not type(pd.DataFrame()):
         raise ValueError("should be a dataframe")
-    res = res[~(res["ensembl_gene_id"].isna()
-
-    res
-
-
+    res = res[~(res["ensembl_gene_id"].isna())]
+    if "hgnc_symbol" in res.columns:
+        res = res[res["hgnc_symbol"].isna()]
+        res.loc[res[res.hgnc_symbol.isna()].index, "hgnc_symbol"] = res[
+            res.hgnc_symbol.isna()
+        ]["ensembl_gene_id"]
     return res
 
 
-def validate(adata, organism):
+def validate(adata: AnnData, organism: str):
     """
     validate checks if the adata object is valid for lamindb
 
@@ -144,9 +193,6 @@ def validate(adata, organism):
         raise ValueError(
             f"Column '{val}' is missing in the provided anndata object."
         )
-    bionty_source = bt.PublicSource.filter(
-        entity="DevelopmentalStage", organism=organism
-    ).one()
 
     if not bt.Ethnicity.validate(
         adata.obs["self_reported_ethnicity_ontology_term_id"],
@@ -169,14 +215,10 @@ def validate(adata, organism):
         adata.obs["cell_type_ontology_term_id"], field="ontology_id"
     ).all():
         raise ValueError("Invalid cell type ontology term id found")
-    if (
-
-
-
-            field="ontology_id",
-        )
-        .all()
-    ):
+    if not bt.DevelopmentalStage.validate(
+        adata.obs["development_stage_ontology_term_id"],
+        field="ontology_id",
+    ).all():
         raise ValueError("Invalid dev stage ontology term id found")
     if not bt.Tissue.validate(
         adata.obs["tissue_ontology_term_id"], field="ontology_id"
@@ -186,18 +228,16 @@ def validate(adata, organism):
         adata.obs["assay_ontology_term_id"], field="ontology_id"
     ).all():
         raise ValueError("Invalid assay ontology term id found")
-    if (
-
-
-        .all()
-    ):
+    if not bt.Gene.validate(
+        adata.var.index, field="ensembl_gene_id", organism=organism
+    ).all():
         raise ValueError("Invalid gene ensembl id found")
     return True
 
 
 # setting a cache of 200 elements
 # @lru_cache(maxsize=200)
-def get_all_ancestors(val, df):
+def get_all_ancestors(val: str, df: pd.DataFrame):
     if val not in df.index:
         return set()
     parents = df.loc[val].parents__ontology_id
@@ -207,7 +247,17 @@ def get_all_ancestors(val, df):
     return set.union(set(parents), *[get_all_ancestors(val, df) for val in parents])
 
 
-def get_ancestry_mapping(all_elem, onto_df):
+# setting a cache of 200 elements
+# @lru_cache(maxsize=200)
+def get_descendants(val, df):
+    ontos = set(df[df.parents__ontology_id.str.contains(val)].index.tolist())
+    r_onto = set()
+    for onto in ontos:
+        r_onto |= get_descendants(onto, df)
+    return r_onto | ontos
+
+
+def get_ancestry_mapping(all_elem: list, onto_df: pd.DataFrame):
     """
     This function generates a mapping of all elements to their ancestors in the ontology dataframe.
 
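To make the ontology helpers above concrete, a small self-contained sketch of `get_all_ancestors` on a toy dataframe (`get_descendants`, added in the same hunk, walks the same column in the opposite direction). The unchanged middle of the function, not shown in this hunk, is assumed to return an empty set when a term has no parents.

```python
import pandas as pd

from scdataloader.utils import get_all_ancestors

# hypothetical three-term ontology: CL:0000003 -> CL:0000002 -> CL:0000001
toy = pd.DataFrame(
    {"parents__ontology_id": [[], ["CL:0000001"], ["CL:0000002"]]},
    index=["CL:0000001", "CL:0000002", "CL:0000003"],
)

# walks parents recursively; expected result: {"CL:0000002", "CL:0000001"}
print(get_all_ancestors("CL:0000003", toy))
```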
@@ -242,12 +292,12 @@ def get_ancestry_mapping(all_elem, onto_df):
 
 
 def load_dataset_local(
-    remote_dataset,
-    download_folder,
-    name,
-    description,
-    use_cache=True,
-    only=None,
+    remote_dataset: ln.Collection,
+    download_folder: str,
+    name: str,
+    description: str,
+    use_cache: bool = True,
+    only: Optional[List[int]] = None,
 ):
     """
     This function loads a remote lamindb dataset to local.
@@ -303,7 +353,7 @@ def load_dataset_local(
     return dataset
 
 
-def load_genes(organisms):
+def load_genes(organisms: Union[str, list] = "NCBITaxon:9606"):  # "NCBITaxon:10090",
     organismdf = []
     if type(organisms) == str:
         organisms = [organisms]
@@ -313,7 +363,7 @@ def load_genes(organisms):
         ).df()
         genesdf = genesdf[~genesdf["public_source_id"].isna()]
         genesdf = genesdf.drop_duplicates(subset="ensembl_gene_id")
-        genesdf = genesdf.set_index("ensembl_gene_id")
+        genesdf = genesdf.set_index("ensembl_gene_id").sort_index()
         # mitochondrial genes
         genesdf["mt"] = genesdf.symbol.astype(str).str.startswith("MT-")
         # ribosomal genes
@@ -326,14 +376,14 @@ def load_genes(organisms):
 
 
 def populate_my_ontology(
-    organisms=["NCBITaxon:10090", "NCBITaxon:9606"],
-    sex=["PATO:0000384", "PATO:0000383"],
-    celltypes=[],
-    ethnicities=[],
-    assays=[],
-    tissues=[],
-    diseases=[],
-    dev_stages=[],
+    organisms: List[str] = ["NCBITaxon:10090", "NCBITaxon:9606"],
+    sex: List[str] = ["PATO:0000384", "PATO:0000383"],
+    celltypes: List[str] = [],
+    ethnicities: List[str] = [],
+    assays: List[str] = [],
+    tissues: List[str] = [],
+    diseases: List[str] = [],
+    dev_stages: List[str] = [],
 ):
     """
     creates a local version of the lamin ontologies and add the required missing values in base ontologies
@@ -360,20 +410,20 @@ def populate_my_ontology(
         dev_stages (list, optional): List of developmental stages. Defaults to [].
     """
 
-    names = bt.CellType.
+    names = bt.CellType.public().df().index if not celltypes else celltypes
     records = bt.CellType.from_values(names, field="ontology_id")
-    ln.save(records)
+    ln.save(records, parents=bool(celltypes))
     bt.CellType(name="unknown", ontology_id="unknown").save()
     # Organism
-    names = bt.Organism.
+    names = bt.Organism.public().df().index if not organisms else organisms
     records = [
         i[0] if type(i) is list else i
         for i in [bt.Organism.from_public(ontology_id=i) for i in names]
     ]
-    ln.save(records)
+    ln.save(records, parents=bool(organisms))
     bt.Organism(name="unknown", ontology_id="unknown").save()
     # Phenotype
-    names = bt.Phenotype.
+    names = bt.Phenotype.public().df().index if not sex else sex
     records = [
         bt.Phenotype.from_public(
             ontology_id=i,
@@ -383,38 +433,47 @@ def populate_my_ontology(
         )
         for i in names
     ]
-    ln.save(records)
+    ln.save(records, parents=bool(sex))
     bt.Phenotype(name="unknown", ontology_id="unknown").save()
     # ethnicity
-    names = bt.Ethnicity.
+    names = bt.Ethnicity.public().df().index if not ethnicities else ethnicities
     records = bt.Ethnicity.from_values(names, field="ontology_id")
-    ln.save(records)
+    ln.save(records, parents=bool(ethnicities))
     bt.Ethnicity(
         name="unknown", ontology_id="unknown"
     ).save()  # multi ethnic will have to get renamed
     # ExperimentalFactor
-    names = bt.ExperimentalFactor.
+    names = bt.ExperimentalFactor.public().df().index if not assays else assays
     records = bt.ExperimentalFactor.from_values(names, field="ontology_id")
-    ln.save(records)
+    ln.save(records, parents=bool(assays))
     bt.ExperimentalFactor(name="unknown", ontology_id="unknown").save()
     # lookup = bt.ExperimentalFactor.lookup()
     # lookup.smart_seq_v4.parents.add(lookup.smart_like)
     # Tissue
-    names = bt.Tissue.
+    names = bt.Tissue.public().df().index if not tissues else tissues
     records = bt.Tissue.from_values(names, field="ontology_id")
-    ln.save(records)
+    ln.save(records, parents=bool(tissues))
     bt.Tissue(name="unknown", ontology_id="unknown").save()
     # DevelopmentalStage
-    names = (
-        bt.DevelopmentalStage.from_public().df().index if not dev_stages else dev_stages
-    )
+    names = bt.DevelopmentalStage.public().df().index if not dev_stages else dev_stages
     records = bt.DevelopmentalStage.from_values(names, field="ontology_id")
-    ln.save(records)
+    ln.save(records, parents=bool(dev_stages))
     bt.DevelopmentalStage(name="unknown", ontology_id="unknown").save()
+
+    names = bt.DevelopmentalStage.public(organism="mouse").df().name
+    bionty_source = bt.PublicSource.filter(
+        entity="DevelopmentalStage", organism="mouse"
+    ).one()
+    records = [
+        bt.DevelopmentalStage.from_public(name=i, public_source=bionty_source)
+        for i in names.tolist()
+    ]
+    records[-4] = records[-4][0]
+    ln.save(records)
     # Disease
-    names = bt.Disease.
+    names = bt.Disease.public().df().index if not diseases else diseases
     records = bt.Disease.from_values(names, field="ontology_id")
-    ln.save(records)
+    ln.save(records, parents=bool(diseases))
     bt.Disease(name="normal", ontology_id="PATO:0000461").save()
     bt.Disease(name="unknown", ontology_id="unknown").save()
     # genes
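For reference, a hedged sketch of calling `populate_my_ontology` with the kind of arguments shown in the new defaults above. It assumes an initialized lamindb instance with the bionty schema; non-empty lists mean "only these terms", while empty lists fall back to the full public ontology.

```python
from scdataloader.utils import populate_my_ontology

# hypothetical setup call: restrict organisms and sex terms, load everything
# else from the public ontologies (empty list -> full public table)
populate_my_ontology(
    organisms=["NCBITaxon:10090", "NCBITaxon:9606"],
    sex=["PATO:0000384", "PATO:0000383"],
    celltypes=[],
    tissues=[],
)
```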
@@ -430,7 +489,7 @@ def populate_my_ontology(
     ln.save(records)
 
 
-def is_outlier(adata, metric: str, nmads: int):
+def is_outlier(adata: AnnData, metric: str, nmads: int):
     """
     is_outlier detects outliers in adata.obs[metric]
 
@@ -449,7 +508,7 @@ def is_outlier(adata, metric: str, nmads: int):
     return outlier
 
 
-def length_normalize(adata, gene_lengths):
+def length_normalize(adata: AnnData, gene_lengths: list):
     """
     length_normalize normalizes the counts by the gene length
 
@@ -464,7 +523,7 @@ def length_normalize(adata, gene_lengths):
     return adata
 
 
-def pd_load_cached(url, loc="/tmp/", cache=True, **kwargs):
+def pd_load_cached(url: str, loc: str = "/tmp/", cache: bool = True, **kwargs):
     """
     pd_load_cached downloads a file from a url and loads it as a pandas dataframe
 
@@ -482,3 +541,36 @@ def pd_load_cached(url, loc="/tmp/", cache=True, **kwargs):
         urllib.request.urlretrieve(url, loc)
     # Load the data from the file
     return pd.read_csv(loc, **kwargs)
+
+
+def translate(
+    val: Union[str, list, set, Counter, dict], t: str = "cell_type_ontology_term_id"
+):
+    """
+    translate translates the ontology term id to the name
+
+    Args:
+        val (str, dict, set, list, dict): the object to translate
+        t (flat, optional): the type of ontology terms.
+            one of cell_type_ontology_term_id, assay_ontology_term_id, tissue_ontology_term_id.
+            Defaults to "cell_type_ontology_term_id".
+
+    Returns:
+        dict: the mapping for the translation
+    """
+    if t == "cell_type_ontology_term_id":
+        obj = bt.CellType.public(organism="all")
+    elif t == "assay_ontology_term_id":
+        obj = bt.ExperimentalFactor.public()
+    elif t == "tissue_ontology_term_id":
+        obj = bt.Tissue.public()
+    else:
+        return None
+    if type(val) is str:
+        return {val: obj.search(val, field=obj.ontology_id).name.iloc[0]}
+    elif type(val) is list or type(val) is set:
+        return {i: obj.search(i, field=obj.ontology_id).name.iloc[0] for i in set(val)}
+    elif type(val) is dict or type(val) is Counter:
+        return {
+            obj.search(k, field=obj.ontology_id).name.iloc[0]: v for k, v in val.items()
+        }
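And a short sketch of the new `translate` helper; the ontology ids and the readable names shown below are only an example, and the call assumes a configured bionty instance so that `bt.CellType.public()` is available.

```python
from collections import Counter

from scdataloader.utils import translate

# hypothetical cell-type tally keyed by ontology term id
counts = Counter({"CL:0000236": 120, "CL:0000057": 48})

# maps names onto the tallied values, e.g. {"B cell": 120, "fibroblast": 48}
print(translate(counts, t="cell_type_ontology_term_id"))
```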
{scdataloader-0.0.3.dist-info → scdataloader-1.0.1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: scdataloader
-Version: 0.0.3
+Version: 1.0.1
 Summary: a dataloader for single cell data in lamindb
 Home-page: https://github.com/jkobject/scDataLoader
 License: GPL3
@@ -34,12 +34,16 @@ Description-Content-Type: text/markdown
 
 [](https://codecov.io/gh/jkobject/scDataLoader)
 [](https://github.com/jkobject/scDataLoader/actions/workflows/main.yml)
+[](https://badge.fury.io/py/scDataLoader)
+[](https://scDataLoader.readthedocs.io/en/latest/?badge=latest)
+[](https://pepy.tech/project/scDataLoader)
+[](https://pepy.tech/project/scDataLoader)
+[](https://pepy.tech/project/scDataLoader)
+[](https://img.shields.io/github/issues/jkobject/scDataLoader)
+[](https://github.com/psf/black)
+[](https://doi.org/10.1101/2024.07.29.605556)
 
-
-
-built on top of `lamindb` and the `.mapped()` function by Sergey: https://github.com/Koncopd
-
-This data loader is designed to be used with:
+This single cell pytorch dataloader / lighting datamodule is designed to be used with:
 
 - [lamindb](https://lamin.ai/)
 
@@ -55,18 +59,13 @@ It allows you to:
 3. create a more complex single cell dataset
 4. extend it to your need
 
-
-
-the idea is to use it to train models like scGPT / GeneFormer (and soon, scPrint ;)). It is:
+built on top of `lamindb` and the `.mapped()` function by Sergey: https://github.com/Koncopd
 
-
-2. doing some dataset specific preprocessing if needed
-3. creating a dataset object on top of .mapped() (that is needed for mapping genes, cell labels etc..)
-4. passing it to a dataloader object that can work with it correctly
+## More
 
-
+I needed to create this Data Loader for my PhD project. I am using it to load & preprocess thousands of datasets containing millions of cells in a few seconds. I believed that individuals employing AI for single-cell RNA sequencing and other sequencing datasets would eagerly utilize and desire such a tool, which presently does not exist.
 
-
+
 
 ## Install it from PyPI
 
@@ -85,15 +84,85 @@ then run the notebooks with the poetry installed environment
 
 ## Usage
 
-
+### Direct Usage
+
+```python
+# initialize a local lamin database
+# !lamin init --storage ~/scdataloader --schema bionty
+
+from scdataloader import utils
+from scdataloader.preprocess import LaminPreprocessor, additional_postprocess, additional_preprocess
+
+# preprocess datasets
+DESCRIPTION='preprocessed by scDataLoader'
+
+cx_dataset = ln.Collection.using(instance="laminlabs/cellxgene").filter(name="cellxgene-census", version='2023-12-15').one()
+cx_dataset, len(cx_dataset.artifacts.all())
 
-
-
+
+do_preprocess = LaminPreprocessor(additional_postprocess=additional_postprocess, additional_preprocess=additional_preprocess, skip_validate=True, subset_hvg=0)
+
+preprocessed_dataset = do_preprocess(cx_dataset, name=DESCRIPTION, description=DESCRIPTION, start_at=6, version="2")
+
+# create dataloaders
+from scdataloader import DataModule
+import tqdm
+
+datamodule = DataModule(
+    collection_name="preprocessed dataset",
+    organisms=["NCBITaxon:9606"], #organism that we will work on
+    how="most expr", # for the collator (most expr genes only will be selected)
+    max_len=1000, # only the 1000 most expressed
+    batch_size=64,
+    num_workers=1,
+    validation_split=0.1,
+    test_split=0)
+
+for i in tqdm.tqdm(datamodule.train_dataloader()):
+    # pass #or do pass
+    print(i)
+    break
+
+# with lightning:
+# Trainer(model, datamodule)
+
+```
+
+see the notebooks in [docs](https://www.jkobject.com/scDataLoader/):
+
+1. [load a dataset](https://www.jkobject.com/scDataLoader/notebooks/1_download_and_preprocess/)
+2. [create a dataset](https://www.jkobject.com/scDataLoader/notebooks/2_create_dataloader/)
+
+### command line preprocessing
+
+You can use the command line to preprocess a large database of datasets like here for cellxgene. this allows parallelizing and easier usage.
+
+```bash
+scdataloader --instance "laminlabs/cellxgene" --name "cellxgene-census" --version "2023-12-15" --description "preprocessed for scprint" --new_name "scprint main" --start_at 10 >> scdataloader.out
+```
+
+### command line usage
+
+The main way to use
+
+> please refer to the [scPRINT documentation](https://www.jkobject.com/scPRINT/) and [lightning documentation](https://lightning.ai/docs/pytorch/stable/cli/lightning_cli_intermediate.html) for more information on command line usage
 
 ## Development
 
 Read the [CONTRIBUTING.md](CONTRIBUTING.md) file.
 
+## License
+
+This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+
+## Acknowledgments
+
+- [lamin.ai](https://lamin.ai/)
+- [scanpy](https://scanpy.readthedocs.io/en/stable/)
+- [anndata](https://anndata.readthedocs.io/en/latest/)
+- [scprint](https://www.jkobject.com/scPRINT/)
+
+Awesome single cell dataloader created by @jkobject
 GNU GENERAL PUBLIC LICENSE
 Version 3, 29 June 2007
 
scdataloader-1.0.1.dist-info/RECORD
ADDED
@@ -0,0 +1,16 @@
+scdataloader/VERSION,sha256=WYVJhIUxBN9cNT4vaBoV_HkkdC-aLkaMKa8kjc5FzgM,6
+scdataloader/__init__.py,sha256=NIlE4oTUPRZ3uSW_maozoEHp470I7PV1vMOJ4XpSmL4,122
+scdataloader/__main__.py,sha256=db_upDq3tNEtcDH17mPIczToAqGkSKfLy0Qbj6B4YmE,6385
+scdataloader/base.py,sha256=M1gD59OffRdLOgS1vHKygOomUoAMuzjpRtAfM3SBKF8,338
+scdataloader/collator.py,sha256=zkFdxirTDub1dJ1OJXO0p48kvd2r2ncKMdevAKIdTTc,13447
+scdataloader/config.py,sha256=rrW2DZxG4J2_pmpDbXXsaKJkpNC57w5dIlItiFbANYw,2905
+scdataloader/data.py,sha256=VugtHo9T9PqoJSv3lkJJAB89KD-fRwdVw1D76gnCc9c,12584
+scdataloader/datamodule.py,sha256=WLEWcDMcC1G3VD5tORfhfqRRHcTscpI0EzPikg3udbI,16881
+scdataloader/mapped.py,sha256=yF9l3obuRWbQjW8QZGRSKhc50fizXTWf3Pe1m542fW8,19481
+scdataloader/preprocess.py,sha256=noynYWuy9clhFu9UnN-vSvAHJHwakDttkI5aj1e_T98,29055
+scdataloader/utils.py,sha256=xyDsWaqkjhzlVBP8FiYdBUWHsel3twcVWmI53PhKqTM,21888
+scdataloader-1.0.1.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+scdataloader-1.0.1.dist-info/METADATA,sha256=2Xd8M1dq_JmvmFjmrrzn-1U4eOtwU6L51Y_7MCkGxvY,41327
+scdataloader-1.0.1.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
+scdataloader-1.0.1.dist-info/entry_points.txt,sha256=nLqucZaa5wiF7-1FCgMXO916WDQ9Qm0TcxQp0f1DwE4,59
+scdataloader-1.0.1.dist-info/RECORD,,