scdataloader 0.0.2__py3-none-any.whl → 0.0.4__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- scdataloader/VERSION +1 -1
- scdataloader/__init__.py +4 -0
- scdataloader/__main__.py +209 -0
- scdataloader/collator.py +307 -0
- scdataloader/config.py +106 -0
- scdataloader/data.py +181 -218
- scdataloader/datamodule.py +375 -0
- scdataloader/mapped.py +46 -32
- scdataloader/preprocess.py +524 -208
- scdataloader/utils.py +189 -123
- {scdataloader-0.0.2.dist-info → scdataloader-0.0.4.dist-info}/METADATA +77 -7
- scdataloader-0.0.4.dist-info/RECORD +16 -0
- {scdataloader-0.0.2.dist-info → scdataloader-0.0.4.dist-info}/WHEEL +1 -1
- scdataloader-0.0.2.dist-info/RECORD +0 -12
- {scdataloader-0.0.2.dist-info → scdataloader-0.0.4.dist-info}/LICENSE +0 -0
- {scdataloader-0.0.2.dist-info → scdataloader-0.0.4.dist-info}/entry_points.txt +0 -0
scdataloader/utils.py
CHANGED
```diff
@@ -10,9 +10,15 @@ from biomart import BiomartServer
 from django.db import IntegrityError
 from scipy.sparse import csr_matrix
 from scipy.stats import median_abs_deviation
+from functools import lru_cache
+from collections import Counter
 
+from typing import Union, List, Optional
 
-def createFoldersFor(filepath):
+from anndata import AnnData
+
+
+def createFoldersFor(filepath: str):
     """
     will recursively create folders if needed until having all the folders required to save the file in this filepath
     """
```
```diff
@@ -23,15 +29,25 @@ def createFoldersFor(filepath)
            os.mkdir(prevval)
 
 
-def _fetchFromServer(ensemble_server, attributes):
+def _fetchFromServer(
+    ensemble_server: str, attributes: list, database: str = "hsapiens_gene_ensembl"
+):
+    """
+    Fetches data from the specified ensemble server.
+
+    Args:
+        ensemble_server (str): The URL of the ensemble server to fetch data from.
+        attributes (list): The list of attributes to fetch from the server.
+
+    Returns:
+        pd.DataFrame: A pandas DataFrame containing the fetched data.
+    """
     server = BiomartServer(ensemble_server)
-    ensmbl = server.datasets["hsapiens_gene_ensembl"]
+    ensmbl = server.datasets[database]
     print(attributes)
     res = pd.read_csv(
         io.StringIO(
-            ensmbl.search(
-                {"attributes": attributes}, header=1
-            ).content.decode()
+            ensmbl.search({"attributes": attributes}, header=1).content.decode()
         ),
         sep="\t",
     )
```
```diff
@@ -39,11 +55,12 @@ def _fetchFromServer(ensemble_server, attributes)
 
 
 def getBiomartTable(
-    ensemble_server="http://jul2023.archive.ensembl.org/biomart",
-    useCache=False,
-    cache_folder="/tmp/biomart/",
-    attributes=[],
-    bypass_attributes=False,
+    ensemble_server: str = "http://jul2023.archive.ensembl.org/biomart",
+    useCache: bool = False,
+    cache_folder: str = "/tmp/biomart/",
+    attributes: List[str] = [],
+    bypass_attributes: bool = False,
+    database: str = "hsapiens_gene_ensembl",
 ):
     """generate a genelist dataframe from ensembl's biomart
 
```
```diff
@@ -79,7 +96,7 @@ def getBiomartTable(
     else:
         print("downloading gene names from biomart")
 
-        res = _fetchFromServer(ensemble_server, attr + attributes)
+        res = _fetchFromServer(ensemble_server, attr + attributes, database=database)
     res.to_csv(cachefile, index=False)
 
     res.columns = attr + attributes
```
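The new `database` argument threads from `getBiomartTable` down into `_fetchFromServer`, so the previously hardcoded `hsapiens_gene_ensembl` dataset can be swapped out. A minimal sketch of what that enables (the mouse dataset name follows standard Biomart naming and is not taken from this diff):

```python
from scdataloader.utils import getBiomartTable

# default behaviour is unchanged: human genes from the pinned jul2023 Ensembl archive
human = getBiomartTable(useCache=True)

# new in 0.0.4: query another Biomart dataset with the same helper,
# e.g. mouse (dataset name per Biomart convention, not from this diff)
mouse = getBiomartTable(database="mmusculus_gene_ensembl", useCache=True)
```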
```diff
@@ -93,7 +110,7 @@ def getBiomartTable(
     return res
 
 
-def validate(adata, lb, organism):
+def validate(adata: AnnData, organism: str):
     """
     validate checks if the adata object is valid for lamindb
 
```
```diff
@@ -116,8 +133,7 @@ def validate(adata, lb, organism)
     Returns:
         bool: True if the adata object is valid
     """
-    organism = lb.Organism.filter(ontology_id=organism).one().name
-    lb.settings.organism = organism
+    organism = bt.Organism.filter(ontology_id=organism).one().name
 
     if adata.var.index.duplicated().any():
         raise ValueError("Duplicate gene names found in adata.var.index")
```
```diff
@@ -136,70 +152,61 @@ def validate(adata, lb, organism)
         raise ValueError(
             f"Column '{val}' is missing in the provided anndata object."
         )
-    bionty_source = lb.BiontySource.filter(
-        entity="DevelopmentalStage", organism=organism
-    ).one()
 
-    if not lb.Ethnicity.validate(
+    if not bt.Ethnicity.validate(
         adata.obs["self_reported_ethnicity_ontology_term_id"],
         field="ontology_id",
     ).all():
         raise ValueError("Invalid ethnicity ontology term id found")
-    if not lb.Organism.validate(
+    if not bt.Organism.validate(
         adata.obs["organism_ontology_term_id"], field="ontology_id"
     ).all():
         raise ValueError("Invalid organism ontology term id found")
-    if not lb.Phenotype.validate(
+    if not bt.Phenotype.validate(
         adata.obs["sex_ontology_term_id"], field="ontology_id"
     ).all():
         raise ValueError("Invalid sex ontology term id found")
-    if not lb.Disease.validate(
+    if not bt.Disease.validate(
         adata.obs["disease_ontology_term_id"], field="ontology_id"
     ).all():
         raise ValueError("Invalid disease ontology term id found")
-    if not lb.CellType.validate(
+    if not bt.CellType.validate(
         adata.obs["cell_type_ontology_term_id"], field="ontology_id"
     ).all():
         raise ValueError("Invalid cell type ontology term id found")
-    if (
-        not lb.DevelopmentalStage.validate(
-            adata.obs["development_stage_ontology_term_id"],
-            bionty_source=bionty_source,
-            field="ontology_id",
-        )
-        .all()
-    ):
+    if not bt.DevelopmentalStage.validate(
+        adata.obs["development_stage_ontology_term_id"],
+        field="ontology_id",
+    ).all():
         raise ValueError("Invalid dev stage ontology term id found")
-    if not lb.Tissue.validate(
+    if not bt.Tissue.validate(
         adata.obs["tissue_ontology_term_id"], field="ontology_id"
     ).all():
         raise ValueError("Invalid tissue ontology term id found")
-    if not lb.ExperimentalFactor.validate(
+    if not bt.ExperimentalFactor.validate(
         adata.obs["assay_ontology_term_id"], field="ontology_id"
     ).all():
         raise ValueError("Invalid assay ontology term id found")
-    if (
-        not lb.Gene.validate(
-            adata.var.index, field="ensembl_gene_id", organism=organism
-        ).all()
-    ):
+    if not bt.Gene.validate(
+        adata.var.index, field="ensembl_gene_id", organism=organism
+    ).all():
         raise ValueError("Invalid gene ensembl id found")
     return True
 
 
-def get_all_ancestors(val, df):
+# setting a cache of 200 elements
+# @lru_cache(maxsize=200)
+def get_all_ancestors(val: str, df: pd.DataFrame):
     if val not in df.index:
         return set()
     parents = df.loc[val].parents__ontology_id
     if parents is None or len(parents) == 0:
         return set()
     else:
-        return set.union(
-            set(parents), *[get_all_ancestors(val, df) for val in parents]
-        )
+        return set.union(set(parents), *[get_all_ancestors(val, df) for val in parents])
 
 
-def get_ancestry_mapping(all_elem, onto_df):
+def get_ancestry_mapping(all_elem: list, onto_df: pd.DataFrame):
     """
     This function generates a mapping of all elements to their ancestors in the ontology dataframe.
 
```
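`validate` no longer takes an `lb` (lnschema-bionty) handle: the registries are reached through `bt` (bionty) internally, and the organism is passed as an ontology id that the function resolves to a name itself. A minimal usage sketch (the h5ad path is hypothetical):

```python
import anndata as ad
from scdataloader.utils import validate

adata = ad.read_h5ad("my_dataset.h5ad")  # hypothetical file

# raises ValueError on the first invalid ontology term or duplicated gene id,
# returns True otherwise
validate(adata, organism="NCBITaxon:9606")
```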
```diff
@@ -234,13 +241,12 @@ def get_ancestry_mapping(all_elem, onto_df)
 
 
 def load_dataset_local(
-    remote_dataset,
-    download_folder,
-    name,
-    description,
-    use_cache=True,
-    only=None,
+    remote_dataset: ln.Collection,
+    download_folder: str,
+    name: str,
+    description: str,
+    use_cache: bool = True,
+    only: Optional[List[int]] = None,
 ):
     """
     This function loads a remote lamindb dataset to local.
```
```diff
@@ -258,9 +264,7 @@ def load_dataset_local(
         lamindb.Dataset: The local dataset.
     """
     saved_files = []
-    default_storage = ln.Storage.filter(
-        root=ln.settings.storage.as_posix()
-    ).one()
+    default_storage = ln.Storage.filter(root=ln.settings.storage.as_posix()).one()
     files = (
         remote_dataset.artifacts.all()
         if not only
```
```diff
@@ -275,17 +279,15 @@ def load_dataset_local(
         if len(organism) == 0:
             print("No organism detected")
             continue
-        organism = lb.Organism.filter(ontology_id=organism[0]).one().name
-        # lb.settings.organism = organism
+        organism = bt.Organism.filter(ontology_id=organism[0]).one().name
+        # bt.settings.organism = organism
         path = file.path
         try:
             file.save()
         except IntegrityError:
             print(f"File {file.key} already exists in storage")
         # if location already has a file, don't save again
-        if use_cache and os.path.exists(
-            os.path.expanduser(download_folder + file.key)
-        ):
+        if use_cache and os.path.exists(os.path.expanduser(download_folder + file.key)):
             print(f"File {file.key} already exists in storage")
         else:
             path.download_to(download_folder + file.key)
```
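With the new type hints, `load_dataset_local` expects a lamindb `Collection` plus download metadata, and `only` restricts which artifacts get pulled. A sketch under those signatures (the collection filter mirrors the README's cellxgene example; the target folder and names are illustrative):

```python
import lamindb as ln
from scdataloader.utils import load_dataset_local

# illustrative remote collection, following the README's cellxgene example
remote = ln.Collection.using(instance="laminlabs/cellxgene").filter(
    name="cellxgene-census"
).first()

local = load_dataset_local(
    remote_dataset=remote,
    download_folder="~/scdataloader/",  # hypothetical target folder
    name="census-local",
    description="local copy for training",
    use_cache=True,
    only=[0, 1],  # Optional[List[int]]: only download the first two artifacts
)
```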
```diff
@@ -295,32 +297,53 @@ def load_dataset_local(
         except IntegrityError:
             print(f"File {file.key} already exists in storage")
         saved_files.append(file)
-    dataset = ln.Dataset(saved_files, name=name, description=description)
+    dataset = ln.Collection(saved_files, name=name, description=description)
     dataset.save()
     return dataset
 
 
+def load_genes(organisms: Union[str, list] = "NCBITaxon:9606"):  # "NCBITaxon:10090",
+    organismdf = []
+    if type(organisms) == str:
+        organisms = [organisms]
+    for organism in organisms:
+        genesdf = bt.Gene.filter(
+            organism_id=bt.Organism.filter(ontology_id=organism).first().id
+        ).df()
+        genesdf = genesdf[~genesdf["public_source_id"].isna()]
+        genesdf = genesdf.drop_duplicates(subset="ensembl_gene_id")
+        genesdf = genesdf.set_index("ensembl_gene_id").sort_index()
+        # mitochondrial genes
+        genesdf["mt"] = genesdf.symbol.astype(str).str.startswith("MT-")
+        # ribosomal genes
+        genesdf["ribo"] = genesdf.symbol.astype(str).str.startswith(("RPS", "RPL"))
+        # hemoglobin genes.
+        genesdf["hb"] = genesdf.symbol.astype(str).str.contains(("^HB[^(P)]"))
+        genesdf["organism"] = organism
+        organismdf.append(genesdf)
+    return pd.concat(organismdf)
+
+
 def populate_my_ontology(
-    organisms=["NCBITaxon:10090", "NCBITaxon:9606"],
-    sex=["PATO:0000384", "PATO:0000383"],
-    celltypes=[],
-    ethnicities=[],
-    assays=[],
-    tissues=[],
-    diseases=[],
-    dev_stages=[],
+    organisms: List[str] = ["NCBITaxon:10090", "NCBITaxon:9606"],
+    sex: List[str] = ["PATO:0000384", "PATO:0000383"],
+    celltypes: List[str] = [],
+    ethnicities: List[str] = [],
+    assays: List[str] = [],
+    tissues: List[str] = [],
+    diseases: List[str] = [],
+    dev_stages: List[str] = [],
 ):
     """
     creates a local version of the lamin ontologies and add the required missing values in base ontologies
 
     run this function just one for each new lamin storage
 
-    erase everything with lb.$ontology.filter().delete()
+    erase everything with bt.$ontology.filter().delete()
 
     add whatever value you need afterward like it is done here with:
 
-    `lb.$ontology(name="ddd", ontology_id="ddddd").save()`
+    `bt.$ontology(name="ddd", ontology_id="ddddd").save()`
 
     `df["assay_ontology_term_id"].unique()`
 
```
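`load_genes` is new in 0.0.4: it reads the registered genes for each organism from the bionty `Gene` registry, dedupes them by `ensembl_gene_id`, and flags mitochondrial, ribosomal, and hemoglobin genes. A short sketch (assumes the registry was already populated, e.g. by `populate_my_ontology` below):

```python
from scdataloader.utils import load_genes

# one dataframe indexed by ensembl_gene_id, with mt/ribo/hb boolean columns
genesdf = load_genes(organisms=["NCBITaxon:9606", "NCBITaxon:10090"])

# e.g. drop mitochondrial genes before building a gene vocabulary
genesdf = genesdf[~genesdf["mt"]]
```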
```diff
@@ -336,78 +359,88 @@ def populate_my_ontology(
         dev_stages (list, optional): List of developmental stages. Defaults to [].
     """
 
-    names = bt.CellType().df().index if not celltypes else celltypes
-    records = lb.CellType.from_values(names, field=lb.CellType.ontology_id)
-    ln.save(records)
-    lb.CellType(name="unknown", ontology_id="unknown").save()
+    names = bt.CellType.public().df().index if not celltypes else celltypes
+    records = bt.CellType.from_values(names, field="ontology_id")
+    ln.save(records, parents=bool(celltypes))
+    bt.CellType(name="unknown", ontology_id="unknown").save()
     # Organism
-    names = bt.Organism().df().index if not organisms else organisms
-    records = lb.Organism.from_values(names, field=lb.Organism.ontology_id)
-    ln.save(records)
-    lb.Organism(name="unknown", ontology_id="unknown").save()
+    names = bt.Organism.public().df().index if not organisms else organisms
+    records = [
+        i[0] if type(i) is list else i
+        for i in [bt.Organism.from_public(ontology_id=i) for i in names]
+    ]
+    ln.save(records, parents=bool(organisms))
+    bt.Organism(name="unknown", ontology_id="unknown").save()
     # Phenotype
-    names = bt.Phenotype().df().index if not sex else sex
-    records = lb.Phenotype.from_values(
-        names,
-        field=lb.Phenotype.ontology_id,
-        bionty_source=lb.BiontySource.filter(
-            entity="Phenotype", source="pato"
-        ).one(),
-    )
-    ln.save(records)
-    lb.Phenotype(name="unknown", ontology_id="unknown").save()
+    names = bt.Phenotype.public().df().index if not sex else sex
+    records = [
+        bt.Phenotype.from_public(
+            ontology_id=i,
+            public_source=bt.PublicSource.filter(
+                entity="Phenotype", source="pato"
+            ).one(),
+        )
+        for i in names
+    ]
+    ln.save(records, parents=bool(sex))
+    bt.Phenotype(name="unknown", ontology_id="unknown").save()
     # ethnicity
-    names = bt.Ethnicity().df().index if not ethnicities else ethnicities
-    records = lb.Ethnicity.from_values(names, field=lb.Ethnicity.ontology_id)
-    ln.save(records)
-    lb.Ethnicity(
+    names = bt.Ethnicity.public().df().index if not ethnicities else ethnicities
+    records = bt.Ethnicity.from_values(names, field="ontology_id")
+    ln.save(records, parents=bool(ethnicities))
+    bt.Ethnicity(
         name="unknown", ontology_id="unknown"
     ).save()  # multi ethnic will have to get renamed
     # ExperimentalFactor
-    names = bt.ExperimentalFactor().df().index if not assays else assays
-    records = lb.ExperimentalFactor.from_values(
-        names, field=lb.ExperimentalFactor.ontology_id
-    )
-    ln.save(records)
-    lb.ExperimentalFactor(name="unknown", ontology_id="unknown").save()
-    # lookup = lb.ExperimentalFactor.lookup()
+    names = bt.ExperimentalFactor.public().df().index if not assays else assays
+    records = bt.ExperimentalFactor.from_values(names, field="ontology_id")
+    ln.save(records, parents=bool(assays))
+    bt.ExperimentalFactor(name="unknown", ontology_id="unknown").save()
+    # lookup = bt.ExperimentalFactor.lookup()
     # lookup.smart_seq_v4.parents.add(lookup.smart_like)
     # Tissue
-    names = bt.Tissue().df().index if not tissues else tissues
-    records = lb.Tissue.from_values(names, field=lb.Tissue.ontology_id)
-    ln.save(records)
-    lb.Tissue(name="unknown", ontology_id="unknown").save()
+    names = bt.Tissue.public().df().index if not tissues else tissues
+    records = bt.Tissue.from_values(names, field="ontology_id")
+    ln.save(records, parents=bool(tissues))
+    bt.Tissue(name="unknown", ontology_id="unknown").save()
     # DevelopmentalStage
     names = (
-        bt.DevelopmentalStage().df().index if not dev_stages else dev_stages
-    )
-    records = lb.DevelopmentalStage.from_values(
-        names, field=lb.DevelopmentalStage.ontology_id
+        bt.DevelopmentalStage.public().df().index if not dev_stages else dev_stages
     )
+    records = bt.DevelopmentalStage.from_values(names, field="ontology_id")
+    ln.save(records, parents=bool(dev_stages))
+    bt.DevelopmentalStage(name="unknown", ontology_id="unknown").save()
+
+    names = bt.DevelopmentalStage.public(organism="mouse").df().name
+    bionty_source = bt.PublicSource.filter(
+        entity="DevelopmentalStage", organism="mouse"
+    ).one()
+    records = [
+        bt.DevelopmentalStage.from_public(name=i, public_source=bionty_source)
+        for i in names.tolist()
+    ]
+    records[-4] = records[-4][0]
     ln.save(records)
-    lb.DevelopmentalStage(name="unknown", ontology_id="unknown").save()
     # Disease
-    names = bt.Disease().df().index if not diseases else diseases
-    records = lb.Disease.from_values(names, field=lb.Disease.ontology_id)
-    ln.save(records)
-    lb.Disease(name="normal", ontology_id="PATO:0000461").save()
-    lb.Disease(name="unknown", ontology_id="unknown").save()
+    names = bt.Disease.public().df().index if not diseases else diseases
+    records = bt.Disease.from_values(names, field="ontology_id")
+    ln.save(records, parents=bool(diseases))
+    bt.Disease(name="normal", ontology_id="PATO:0000461").save()
+    bt.Disease(name="unknown", ontology_id="unknown").save()
     # genes
-    for organism in
+    for organism in ["NCBITaxon:10090", "NCBITaxon:9606"]:
         # convert onto to name
-        organism = lb.Organism.filter(ontology_id=organism).one().name
-        names = bt.Gene(organism=organism).df()["ensembl_gene_id"]
-        records = lb.Gene.from_values(
+        organism = bt.Organism.filter(ontology_id=organism).one().name
+        names = bt.Gene.public(organism=organism).df()["ensembl_gene_id"]
+        records = bt.Gene.from_values(
             names,
             field="ensembl_gene_id",
-            bionty_source=lb.BiontySource.filter(
-                entity="Gene", organism=organism
-            ).first(),
+            organism=organism,
         )
         ln.save(records)
 
 
-def is_outlier(adata, metric: str, nmads: int):
+def is_outlier(adata: AnnData, metric: str, nmads: int):
     """
     is_outlier detects outliers in adata.obs[metric]
 
```
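Per its docstring, `populate_my_ontology` is a one-off bootstrap for a fresh lamin instance: passing explicit id lists restricts what gets registered, while empty lists pull the whole public ontology. A sketch of a minimal human-only setup (the restricted lists are illustrative):

```python
from scdataloader.utils import populate_my_ontology

# run once after `lamin init --storage ... --schema bionty`
populate_my_ontology(
    organisms=["NCBITaxon:9606"],
    sex=["PATO:0000384", "PATO:0000383"],
    celltypes=[],  # empty -> register every cell type from the public ontology
)
```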
```diff
@@ -426,7 +459,7 @@ def is_outlier(adata, metric: str, nmads: int)
     return outlier
 
 
-def length_normalize(adata, gene_lengths):
+def length_normalize(adata: AnnData, gene_lengths: list):
     """
     length_normalize normalizes the counts by the gene length
 
```
```diff
@@ -441,7 +474,7 @@ def length_normalize(adata, gene_lengths)
     return adata
 
 
-def pd_load_cached(url, loc="/tmp/", cache=True, **kwargs):
+def pd_load_cached(url: str, loc: str = "/tmp/", cache: bool = True, **kwargs):
     """
     pd_load_cached downloads a file from a url and loads it as a pandas dataframe
 
```
```diff
@@ -459,3 +492,36 @@ def pd_load_cached(url, loc="/tmp/", cache=True, **kwargs)
         urllib.request.urlretrieve(url, loc)
     # Load the data from the file
     return pd.read_csv(loc, **kwargs)
+
+
+def translate(
+    val: Union[str, list, set, Counter, dict], t: str = "cell_type_ontology_term_id"
+):
+    """
+    translate translates the ontology term id to the name
+
+    Args:
+        val (str, dict, set, list, dict): the object to translate
+        t (flat, optional): the type of ontology terms.
+            one of cell_type_ontology_term_id, assay_ontology_term_id, tissue_ontology_term_id.
+            Defaults to "cell_type_ontology_term_id".
+
+    Returns:
+        dict: the mapping for the translation
+    """
+    if t == "cell_type_ontology_term_id":
+        obj = bt.CellType.public(organism="all")
+    elif t == "assay_ontology_term_id":
+        obj = bt.ExperimentalFactor.public()
+    elif t == "tissue_ontology_term_id":
+        obj = bt.Tissue.public()
+    else:
+        return None
+    if type(val) is str:
+        return {val: obj.search(val, field=obj.ontology_id).name.iloc[0]}
+    elif type(val) is list or type(val) is set:
+        return {i: obj.search(i, field=obj.ontology_id).name.iloc[0] for i in set(val)}
+    elif type(val) is dict or type(val) is Counter:
+        return {
+            obj.search(k, field=obj.ontology_id).name.iloc[0]: v for k, v in val.items()
+        }
```
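The new `translate` helper resolves ontology ids to names through the public bionty registries and accepts a string, a list/set, or a dict/Counter. A quick sketch (`CL:0000057`, fibroblast, is just an example id):

```python
from collections import Counter
from scdataloader.utils import translate

# single id -> {id: name}
translate("CL:0000057")

# Counter of cell-type ids -> {name: count}
translate(Counter({"CL:0000057": 12, "CL:0000236": 4}))
```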
{scdataloader-0.0.2.dist-info → scdataloader-0.0.4.dist-info}/METADATA
CHANGED

```diff
@@ -1,39 +1,45 @@
 Metadata-Version: 2.1
 Name: scdataloader
-Version: 0.0.2
+Version: 0.0.4
 Summary: a dataloader for single cell data in lamindb
-Home-page: https://github.com/jkobject/
+Home-page: https://github.com/jkobject/scDataLoader
 License: GPL3
 Keywords: scRNAseq,dataloader,pytorch,lamindb,scPrint
 Author: jkobject
-Requires-Python:
+Requires-Python: ==3.10.*
 Classifier: License :: Other/Proprietary License
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.10
-Classifier: Programming Language :: Python :: 3.11
-Classifier: Programming Language :: Python :: 3.12
 Requires-Dist: anndata
 Requires-Dist: biomart
+Requires-Dist: bionty
 Requires-Dist: cellxgene-census
 Requires-Dist: decoupler
 Requires-Dist: django
 Requires-Dist: ipykernel
 Requires-Dist: lamindb
 Requires-Dist: leidenalg
+Requires-Dist: lightning
+Requires-Dist: lnschema-bionty
 Requires-Dist: matplotlib
 Requires-Dist: pandas (>=2.0.0)
+Requires-Dist: scikit-misc
 Requires-Dist: seaborn
 Requires-Dist: torch
 Requires-Dist: torchdata
-Project-URL: Repository, https://github.com/jkobject/
+Project-URL: Repository, https://github.com/jkobject/scDataLoader
 Description-Content-Type: text/markdown
 
 # scdataloader
 
 [![codecov](https://codecov.io/gh/jkobject/scDataLoader/branch/main/graph/badge.svg?token=GRnGTyNMZt)](https://codecov.io/gh/jkobject/scDataLoader)
 [![CI](https://github.com/jkobject/scDataLoader/actions/workflows/main.yml/badge.svg)](https://github.com/jkobject/scDataLoader/actions/workflows/main.yml)
+[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.10573143.svg)](https://zenodo.org/doi/10.5281/zenodo.10573143)
 
-
+
+Awesome single cell dataloader created by @jkobject
+
+built on top of `lamindb` and the `.mapped()` function by Sergey: https://github.com/Koncopd
 
 This data loader is designed to be used with:
 
```
````diff
@@ -51,14 +57,78 @@ It allows you to:
 3. create a more complex single cell dataset
 4. extend it to your need
 
+## About
+
+the idea is to use it to train models like scGPT / GeneFormer (and soon, scPrint ;)). It is:
+
+1. loading from lamin
+2. doing some dataset specific preprocessing if needed
+3. creating a dataset object on top of .mapped() (that is needed for mapping genes, cell labels etc..)
+4. passing it to a dataloader object that can work with it correctly
+
+Currently one would have to use the preprocess function to make the dataset fit for different tools like scGPT / Geneformer. But I would want to enable it through different Collators. This is still missing and a WIP... (please do contribute!)
+
+![scdataloader.drawio.png](docs/scdataloader.drawio.png)
+
 ## Install it from PyPI
 
 ```bash
 pip install scdataloader
 ```
 
+### Install it locally and run the notebooks:
+
+```bash
+git clone https://github.com/jkobject/scDataLoader.git
+cd scDataLoader
+poetry install
+```
+then run the notebooks with the poetry installed environment
+
 ## Usage
 
+```python
+# initialize a local lamin database
+# !lamin init --storage ~/scdataloader --schema bionty
+
+from scdataloader import utils
+from scdataloader.preprocess import LaminPreprocessor, additional_postprocess, additional_preprocess
+
+# preprocess datasets
+DESCRIPTION='preprocessed by scDataLoader'
+
+cx_dataset = ln.Collection.using(instance="laminlabs/cellxgene").filter(name="cellxgene-census", version='2023-12-15').one()
+cx_dataset, len(cx_dataset.artifacts.all())
+
+
+do_preprocess = LaminPreprocessor(additional_postprocess=additional_postprocess, additional_preprocess=additional_preprocess, skip_validate=True, subset_hvg=0)
+
+preprocessed_dataset = do_preprocess(cx_dataset, name=DESCRIPTION, description=DESCRIPTION, start_at=6, version="2")
+
+# create dataloaders
+from scdataloader import DataModule
+import tqdm
+
+datamodule = DataModule(
+    collection_name="preprocessed dataset",
+    organisms=["NCBITaxon:9606"], #organism that we will work on
+    how="most expr", # for the collator (most expr genes only will be selected)
+    max_len=1000, # only the 1000 most expressed
+    batch_size=64,
+    num_workers=1,
+    validation_split=0.1,
+    test_split=0)
+
+for i in tqdm.tqdm(datamodule.train_dataloader()):
+    # pass #or do pass
+    print(i)
+    break
+
+# with lightning:
+# Trainer(model, datamodule)
+
+```
+
 see the notebooks in [docs](https://jkobject.github.io/scDataLoader/):
 
 1. [load a dataset](https://jkobject.github.io/scDataLoader/notebooks/01_load_dataset.html)
````
scdataloader-0.0.4.dist-info/RECORD
ADDED

```diff
@@ -0,0 +1,16 @@
+scdataloader/VERSION,sha256=ln2a-xATRmZxZvLnboGRC8GQSI19QdUMoAcunZLwDjI,6
+scdataloader/__init__.py,sha256=NIlE4oTUPRZ3uSW_maozoEHp470I7PV1vMOJ4XpSmL4,122
+scdataloader/__main__.py,sha256=UyXtFHgWxE-ecJmM_oEDLlzBDBbH-uEKAVj1A7BkwmM,6297
+scdataloader/base.py,sha256=M1gD59OffRdLOgS1vHKygOomUoAMuzjpRtAfM3SBKF8,338
+scdataloader/collator.py,sha256=Ykjdw24GUvHdbowWUDtp28YTkaF3w65SiWTU2PKBzy4,11714
+scdataloader/config.py,sha256=0_LoIblgdZZ19yM2qvPE-padMGQzdhuaxX20zYrhWq0,2780
+scdataloader/data.py,sha256=faJWN--06N7irWBKcjeU6fcX5NbzyEPXs2_EVGxfBpw,12292
+scdataloader/datamodule.py,sha256=OhHPb3jhGG5HbvahzTGxgzJ_lxbVJ4PfZspVW9h7SZk,14789
+scdataloader/mapped.py,sha256=rhE11Xl3x_wIKu3m_wu8Is6mYsXdblu3nQpT5lNqr60,13301
+scdataloader/preprocess.py,sha256=67ewe6b4HIjz_vTDjlOAJ4lMe4K2oCw2HHHUS-7S77M,38205
+scdataloader/utils.py,sha256=6eKU3_cotEaQcxONMrCWzMx7U8DybabteNhk-vNqfUQ,19365
+scdataloader-0.0.4.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+scdataloader-0.0.4.dist-info/METADATA,sha256=Bf8UjMwRcqSbWW8VbWrLhSb7qKQYdjZtJ7d6Oz4-rn8,39733
+scdataloader-0.0.4.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
+scdataloader-0.0.4.dist-info/entry_points.txt,sha256=nLqucZaa5wiF7-1FCgMXO916WDQ9Qm0TcxQp0f1DwE4,59
+scdataloader-0.0.4.dist-info/RECORD,,
```
scdataloader-0.0.2.dist-info/RECORD
DELETED

```diff
@@ -1,12 +0,0 @@
-scdataloader/VERSION,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
-scdataloader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-scdataloader/base.py,sha256=M1gD59OffRdLOgS1vHKygOomUoAMuzjpRtAfM3SBKF8,338
-scdataloader/data.py,sha256=5-w4WL0Ho5RW47J37N-zdNhV4Fjs0b7lb6c6ugeTMi4,12793
-scdataloader/mapped.py,sha256=wQN2K7GnJv-UiNIlC41HItrVMW50tECAjc8mt-QV-1I,12290
-scdataloader/preprocess.py,sha256=sm5OPREZFJaGVF9VsTKGvT1jHT7sOouX_ql0mWx3_4Q,23103
-scdataloader/utils.py,sha256=Ih1LLnmRZYOpIk1IoAJKyRAT361zrgBgUhwJM04V6Pw,16115
-scdataloader-0.0.2.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
-scdataloader-0.0.2.dist-info/METADATA,sha256=4ICXsQcdWkwrAZZVDIYG1L3d7JCpaxpr3MYlnVsD1Qw,37340
-scdataloader-0.0.2.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
-scdataloader-0.0.2.dist-info/entry_points.txt,sha256=nLqucZaa5wiF7-1FCgMXO916WDQ9Qm0TcxQp0f1DwE4,59
-scdataloader-0.0.2.dist-info/RECORD,,
```
{scdataloader-0.0.2.dist-info → scdataloader-0.0.4.dist-info}/LICENSE
File without changes

{scdataloader-0.0.2.dist-info → scdataloader-0.0.4.dist-info}/entry_points.txt
File without changes