scdataloader 0.0.2__py3-none-any.whl → 0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
scdataloader/utils.py CHANGED
@@ -10,9 +10,15 @@ from biomart import BiomartServer
 from django.db import IntegrityError
 from scipy.sparse import csr_matrix
 from scipy.stats import median_abs_deviation
+from functools import lru_cache
+from collections import Counter
 
+from typing import Union, List, Optional
 
-def createFoldersFor(filepath):
+from anndata import AnnData
+
+
+def createFoldersFor(filepath: str):
     """
     will recursively create folders if needed until having all the folders required to save the file in this filepath
     """
@@ -23,15 +29,25 @@ def createFoldersFor(filepath):
             os.mkdir(prevval)
 
 
-def _fetchFromServer(ensemble_server, attributes):
+def _fetchFromServer(
+    ensemble_server: str, attributes: list, database: str = "hsapiens_gene_ensembl"
+):
+    """
+    Fetches data from the specified ensemble server.
+
+    Args:
+        ensemble_server (str): The URL of the ensemble server to fetch data from.
+        attributes (list): The list of attributes to fetch from the server.
+
+    Returns:
+        pd.DataFrame: A pandas DataFrame containing the fetched data.
+    """
     server = BiomartServer(ensemble_server)
-    ensmbl = server.datasets["hsapiens_gene_ensembl"]
+    ensmbl = server.datasets[database]
     print(attributes)
     res = pd.read_csv(
         io.StringIO(
-            ensmbl.search(
-                {"attributes": attributes}, header=1
-            ).content.decode()
+            ensmbl.search({"attributes": attributes}, header=1).content.decode()
         ),
         sep="\t",
     )
@@ -39,11 +55,12 @@ def _fetchFromServer(ensemble_server, attributes):
 
 
 def getBiomartTable(
-    ensemble_server="http://jul2023.archive.ensembl.org/biomart",
-    useCache=False,
-    cache_folder="/tmp/biomart/",
-    attributes=[],
-    bypass_attributes=False,
+    ensemble_server: str = "http://jul2023.archive.ensembl.org/biomart",
+    useCache: bool = False,
+    cache_folder: str = "/tmp/biomart/",
+    attributes: List[str] = [],
+    bypass_attributes: bool = False,
+    database: str = "hsapiens_gene_ensembl",
 ):
     """generate a genelist dataframe from ensembl's biomart
 
@@ -79,7 +96,7 @@ def getBiomartTable(
     else:
         print("downloading gene names from biomart")
 
-        res = _fetchFromServer(ensemble_server, attr + attributes)
+        res = _fetchFromServer(ensemble_server, attr + attributes, database=database)
         res.to_csv(cachefile, index=False)
 
     res.columns = attr + attributes
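The new `database` argument is threaded from `getBiomartTable` into `_fetchFromServer`, so the same archived Ensembl mart can serve non-human tables. A minimal sketch of a call (assuming `mmusculus_gene_ensembl`, the standard Biomart dataset id for mouse, is the table you want):

```python
from scdataloader.utils import getBiomartTable

# pull the mouse gene table instead of the default human one;
# server, cache folder and default attributes stay as-is
mouse_genes = getBiomartTable(database="mmusculus_gene_ensembl")
print(mouse_genes.head())
```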
@@ -93,7 +110,7 @@ def getBiomartTable(
     return res
 
 
-def validate(adata, lb, organism):
+def validate(adata: AnnData, organism: str):
     """
     validate checks if the adata object is valid for lamindb
 
@@ -116,8 +133,7 @@ def validate(adata, lb, organism):
     Returns:
         bool: True if the adata object is valid
     """
-    organism = lb.Organism.filter(ontology_id=organism).one().name
-    lb.settings.organism = organism
+    organism = bt.Organism.filter(ontology_id=organism).one().name
 
     if adata.var.index.duplicated().any():
         raise ValueError("Duplicate gene names found in adata.var.index")
@@ -136,70 +152,61 @@ def validate(adata, lb, organism):
             raise ValueError(
                 f"Column '{val}' is missing in the provided anndata object."
             )
-    bionty_source = lb.BiontySource.filter(
-        entity="DevelopmentalStage", organism=organism
-    ).one()
 
-    if not lb.Ethnicity.validate(
+    if not bt.Ethnicity.validate(
         adata.obs["self_reported_ethnicity_ontology_term_id"],
         field="ontology_id",
     ).all():
         raise ValueError("Invalid ethnicity ontology term id found")
-    if not lb.Organism.validate(
+    if not bt.Organism.validate(
         adata.obs["organism_ontology_term_id"], field="ontology_id"
     ).all():
         raise ValueError("Invalid organism ontology term id found")
-    if not lb.Phenotype.validate(
+    if not bt.Phenotype.validate(
         adata.obs["sex_ontology_term_id"], field="ontology_id"
     ).all():
         raise ValueError("Invalid sex ontology term id found")
-    if not lb.Disease.validate(
+    if not bt.Disease.validate(
         adata.obs["disease_ontology_term_id"], field="ontology_id"
     ).all():
         raise ValueError("Invalid disease ontology term id found")
-    if not lb.CellType.validate(
+    if not bt.CellType.validate(
         adata.obs["cell_type_ontology_term_id"], field="ontology_id"
     ).all():
         raise ValueError("Invalid cell type ontology term id found")
-    if (
-        not lb.DevelopmentalStage.filter(bionty_source=bionty_source)
-        .validate(
-            adata.obs["development_stage_ontology_term_id"],
-            field="ontology_id",
-        )
-        .all()
-    ):
+    if not bt.DevelopmentalStage.validate(
+        adata.obs["development_stage_ontology_term_id"],
+        field="ontology_id",
+    ).all():
         raise ValueError("Invalid dev stage ontology term id found")
-    if not lb.Tissue.validate(
+    if not bt.Tissue.validate(
         adata.obs["tissue_ontology_term_id"], field="ontology_id"
     ).all():
         raise ValueError("Invalid tissue ontology term id found")
-    if not lb.ExperimentalFactor.validate(
+    if not bt.ExperimentalFactor.validate(
         adata.obs["assay_ontology_term_id"], field="ontology_id"
     ).all():
         raise ValueError("Invalid assay ontology term id found")
-    if (
-        not lb.Gene.filter(organism=lb.settings.organism)
-        .validate(adata.var.index, field="ensembl_gene_id")
-        .all()
-    ):
+    if not bt.Gene.validate(
+        adata.var.index, field="ensembl_gene_id", organism=organism
+    ).all():
         raise ValueError("Invalid gene ensembl id found")
     return True
 
 
-def get_all_ancestors(val, df):
+# setting a cache of 200 elements
+# @lru_cache(maxsize=200)
+def get_all_ancestors(val: str, df: pd.DataFrame):
     if val not in df.index:
         return set()
     parents = df.loc[val].parents__ontology_id
     if parents is None or len(parents) == 0:
         return set()
     else:
-        return set.union(
-            set(parents), *[get_all_ancestors(val, df) for val in parents]
-        )
+        return set.union(set(parents), *[get_all_ancestors(val, df) for val in parents])
 
 
-def get_ancestry_mapping(all_elem, onto_df):
+def get_ancestry_mapping(all_elem: list, onto_df: pd.DataFrame):
     """
     This function generates a mapping of all elements to their ancestors in the ontology dataframe.
 
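`get_all_ancestors` only needs a dataframe indexed by ontology id with a `parents__ontology_id` column, so the recursion can be checked without a lamin instance; a self-contained sketch that restates the function body on a toy frame:

```python
import pandas as pd

def get_all_ancestors(val, df):
    # same recursion as above: union the direct parents with their own ancestors
    if val not in df.index:
        return set()
    parents = df.loc[val].parents__ontology_id
    if parents is None or len(parents) == 0:
        return set()
    return set.union(set(parents), *[get_all_ancestors(p, df) for p in parents])

onto = pd.DataFrame(
    {"parents__ontology_id": [[], ["A"], ["B"], ["B", "A"]]},
    index=["A", "B", "C", "D"],
)
print(get_all_ancestors("D", onto))  # {'A', 'B'}
```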
@@ -234,13 +241,12 @@ def get_ancestry_mapping(all_elem, onto_df):
 
 
 def load_dataset_local(
-    lb,
-    remote_dataset,
-    download_folder,
-    name,
-    description,
-    use_cache=True,
-    only=None,
+    remote_dataset: ln.Collection,
+    download_folder: str,
+    name: str,
+    description: str,
+    use_cache: bool = True,
+    only: Optional[List[int]] = None,
 ):
     """
     This function loads a remote lamindb dataset to local.
@@ -258,9 +264,7 @@ def load_dataset_local(
         lamindb.Dataset: The local dataset.
     """
     saved_files = []
-    default_storage = ln.Storage.filter(
-        root=ln.settings.storage.as_posix()
-    ).one()
+    default_storage = ln.Storage.filter(root=ln.settings.storage.as_posix()).one()
     files = (
         remote_dataset.artifacts.all()
         if not only
@@ -275,17 +279,15 @@ def load_dataset_local(
         if len(organism) == 0:
             print("No organism detected")
             continue
-        organism = lb.Organism.filter(ontology_id=organism[0]).one().name
-        # lb.settings.organism = organism
+        organism = bt.Organism.filter(ontology_id=organism[0]).one().name
+        # bt.settings.organism = organism
         path = file.path
         try:
             file.save()
         except IntegrityError:
             print(f"File {file.key} already exists in storage")
         # if location already has a file, don't save again
-        if use_cache and os.path.exists(
-            os.path.expanduser(download_folder + file.key)
-        ):
+        if use_cache and os.path.exists(os.path.expanduser(download_folder + file.key)):
             print(f"File {file.key} already exists in storage")
         else:
             path.download_to(download_folder + file.key)
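Since `load_dataset_local` now takes the remote `ln.Collection` directly instead of an `lb` handle, a call looks roughly like the sketch below (folder, names and the `only` subset are placeholders):

```python
import lamindb as ln
from scdataloader.utils import load_dataset_local

remote = (
    ln.Collection.using(instance="laminlabs/cellxgene")
    .filter(name="cellxgene-census")
    .first()
)
local = load_dataset_local(
    remote,
    download_folder="/tmp/scdataloader/",  # placeholder location
    name="cellxgene-census (local copy)",
    description="local copy for training",
    only=[0, 1],  # only the first two artifacts
)
```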
@@ -295,32 +297,53 @@ def load_dataset_local(
         except IntegrityError:
             print(f"File {file.key} already exists in storage")
         saved_files.append(file)
-    dataset = ln.Dataset(saved_files, name=name, description=description)
+    dataset = ln.Collection(saved_files, name=name, description=description)
     dataset.save()
     return dataset
 
 
+def load_genes(organisms: Union[str, list] = "NCBITaxon:9606"):  # "NCBITaxon:10090",
+    organismdf = []
+    if type(organisms) == str:
+        organisms = [organisms]
+    for organism in organisms:
+        genesdf = bt.Gene.filter(
+            organism_id=bt.Organism.filter(ontology_id=organism).first().id
+        ).df()
+        genesdf = genesdf[~genesdf["public_source_id"].isna()]
+        genesdf = genesdf.drop_duplicates(subset="ensembl_gene_id")
+        genesdf = genesdf.set_index("ensembl_gene_id").sort_index()
+        # mitochondrial genes
+        genesdf["mt"] = genesdf.symbol.astype(str).str.startswith("MT-")
+        # ribosomal genes
+        genesdf["ribo"] = genesdf.symbol.astype(str).str.startswith(("RPS", "RPL"))
+        # hemoglobin genes.
+        genesdf["hb"] = genesdf.symbol.astype(str).str.contains(("^HB[^(P)]"))
+        genesdf["organism"] = organism
+        organismdf.append(genesdf)
+    return pd.concat(organismdf)
+
+
 def populate_my_ontology(
-    lb,
-    organisms=["NCBITaxon:10090", "NCBITaxon:9606"],
-    sex=["PATO:0000384", "PATO:0000383"],
-    celltypes=[],
-    ethnicities=[],
-    assays=[],
-    tissues=[],
-    diseases=[],
-    dev_stages=[],
+    organisms: List[str] = ["NCBITaxon:10090", "NCBITaxon:9606"],
+    sex: List[str] = ["PATO:0000384", "PATO:0000383"],
+    celltypes: List[str] = [],
+    ethnicities: List[str] = [],
+    assays: List[str] = [],
+    tissues: List[str] = [],
+    diseases: List[str] = [],
+    dev_stages: List[str] = [],
 ):
     """
     creates a local version of the lamin ontologies and add the required missing values in base ontologies
 
     run this function just one for each new lamin storage
 
-    erase everything with lb.$ontology.filter().delete()
+    erase everything with bt.$ontology.filter().delete()
 
     add whatever value you need afterward like it is done here with:
 
-    `lb.$ontology(name="ddd", ontology_id="ddddd").save()`
+    `bt.$ontology(name="ddd", ontology_id="ddddd").save()`
 
     `df["assay_ontology_term_id"].unique()`
 
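`load_genes` is new in 0.0.4: it returns one concatenated dataframe indexed by `ensembl_gene_id`, with boolean `mt`/`ribo`/`hb` flags and an `organism` column. A hedged sketch, assuming the gene registry was already populated (e.g. via `populate_my_ontology` below):

```python
from scdataloader.utils import load_genes

# human and mouse gene tables concatenated into one dataframe
genedf = load_genes(["NCBITaxon:9606", "NCBITaxon:10090"])

# e.g. mitochondrial genes of the human subset
human_mt = genedf[(genedf.organism == "NCBITaxon:9606") & genedf.mt]
print(human_mt.symbol.head())
```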
@@ -336,78 +359,88 @@ def populate_my_ontology(
         dev_stages (list, optional): List of developmental stages. Defaults to [].
     """
 
-    names = bt.CellType().df().index if not celltypes else celltypes
-    records = lb.CellType.from_values(names, field=lb.CellType.ontology_id)
-    ln.save(records)
-    lb.CellType(name="unknown", ontology_id="unknown").save()
+    names = bt.CellType.public().df().index if not celltypes else celltypes
+    records = bt.CellType.from_values(names, field="ontology_id")
+    ln.save(records, parents=bool(celltypes))
+    bt.CellType(name="unknown", ontology_id="unknown").save()
     # Organism
-    # names = bt.Organism().df().index if not organisms else organisms
-    # records = lb.Organism.from_values(names, field=lb.Organism.ontology_id)
-    # ln.save(records)
-    # lb.Organism(name="unknown", ontology_id="unknown").save()
+    names = bt.Organism.public().df().index if not organisms else organisms
+    records = [
+        i[0] if type(i) is list else i
+        for i in [bt.Organism.from_public(ontology_id=i) for i in names]
+    ]
+    ln.save(records, parents=bool(organisms))
+    bt.Organism(name="unknown", ontology_id="unknown").save()
     # Phenotype
-    name = bt.Phenotype().df().index if not sex else sex
-    records = lb.Phenotype.from_values(
-        name,
-        field=lb.Phenotype.ontology_id,
-        bionty_source=lb.BiontySource.filter(
-            entity="Phenotype", source="pato"
-        ).one(),
-    )
-    ln.save(records)
-    lb.Phenotype(name="unknown", ontology_id="unknown").save()
+    names = bt.Phenotype.public().df().index if not sex else sex
+    records = [
+        bt.Phenotype.from_public(
+            ontology_id=i,
+            public_source=bt.PublicSource.filter(
+                entity="Phenotype", source="pato"
+            ).one(),
+        )
+        for i in names
+    ]
+    ln.save(records, parents=bool(sex))
+    bt.Phenotype(name="unknown", ontology_id="unknown").save()
     # ethnicity
-    names = bt.Ethnicity().df().index if not ethnicities else ethnicities
-    records = lb.Ethnicity.from_values(names, field=lb.Ethnicity.ontology_id)
-    ln.save(records)
-    lb.Ethnicity(
+    names = bt.Ethnicity.public().df().index if not ethnicities else ethnicities
+    records = bt.Ethnicity.from_values(names, field="ontology_id")
+    ln.save(records, parents=bool(ethnicities))
+    bt.Ethnicity(
         name="unknown", ontology_id="unknown"
     ).save()  # multi ethnic will have to get renamed
     # ExperimentalFactor
-    names = bt.ExperimentalFactor().df().index if not assays else assays
-    records = lb.ExperimentalFactor.from_values(
-        names, field=lb.ExperimentalFactor.ontology_id
-    )
-    ln.save(records)
-    lb.ExperimentalFactor(name="unknown", ontology_id="unknown").save()
-    # lookup = lb.ExperimentalFactor.lookup()
+    names = bt.ExperimentalFactor.public().df().index if not assays else assays
+    records = bt.ExperimentalFactor.from_values(names, field="ontology_id")
+    ln.save(records, parents=bool(assays))
+    bt.ExperimentalFactor(name="unknown", ontology_id="unknown").save()
+    # lookup = bt.ExperimentalFactor.lookup()
     # lookup.smart_seq_v4.parents.add(lookup.smart_like)
     # Tissue
-    names = bt.Tissue().df().index if not tissues else tissues
-    records = lb.Tissue.from_values(names, field=lb.Tissue.ontology_id)
-    ln.save(records)
-    lb.Tissue(name="unknown", ontology_id="unknown").save()
+    names = bt.Tissue.public().df().index if not tissues else tissues
+    records = bt.Tissue.from_values(names, field="ontology_id")
+    ln.save(records, parents=bool(tissues))
+    bt.Tissue(name="unknown", ontology_id="unknown").save()
     # DevelopmentalStage
     names = (
-        bt.DevelopmentalStage().df().index if not dev_stages else dev_stages
-    )
-    records = lb.DevelopmentalStage.from_values(
-        names, field=lb.DevelopmentalStage.ontology_id
+        bt.DevelopmentalStage.public().df().index if not dev_stages else dev_stages
     )
+    records = bt.DevelopmentalStage.from_values(names, field="ontology_id")
+    ln.save(records, parents=bool(dev_stages))
+    bt.DevelopmentalStage(name="unknown", ontology_id="unknown").save()
+
+    names = bt.DevelopmentalStage.public(organism="mouse").df().name
+    bionty_source = bt.PublicSource.filter(
+        entity="DevelopmentalStage", organism="mouse"
+    ).one()
+    records = [
+        bt.DevelopmentalStage.from_public(name=i, public_source=bionty_source)
+        for i in names.tolist()
+    ]
+    records[-4] = records[-4][0]
     ln.save(records)
-    lb.DevelopmentalStage(name="unknown", ontology_id="unknown").save()
     # Disease
-    names = bt.Disease().df().index if not diseases else diseases
-    records = lb.Disease.from_values(names, field=lb.Disease.ontology_id)
-    ln.save(records)
-    lb.Disease(name="normal", ontology_id="PATO:0000461").save()
-    lb.Disease(name="unknown", ontology_id="unknown").save()
+    names = bt.Disease.public().df().index if not diseases else diseases
+    records = bt.Disease.from_values(names, field="ontology_id")
+    ln.save(records, parents=bool(diseases))
+    bt.Disease(name="normal", ontology_id="PATO:0000461").save()
+    bt.Disease(name="unknown", ontology_id="unknown").save()
     # genes
-    for organism in organisms:
+    for organism in ["NCBITaxon:10090", "NCBITaxon:9606"]:
         # convert onto to name
-        organism = lb.Organism.filter(ontology_id=organism).one().name
-        names = bt.Gene(organism=organism).df()["ensembl_gene_id"]
-        records = lb.Gene.from_values(
+        organism = bt.Organism.filter(ontology_id=organism).one().name
+        names = bt.Gene.public(organism=organism).df()["ensembl_gene_id"]
+        records = bt.Gene.from_values(
             names,
             field="ensembl_gene_id",
-            bionty_source=lb.BiontySource.filter(
-                entity="Gene", organism=organism
-            ).first(),
+            organism=organism,
         )
         ln.save(records)
 
 
-def is_outlier(adata, metric: str, nmads: int):
+def is_outlier(adata: AnnData, metric: str, nmads: int):
     """
     is_outlier detects outliers in adata.obs[metric]
 
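Per its docstring this is a one-time setup per lamin instance; empty lists load the full public ontologies, while explicit id lists restrict what gets registered (and flip the new `parents=` flag passed to `ln.save`). A minimal sketch:

```python
from scdataloader import utils

# run once after `lamin init --storage ... --schema bionty`
utils.populate_my_ontology(
    organisms=["NCBITaxon:9606"],
    sex=["PATO:0000384", "PATO:0000383"],
    celltypes=[],  # empty -> register the full public CellType ontology
    tissues=[],
    diseases=[],
)
```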
@@ -426,7 +459,7 @@ def is_outlier(adata, metric: str, nmads: int):
     return outlier
 
 
-def length_normalize(adata, gene_lengths):
+def length_normalize(adata: AnnData, gene_lengths: list):
     """
     length_normalize normalizes the counts by the gene length
 
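`is_outlier` only gains a type hint here, but it is the helper most likely to be called directly during QC; a sketch, assuming scanpy is available to compute the usual QC columns first:

```python
import scanpy as sc
from scdataloader.utils import is_outlier

adata = sc.datasets.pbmc3k()  # small public example dataset
sc.pp.calculate_qc_metrics(adata, inplace=True)  # adds total_counts to .obs

# flag cells whose total counts deviate by more than 5 MADs from the median
adata.obs["outlier_counts"] = is_outlier(adata, "total_counts", 5)
print(int(adata.obs["outlier_counts"].sum()), "cells flagged")
```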
@@ -441,7 +474,7 @@ def length_normalize(adata, gene_lengths):
     return adata
 
 
-def pd_load_cached(url, loc="/tmp/", cache=True, **kwargs):
+def pd_load_cached(url: str, loc: str = "/tmp/", cache: bool = True, **kwargs):
     """
     pd_load_cached downloads a file from a url and loads it as a pandas dataframe
 
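`pd_load_cached` downloads the file once and forwards any extra keyword arguments to `pd.read_csv`; a sketch with a placeholder URL and an explicit cache location:

```python
from scdataloader.utils import pd_load_cached

# placeholder URL; fetched to `loc`, then loaded with pd.read_csv(loc, sep="\t")
df = pd_load_cached(
    "https://example.org/gene_annotations.tsv",
    loc="/tmp/gene_annotations.tsv",
    sep="\t",
)
```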
@@ -459,3 +492,36 @@ def pd_load_cached(url, loc="/tmp/", cache=True, **kwargs):
     urllib.request.urlretrieve(url, loc)
     # Load the data from the file
     return pd.read_csv(loc, **kwargs)
+
+
+def translate(
+    val: Union[str, list, set, Counter, dict], t: str = "cell_type_ontology_term_id"
+):
+    """
+    translate translates the ontology term id to the name
+
+    Args:
+        val (str, dict, set, list, dict): the object to translate
+        t (flat, optional): the type of ontology terms.
+            one of cell_type_ontology_term_id, assay_ontology_term_id, tissue_ontology_term_id.
+            Defaults to "cell_type_ontology_term_id".
+
+    Returns:
+        dict: the mapping for the translation
+    """
+    if t == "cell_type_ontology_term_id":
+        obj = bt.CellType.public(organism="all")
+    elif t == "assay_ontology_term_id":
+        obj = bt.ExperimentalFactor.public()
+    elif t == "tissue_ontology_term_id":
+        obj = bt.Tissue.public()
+    else:
+        return None
+    if type(val) is str:
+        return {val: obj.search(val, field=obj.ontology_id).name.iloc[0]}
+    elif type(val) is list or type(val) is set:
+        return {i: obj.search(i, field=obj.ontology_id).name.iloc[0] for i in set(val)}
+    elif type(val) is dict or type(val) is Counter:
+        return {
+            obj.search(k, field=obj.ontology_id).name.iloc[0]: v for k, v in val.items()
+        }
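`translate`, also new in this release, maps ontology term ids to readable names through the public bionty ontologies and accepts a single id, a list/set, or a dict/Counter of counts. A hedged sketch (the exact names returned depend on the ontology release):

```python
from collections import Counter
from scdataloader.utils import translate

# single id -> {"CL:0000057": "fibroblast"} (name depends on the CL version)
print(translate("CL:0000057"))

# counts keyed by ontology id -> the same counts keyed by name
counts = Counter({"CL:0000057": 120, "CL:0000115": 42})
print(translate(counts, t="cell_type_ontology_term_id"))
```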
scdataloader-0.0.2.dist-info/METADATA → scdataloader-0.0.4.dist-info/METADATA CHANGED
@@ -1,39 +1,45 @@
 Metadata-Version: 2.1
 Name: scdataloader
-Version: 0.0.2
+Version: 0.0.4
 Summary: a dataloader for single cell data in lamindb
-Home-page: https://github.com/jkobject/scPrint
+Home-page: https://github.com/jkobject/scDataLoader
 License: GPL3
 Keywords: scRNAseq,dataloader,pytorch,lamindb,scPrint
 Author: jkobject
-Requires-Python: >=3.10,<4.0
+Requires-Python: ==3.10.*
 Classifier: License :: Other/Proprietary License
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.10
-Classifier: Programming Language :: Python :: 3.11
-Classifier: Programming Language :: Python :: 3.12
 Requires-Dist: anndata
 Requires-Dist: biomart
+Requires-Dist: bionty
 Requires-Dist: cellxgene-census
 Requires-Dist: decoupler
 Requires-Dist: django
 Requires-Dist: ipykernel
 Requires-Dist: lamindb
 Requires-Dist: leidenalg
+Requires-Dist: lightning
+Requires-Dist: lnschema-bionty
 Requires-Dist: matplotlib
 Requires-Dist: pandas (>=2.0.0)
+Requires-Dist: scikit-misc
 Requires-Dist: seaborn
 Requires-Dist: torch
 Requires-Dist: torchdata
-Project-URL: Repository, https://github.com/jkobject/scPrint
+Project-URL: Repository, https://github.com/jkobject/scDataLoader
 Description-Content-Type: text/markdown
 
 # scdataloader
 
 [![codecov](https://codecov.io/gh/jkobject/scDataLoader/branch/main/graph/badge.svg?token=scDataLoader_token_here)](https://codecov.io/gh/jkobject/scDataLoader)
 [![CI](https://github.com/jkobject/scDataLoader/actions/workflows/main.yml/badge.svg)](https://github.com/jkobject/scDataLoader/actions/workflows/main.yml)
+[![DOI](https://zenodo.org/badge/731248665.svg)](https://zenodo.org/doi/10.5281/zenodo.10573143)
 
-Awesome single cell dataloader created by @jkobject
+
+Awesome single cell dataloader created by @jkobject
+
+built on top of `lamindb` and the `.mapped()` function by Sergey: https://github.com/Koncopd
 
 This data loader is designed to be used with:
 
@@ -51,14 +57,78 @@ It allows you to:
 3. create a more complex single cell dataset
 4. extend it to your need
 
+## About
+
+the idea is to use it to train models like scGPT / GeneFormer (and soon, scPrint ;)). It is:
+
+1. loading from lamin
+2. doing some dataset specific preprocessing if needed
+3. creating a dataset object on top of .mapped() (that is needed for mapping genes, cell labels etc..)
+4. passing it to a dataloader object that can work with it correctly
+
+Currently one would have to use the preprocess function to make the dataset fit for different tools like scGPT / Geneformer. But I would want to enable it through different Collators. This is still missing and a WIP... (please do contribute!)
+
+![docs/scdataloader.drawio.png](docs/scdataloader.drawio.png)
+
 ## Install it from PyPI
 
 ```bash
 pip install scdataloader
 ```
 
+### Install it locally and run the notebooks:
+
+```bash
+git clone https://github.com/jkobject/scDataLoader.git
+cd scDataLoader
+poetry install
+```
+then run the notebooks with the poetry installed environment
+
 ## Usage
 
+```python
+# initialize a local lamin database
+# !lamin init --storage ~/scdataloader --schema bionty
+
+from scdataloader import utils
+from scdataloader.preprocess import LaminPreprocessor, additional_postprocess, additional_preprocess
+
+# preprocess datasets
+DESCRIPTION='preprocessed by scDataLoader'
+
+cx_dataset = ln.Collection.using(instance="laminlabs/cellxgene").filter(name="cellxgene-census", version='2023-12-15').one()
+cx_dataset, len(cx_dataset.artifacts.all())
+
+
+do_preprocess = LaminPreprocessor(additional_postprocess=additional_postprocess, additional_preprocess=additional_preprocess, skip_validate=True, subset_hvg=0)
+
+preprocessed_dataset = do_preprocess(cx_dataset, name=DESCRIPTION, description=DESCRIPTION, start_at=6, version="2")
+
+# create dataloaders
+from scdataloader import DataModule
+import tqdm
+
+datamodule = DataModule(
+    collection_name="preprocessed dataset",
+    organisms=["NCBITaxon:9606"], #organism that we will work on
+    how="most expr", # for the collator (most expr genes only will be selected)
+    max_len=1000, # only the 1000 most expressed
+    batch_size=64,
+    num_workers=1,
+    validation_split=0.1,
+    test_split=0)
+
+for i in tqdm.tqdm(datamodule.train_dataloader()):
+    # pass #or do pass
+    print(i)
+    break
+
+# with lightning:
+# Trainer(model, datamodule)
+
+```
+
 see the notebooks in [docs](https://jkobject.github.io/scDataLoader/):
 
 1. [load a dataset](https://jkobject.github.io/scDataLoader/notebooks/01_load_dataset.html)
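The new usage snippet imports `scdataloader.utils` but never shows the ontology setup it relies on; a companion sketch based on `populate_my_ontology` from utils.py in this same release (an assumption about typical first-run order, not part of the README):

```python
# after: lamin init --storage ~/scdataloader --schema bionty
from scdataloader import utils

# one-time population of the local ontology registries (see utils.py above)
utils.populate_my_ontology(organisms=["NCBITaxon:9606", "NCBITaxon:10090"])
```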
scdataloader-0.0.4.dist-info/RECORD ADDED
@@ -0,0 +1,16 @@
+scdataloader/VERSION,sha256=ln2a-xATRmZxZvLnboGRC8GQSI19QdUMoAcunZLwDjI,6
+scdataloader/__init__.py,sha256=NIlE4oTUPRZ3uSW_maozoEHp470I7PV1vMOJ4XpSmL4,122
+scdataloader/__main__.py,sha256=UyXtFHgWxE-ecJmM_oEDLlzBDBbH-uEKAVj1A7BkwmM,6297
+scdataloader/base.py,sha256=M1gD59OffRdLOgS1vHKygOomUoAMuzjpRtAfM3SBKF8,338
+scdataloader/collator.py,sha256=Ykjdw24GUvHdbowWUDtp28YTkaF3w65SiWTU2PKBzy4,11714
+scdataloader/config.py,sha256=0_LoIblgdZZ19yM2qvPE-padMGQzdhuaxX20zYrhWq0,2780
+scdataloader/data.py,sha256=faJWN--06N7irWBKcjeU6fcX5NbzyEPXs2_EVGxfBpw,12292
+scdataloader/datamodule.py,sha256=OhHPb3jhGG5HbvahzTGxgzJ_lxbVJ4PfZspVW9h7SZk,14789
+scdataloader/mapped.py,sha256=rhE11Xl3x_wIKu3m_wu8Is6mYsXdblu3nQpT5lNqr60,13301
+scdataloader/preprocess.py,sha256=67ewe6b4HIjz_vTDjlOAJ4lMe4K2oCw2HHHUS-7S77M,38205
+scdataloader/utils.py,sha256=6eKU3_cotEaQcxONMrCWzMx7U8DybabteNhk-vNqfUQ,19365
+scdataloader-0.0.4.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+scdataloader-0.0.4.dist-info/METADATA,sha256=Bf8UjMwRcqSbWW8VbWrLhSb7qKQYdjZtJ7d6Oz4-rn8,39733
+scdataloader-0.0.4.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
+scdataloader-0.0.4.dist-info/entry_points.txt,sha256=nLqucZaa5wiF7-1FCgMXO916WDQ9Qm0TcxQp0f1DwE4,59
+scdataloader-0.0.4.dist-info/RECORD,,
scdataloader-0.0.2.dist-info/WHEEL → scdataloader-0.0.4.dist-info/WHEEL CHANGED
@@ -1,4 +1,4 @@
 Wheel-Version: 1.0
-Generator: poetry-core 1.8.1
+Generator: poetry-core 1.7.0
 Root-Is-Purelib: true
 Tag: py3-none-any
scdataloader-0.0.2.dist-info/RECORD REMOVED
@@ -1,12 +0,0 @@
-scdataloader/VERSION,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
-scdataloader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-scdataloader/base.py,sha256=M1gD59OffRdLOgS1vHKygOomUoAMuzjpRtAfM3SBKF8,338
-scdataloader/data.py,sha256=5-w4WL0Ho5RW47J37N-zdNhV4Fjs0b7lb6c6ugeTMi4,12793
-scdataloader/mapped.py,sha256=wQN2K7GnJv-UiNIlC41HItrVMW50tECAjc8mt-QV-1I,12290
-scdataloader/preprocess.py,sha256=sm5OPREZFJaGVF9VsTKGvT1jHT7sOouX_ql0mWx3_4Q,23103
-scdataloader/utils.py,sha256=Ih1LLnmRZYOpIk1IoAJKyRAT361zrgBgUhwJM04V6Pw,16115
-scdataloader-0.0.2.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
-scdataloader-0.0.2.dist-info/METADATA,sha256=4ICXsQcdWkwrAZZVDIYG1L3d7JCpaxpr3MYlnVsD1Qw,37340
-scdataloader-0.0.2.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
-scdataloader-0.0.2.dist-info/entry_points.txt,sha256=nLqucZaa5wiF7-1FCgMXO916WDQ9Qm0TcxQp0f1DwE4,59
-scdataloader-0.0.2.dist-info/RECORD,,