scdataloader 2.0.11__py3-none-any.whl → 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
scdataloader/data.py CHANGED
@@ -282,19 +282,25 @@ class Dataset(torchDataset):
282
282
  elif clss == "cell_type_ontology_term_id":
283
283
  parentdf = (
284
284
  bt.CellType.filter()
285
- .df(include=["parents__ontology_id", "ontology_id"])
285
+ .to_dataframe(
286
+ limit=None, include=["parents__ontology_id", "ontology_id"]
287
+ )
286
288
  .set_index("ontology_id")
287
289
  )
288
290
  elif clss == "tissue_ontology_term_id":
289
291
  parentdf = (
290
292
  bt.Tissue.filter()
291
- .df(include=["parents__ontology_id", "ontology_id"])
293
+ .to_dataframe(
294
+ limit=None, include=["parents__ontology_id", "ontology_id"]
295
+ )
292
296
  .set_index("ontology_id")
293
297
  )
294
298
  elif clss == "disease_ontology_term_id":
295
299
  parentdf = (
296
300
  bt.Disease.filter()
297
- .df(include=["parents__ontology_id", "ontology_id"])
301
+ .to_dataframe(
302
+ limit=None, include=["parents__ontology_id", "ontology_id"]
303
+ )
298
304
  .set_index("ontology_id")
299
305
  )
300
306
  elif clss in [
@@ -304,19 +310,25 @@ class Dataset(torchDataset):
304
310
  ]:
305
311
  parentdf = (
306
312
  bt.DevelopmentalStage.filter()
307
- .df(include=["parents__ontology_id", "ontology_id"])
313
+ .to_dataframe(
314
+ limit=None, include=["parents__ontology_id", "ontology_id"]
315
+ )
308
316
  .set_index("ontology_id")
309
317
  )
310
318
  elif clss == "assay_ontology_term_id":
311
319
  parentdf = (
312
320
  bt.ExperimentalFactor.filter()
313
- .df(include=["parents__ontology_id", "ontology_id"])
321
+ .to_dataframe(
322
+ limit=None, include=["parents__ontology_id", "ontology_id"]
323
+ )
314
324
  .set_index("ontology_id")
315
325
  )
316
326
  elif clss == "self_reported_ethnicity_ontology_term_id":
317
327
  parentdf = (
318
328
  bt.Ethnicity.filter()
319
- .df(include=["parents__ontology_id", "ontology_id"])
329
+ .to_dataframe(
330
+ limit=None, include=["parents__ontology_id", "ontology_id"]
331
+ )
320
332
  .set_index("ontology_id")
321
333
  )
322
334
 
scdataloader/preprocess.py CHANGED
@@ -264,8 +264,9 @@ class Preprocessor:
264
264
  # For genes that are already ENS IDs, use them directly
265
265
  prev_size = adata.shape[1]
266
266
  # Handle symbol genes
267
+ cols_to_use = adata.var.columns.difference(genesdf.columns)
267
268
  if self.is_symbol:
268
- new_var = adata.var.merge(
269
+ new_var = adata.var[cols_to_use].merge(
269
270
  genesdf.drop_duplicates("symbol").set_index("symbol", drop=False),
270
271
  left_index=True,
271
272
  right_index=True,
@@ -275,7 +276,7 @@ class Preprocessor:
275
276
  adata = adata[:, new_var.index]
276
277
  new_var.index = new_var["ensembl_gene_id"]
277
278
  else:
278
- new_var = adata.var.merge(
279
+ new_var = adata.var[cols_to_use].merge(
279
280
  genesdf, left_index=True, right_index=True, how="inner"
280
281
  )
281
282
  adata = adata[:, new_var.index]
@@ -654,7 +655,7 @@ class LaminPreprocessor(Preprocessor):
654
655
  # Reconstruct collection using keys
655
656
  dataset = ln.Collection(
656
657
  [ln.Artifact.filter(key=k).one() for k in files],
657
- name=name,
658
+ key=name,
658
659
  description=description,
659
660
  )
660
661
  dataset.save()
@@ -819,9 +820,15 @@ def additional_postprocess(adata):
819
820
  # else:
820
821
  print("starting post processing")
821
822
  sc.pp.neighbors(adata, use_rep="X_pca")
822
- sc.tl.leiden(adata, key_added="leiden_2", resolution=2.0)
823
- sc.tl.leiden(adata, key_added="leiden_1", resolution=1.0)
824
- sc.tl.leiden(adata, key_added="leiden_0.5", resolution=0.5)
823
+ sc.tl.leiden(
824
+ adata, key_added="leiden_2", resolution=2.0, flavor="igraph", n_iterations=2
825
+ )
826
+ sc.tl.leiden(
827
+ adata, key_added="leiden_1", resolution=1.0, flavor="igraph", n_iterations=2
828
+ )
829
+ sc.tl.leiden(
830
+ adata, key_added="leiden_0.5", resolution=0.5, flavor="igraph", n_iterations=2
831
+ )
825
832
  sc.tl.umap(adata)
826
833
  mid = adata.uns["dataset_id"] if "dataset_id" in adata.uns else "unknown_id"
827
834
  sc.pl.umap(
scdataloader/utils.py CHANGED
@@ -389,7 +389,7 @@ def load_dataset_local(
389
389
  except IntegrityError:
390
390
  print(f"File {file.key} already exists in storage")
391
391
  saved_files.append(file)
392
- dataset = ln.Collection(saved_files, name=name, description=description)
392
+ dataset = ln.Collection(saved_files, key=name, description=description)
393
393
  dataset.save()
394
394
  return dataset
395
395
 
@@ -412,7 +412,7 @@ def load_genes(
412
412
  for organism in organisms:
413
413
  genesdf = bt.Gene.filter(
414
414
  organism_id=bt.Organism.filter(ontology_id=organism).first().id
415
- ).df()
415
+ ).to_dataframe(limit=None)
416
416
  genesdf.loc[genesdf.ensembl_gene_id.isna(), "ensembl_gene_id"] = genesdf.loc[
417
417
  genesdf.ensembl_gene_id.isna(), "stable_id"
418
418
  ]
@@ -440,6 +440,7 @@ def load_genes(
440
440
  "ncbi_gene_ids",
441
441
  "synonyms",
442
442
  "description",
443
+ "abbr",
443
444
  ]:
444
445
  if col in organismdf.columns:
445
446
  organismdf.drop(columns=[col], inplace=True)
@@ -453,12 +454,16 @@ def _adding_scbasecamp_genes(
453
454
  ):
454
455
  if len(species) == 0:
455
456
  species = set(
456
- bt.Organism.using("laminlabs/arc-virtual-cell-atlas").df().ontology_id
457
+ bt.Organism.using("laminlabs/arc-virtual-cell-atlas")
458
+ .to_dataframe(limit=None)
459
+ .ontology_id
457
460
  ) - set(["NCBITaxon:10090", "NCBITaxon:9606"])
458
461
  species = list(species)
459
462
  for i in set(
460
- bt.Organism.using("laminlabs/arc-virtual-cell-atlas").df().ontology_id
461
- ) - set(bt.Organism.filter().df().ontology_id):
463
+ bt.Organism.using("laminlabs/arc-virtual-cell-atlas")
464
+ .to_dataframe(limit=None)
465
+ .ontology_id
466
+ ) - set(bt.Organism.filter().to_dataframe(limit=None).ontology_id):
462
467
  print(i)
463
468
  rec = (
464
469
  bt.Organism.using("laminlabs/arc-virtual-cell-atlas")
@@ -485,7 +490,7 @@ def _adding_scbasecamp_genes(
485
490
  bt.Gene.using("laminlabs/arc-virtual-cell-atlas")
486
491
  .filter(organism__ontology_id=i)
487
492
  .all()
488
- .df()
493
+ .to_dataframe(limit=None)
489
494
  )
490
495
  genes = []
491
496
  org = bt.Organism.filter(ontology_id=i).one()
@@ -545,7 +550,11 @@ def populate_my_ontology(
545
550
  if len(celltypes) == 0:
546
551
  bt.CellType.import_source()
547
552
  else:
548
- names = bt.CellType.public().df().index if not celltypes else celltypes
553
+ names = (
554
+ bt.CellType.public().to_dataframe().index
555
+ if not celltypes
556
+ else celltypes
557
+ )
549
558
  records = bt.CellType.from_values(names, field="ontology_id")
550
559
  ln.save(records)
551
560
  elem = bt.CellType(name="unknown", ontology_id="unknown")
@@ -555,7 +564,7 @@ def populate_my_ontology(
555
564
  if organisms_clade is not None:
556
565
  records = []
557
566
  for organism_clade in organisms_clade:
558
- names = bt.Organism.public(organism=organism_clade).df().index
567
+ names = bt.Organism.public(organism=organism_clade).to_dataframe().index
559
568
  source = bt.Source.filter(name="ensembl", organism=organism_clade).last()
560
569
  for name in names:
561
570
  try:
@@ -581,7 +590,7 @@ def populate_my_ontology(
581
590
  ln.save([elem], ignore_conflicts=True)
582
591
  # Phenotype
583
592
  if sex is not None:
584
- names = bt.Phenotype.public().df().index if not sex else sex
593
+ names = bt.Phenotype.public().to_dataframe().index if not sex else sex
585
594
  source = bt.Source.filter(name="pato").first()
586
595
  records = [
587
596
  bt.Phenotype.from_source(ontology_id=i, source=source) for i in names
@@ -594,7 +603,11 @@ def populate_my_ontology(
594
603
  if len(ethnicities) == 0:
595
604
  bt.Ethnicity.import_source()
596
605
  else:
597
- names = bt.Ethnicity.public().df().index if not ethnicities else ethnicities
606
+ names = (
607
+ bt.Ethnicity.public().to_dataframe().index
608
+ if not ethnicities
609
+ else ethnicities
610
+ )
598
611
  records = bt.Ethnicity.from_values(names, field="ontology_id")
599
612
  ln.save(records)
600
613
  elem = bt.Ethnicity(name="unknown", ontology_id="unknown")
@@ -604,7 +617,11 @@ def populate_my_ontology(
604
617
  if len(assays) == 0:
605
618
  bt.ExperimentalFactor.import_source()
606
619
  else:
607
- names = bt.ExperimentalFactor.public().df().index if not assays else assays
620
+ names = (
621
+ bt.ExperimentalFactor.public().to_dataframe().index
622
+ if not assays
623
+ else assays
624
+ )
608
625
  records = bt.ExperimentalFactor.from_values(names, field="ontology_id")
609
626
  ln.save(records)
610
627
  elem = bt.ExperimentalFactor(name="unknown", ontology_id="unknown").save()
@@ -616,7 +633,11 @@ def populate_my_ontology(
616
633
  if len(tissues) == 0:
617
634
  bt.Tissue.import_source()
618
635
  else:
619
- names = bt.Tissue.public().df().index if not tissues else tissues
636
+ names = (
637
+ bt.Tissue.public().to_dataframe().index
638
+ if not tissues
639
+ else tissues
640
+ )
620
641
  records = bt.Tissue.from_values(names, field="ontology_id")
621
642
  ln.save(records)
622
643
  elem = bt.Tissue(name="unknown", ontology_id="unknown").save()
@@ -629,7 +650,7 @@ def populate_my_ontology(
629
650
  bt.DevelopmentalStage.import_source(source=source)
630
651
  else:
631
652
  names = (
632
- bt.DevelopmentalStage.public().df().index
653
+ bt.DevelopmentalStage.public().to_dataframe().index
633
654
  if not dev_stages
634
655
  else dev_stages
635
656
  )
@@ -641,7 +662,11 @@ def populate_my_ontology(
641
662
  if len(diseases) == 0:
642
663
  bt.Disease.import_source()
643
664
  else:
644
- names = bt.Disease.public().df().index if not diseases else diseases
665
+ names = (
666
+ bt.Disease.public().to_dataframe().index
667
+ if not diseases
668
+ else diseases
669
+ )
645
670
  records = bt.Disease.from_values(names, field="ontology_id")
646
671
  ln.save(records)
647
672
  bt.Disease(name="normal", ontology_id="PATO:0000461").save()
@@ -650,7 +675,9 @@ def populate_my_ontology(
650
675
  for organism in genes_from:
651
676
  # convert onto to name
652
677
  organism = bt.Organism.filter(ontology_id=organism).one().name
653
- names = bt.Gene.public(organism=organism).df()["ensembl_gene_id"]
678
+ names = bt.Gene.public(organism=organism).to_dataframe()[
679
+ "ensembl_gene_id"
680
+ ]
654
681
 
655
682
  # Process names in blocks of 10,000 elements
656
683
  block_size = 10000
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: scdataloader
3
- Version: 2.0.11
3
+ Version: 2.1.0
4
4
  Summary: a dataloader for single cell data in lamindb
5
5
  Project-URL: repository, https://github.com/jkobject/scDataLoader
6
6
  Author-email: jkobject <jkobject@gmail.com>
@@ -14,7 +14,7 @@ Requires-Dist: cellxgene-census>=0.1.0
14
14
  Requires-Dist: django>=4.0.0
15
15
  Requires-Dist: ipykernel>=6.20.0
16
16
  Requires-Dist: jupytext>=1.16.0
17
- Requires-Dist: lamindb[bionty,jupyter,zarr]==1.6.2
17
+ Requires-Dist: lamindb[gcp]==2.1.1
18
18
  Requires-Dist: leidenalg>=0.8.0
19
19
  Requires-Dist: lightning>=2.3.0
20
20
  Requires-Dist: matplotlib>=3.5.0
@@ -162,7 +162,7 @@ adata = preprocessor(adata)
162
162
 
163
163
  art = ln.Artifact(adata, description="test")
164
164
  art.save()
165
- ln.Collection(art, name="test", description="test").save()
165
+ ln.Collection(art, key="test", description="test").save()
166
166
 
167
167
  datamodule = DataModule(
168
168
  collection_name="test",
@@ -376,7 +376,7 @@ using absolute paths. You need to do 3 things:
376
376
 
377
377
  ```python
378
378
  import lamindb as ln
379
- ln.Storage.df()
379
+ ln.Storage.to_dataframe(limit=None)
380
380
  # view what is your current storage id (in my case it was GZgLW1TQ)
381
381
  ln.Storage.filter(uid="GZgLW1TI").update(
382
382
  root=Path("your_new_locations").as_posix().rstrip("/")
@@ -0,0 +1,15 @@
1
+ scdataloader/__init__.py,sha256=Z5HURehoWw1GrecImmTXIkv4ih8Q5RxNQWPm8zjjXOA,226
2
+ scdataloader/__main__.py,sha256=xPOtrEpQQQZUGTnm8KTvsQcA_jR45oMG_VHqd0Ny7_M,8677
3
+ scdataloader/base.py,sha256=M1gD59OffRdLOgS1vHKygOomUoAMuzjpRtAfM3SBKF8,338
4
+ scdataloader/collator.py,sha256=VcFJcVAIeKvYkG1DPRXzoBaw2wQ6D_0lsv5Mcv-9USI,17419
5
+ scdataloader/config.py,sha256=wGlCR3tWyEVa69ajovJKYc86CTCJR8e1xC7BTlUOJQE,34582
6
+ scdataloader/data.py,sha256=9883deD-fA-oS_6USNUokiJuVLLUo7_MsxU3qqwoWlA,25568
7
+ scdataloader/datamodule.py,sha256=pFBGUOHl3ibi8QhiV8x5ukjzVjnJMsZWNw3Ekk3P83Y,43810
8
+ scdataloader/mapped.py,sha256=h9YKQ8SG9tyZL8c6_Wu5Xov5ODGK6FzVuFopz58xwN4,29887
9
+ scdataloader/preprocess.py,sha256=hKnDhZoZkAClZRByOJcvSRykkdY6bhTbIsKge7NV-A0,40964
10
+ scdataloader/utils.py,sha256=4CbcCAR93az4wSVAra8xyOvq01iM57rqCdLbDejj2pw,28328
11
+ scdataloader-2.1.0.dist-info/METADATA,sha256=eHSH7fSk8AofI1fFE4SHXxlACiFS2U7tD16cHApR3hc,13451
12
+ scdataloader-2.1.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
13
+ scdataloader-2.1.0.dist-info/entry_points.txt,sha256=VXAN1m_CjbdLJ6SKYR0sBLGDV4wvv31ri7fWWuwbpno,60
14
+ scdataloader-2.1.0.dist-info/licenses/LICENSE,sha256=rGy_eYmnxtbOvKs7qt5V0czSWxJwgX_MlgMyTZwDHbc,1073
15
+ scdataloader-2.1.0.dist-info/RECORD,,
@@ -1,15 +0,0 @@
1
- scdataloader/__init__.py,sha256=Z5HURehoWw1GrecImmTXIkv4ih8Q5RxNQWPm8zjjXOA,226
2
- scdataloader/__main__.py,sha256=xPOtrEpQQQZUGTnm8KTvsQcA_jR45oMG_VHqd0Ny7_M,8677
3
- scdataloader/base.py,sha256=M1gD59OffRdLOgS1vHKygOomUoAMuzjpRtAfM3SBKF8,338
4
- scdataloader/collator.py,sha256=VcFJcVAIeKvYkG1DPRXzoBaw2wQ6D_0lsv5Mcv-9USI,17419
5
- scdataloader/config.py,sha256=wGlCR3tWyEVa69ajovJKYc86CTCJR8e1xC7BTlUOJQE,34582
6
- scdataloader/data.py,sha256=tXvONJNgcdMQIRh2KlAq9KCsf-Sz2L4GUlcGyf1OMhw,25160
7
- scdataloader/datamodule.py,sha256=pFBGUOHl3ibi8QhiV8x5ukjzVjnJMsZWNw3Ekk3P83Y,43810
8
- scdataloader/mapped.py,sha256=h9YKQ8SG9tyZL8c6_Wu5Xov5ODGK6FzVuFopz58xwN4,29887
9
- scdataloader/preprocess.py,sha256=VFmyJluk4drR4fcH5qBAcJLf0cJg26ElA0HDuHOK68s,40730
10
- scdataloader/utils.py,sha256=iiVQUKV_TDGWgg-6HWkGVIPfyHhz6csIUKclQSpoVCk,27737
11
- scdataloader-2.0.11.dist-info/METADATA,sha256=a8ypn147Y4ZqbiwnpDCw6VpNI0bbeDqVnr1vkBE7TW0,13449
12
- scdataloader-2.0.11.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
13
- scdataloader-2.0.11.dist-info/entry_points.txt,sha256=VXAN1m_CjbdLJ6SKYR0sBLGDV4wvv31ri7fWWuwbpno,60
14
- scdataloader-2.0.11.dist-info/licenses/LICENSE,sha256=rGy_eYmnxtbOvKs7qt5V0czSWxJwgX_MlgMyTZwDHbc,1073
15
- scdataloader-2.0.11.dist-info/RECORD,,