scdataloader 1.9.1__tar.gz → 2.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -135,3 +135,6 @@ figures/*/*.png
135
135
  figures/*.png
136
136
  figures/add_postp_clust.py
137
137
  figures/age_relabel.py
138
+ notebooks/figures/umap_*.png
139
+ notebooks/data/
140
+ data/gene_names/
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Jérémie Kalfon
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: scdataloader
3
- Version: 1.9.1
3
+ Version: 2.0.0
4
4
  Summary: a dataloader for single cell data in lamindb
5
5
  Project-URL: repository, https://github.com/jkobject/scDataLoader
6
6
  Author-email: jkobject <jkobject@gmail.com>
@@ -12,14 +12,13 @@ Requires-Dist: anndata>=0.9.0
12
12
  Requires-Dist: biomart>=0.9.0
13
13
  Requires-Dist: cellxgene-census>=0.1.0
14
14
  Requires-Dist: django>=4.0.0
15
- Requires-Dist: harmonypy>=0.0.10
16
15
  Requires-Dist: ipykernel>=6.20.0
17
16
  Requires-Dist: jupytext>=1.16.0
18
- Requires-Dist: lamindb[bionty,cellregistry,jupyter,ourprojects,zarr]<2,>=1.0.4
17
+ Requires-Dist: lamindb[bionty,cellregistry,jupyter,zarr]==1.0.4
19
18
  Requires-Dist: leidenalg>=0.8.0
19
+ Requires-Dist: lightning>=2.3.0
20
20
  Requires-Dist: matplotlib>=3.5.0
21
21
  Requires-Dist: numpy==1.26.0
22
- Requires-Dist: palantir>=1.3.3
23
22
  Requires-Dist: pandas>=2.0.0
24
23
  Requires-Dist: pytorch-lightning>=2.3.0
25
24
  Requires-Dist: scikit-misc>=0.5.0
@@ -71,7 +70,16 @@ It allows you to:
71
70
  3. create a more complex single cell dataset
72
71
  4. extend it to your needs
73
72
 
74
- built on top of `lamindb` and the `.mapped()` function by Sergei: https://github.com/Koncopd
73
+ built on top of `lamindb` and the `.mapped()` function by Sergei: https://github.com/Koncopd
74
+
75
+ ```
76
+ Portions of the mapped.py file are derived from Lamin Labs
77
+ Copyright 2024 Lamin Labs
78
+ Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
79
+ The rest of the package is licensed under MIT License, see LICENSE for details
80
+ Please see https://github.com/laminlabs/lamindb/blob/main/lamindb/core/_mapped_collection.py
81
+ for the original implementation
82
+ ```
75
83
 
76
84
  The package has been designed together with the [scPRINT paper](https://doi.org/10.1101/2024.07.29.605556) and [model](https://github.com/cantinilab/scPRINT).
77
85
 
@@ -28,7 +28,16 @@ It allows you to:
28
28
  3. create a more complex single cell dataset
29
29
  4. extend it to your needs
30
30
 
31
- built on top of `lamindb` and the `.mapped()` function by Sergei: https://github.com/Koncopd
31
+ built on top of `lamindb` and the `.mapped()` function by Sergei: https://github.com/Koncopd
32
+
33
+ ```
34
+ Portions of the mapped.py file are derived from Lamin Labs
35
+ Copyright 2024 Lamin Labs
36
+ Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
37
+ The rest of the package is licensed under MIT License, see LICENSE for details
38
+ Please see https://github.com/laminlabs/lamindb/blob/main/lamindb/core/_mapped_collection.py
39
+ for the original implementation
40
+ ```
32
41
 
33
42
  The package has been designed together with the [scPRINT paper](https://doi.org/10.1101/2024.07.29.605556) and [model](https://github.com/cantinilab/scPRINT).
34
43
 
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "scdataloader"
3
- version = "1.9.1"
3
+ version = "2.0.0"
4
4
  description = "a dataloader for single cell data in lamindb"
5
5
  authors = [
6
6
  {name = "jkobject", email = "jkobject@gmail.com"}
@@ -11,7 +11,7 @@ requires-python = ">=3.10,<3.14"
11
11
  keywords = ["scRNAseq", "dataloader", "pytorch", "lamindb", "scPRINT"]
12
12
  dependencies = [
13
13
  "numpy==1.26.0",
14
- "lamindb[bionty,ourprojects,jupyter,cellregistry,zarr]>=1.0.4,<2",
14
+ "lamindb[bionty,jupyter,cellregistry,zarr]==1.0.4",
15
15
  "cellxgene-census>=0.1.0",
16
16
  "torch==2.2.0",
17
17
  "pytorch-lightning>=2.3.0",
@@ -26,10 +26,9 @@ dependencies = [
26
26
  "leidenalg>=0.8.0",
27
27
  "django>=4.0.0",
28
28
  "scikit-misc>=0.5.0",
29
- "palantir>=1.3.3",
30
- "harmonypy>=0.0.10",
31
29
  "jupytext>=1.16.0",
32
-
30
+ "lightning>=2.3.0",
31
+ "pytorch-lightning>=2.3.0",
33
32
  ]
34
33
 
35
34
  [project.optional-dependencies]
@@ -1,7 +1,8 @@
1
+ from importlib.metadata import version
2
+
1
3
  from .collator import Collator
2
4
  from .data import Dataset, SimpleAnnDataset
3
5
  from .datamodule import DataModule
4
6
  from .preprocess import Preprocessor
5
- from importlib.metadata import version
6
7
 
7
8
  __version__ = version("scdataloader")
@@ -3,7 +3,7 @@ from typing import Optional
3
3
  import numpy as np
4
4
  from torch import Tensor, long
5
5
 
6
- from .utils import downsample_profile, load_genes
6
+ from .utils import load_genes
7
7
 
8
8
 
9
9
  class Collator:
@@ -22,8 +22,6 @@ class Collator:
22
22
  organism_name: str = "organism_ontology_term_id",
23
23
  class_names: list[str] = [],
24
24
  genelist: list[str] = [],
25
- downsample: Optional[float] = None, # don't use it for training!
26
- save_output: Optional[str] = None,
27
25
  ):
28
26
  """
29
27
  This class is responsible for collating data for the scPRINT model. It handles the
@@ -57,13 +55,8 @@ class Collator:
57
55
  class_names (list, optional): List of other classes to be considered. Defaults to [].
58
56
  genelist (list, optional): List of genes to be considered. Defaults to [].
59
57
  If [] all genes will be considered
60
- downsample (float, optional): Downsample the profile to a certain number of cells. Defaults to None.
61
- This is usually done by the scPRINT model during training but this option allows you to do it directly from the collator
62
- save_output (str, optional): If not None, saves the output to a file. Defaults to None.
63
- This is mainly for debugging purposes
64
58
  """
65
59
  self.organisms = organisms
66
- self.genedf = load_genes(organisms)
67
60
  self.max_len = max_len
68
61
  self.n_bins = n_bins
69
62
  self.add_zero_genes = add_zero_genes
@@ -75,14 +68,14 @@ class Collator:
75
68
  self.organism_name = organism_name
76
69
  self.tp_name = tp_name
77
70
  self.class_names = class_names
78
- self.save_output = save_output
79
71
  self.start_idx = {}
80
72
  self.accepted_genes = {}
81
- self.downsample = downsample
82
73
  self.to_subset = {}
83
- self._setup(org_to_id, valid_genes, genelist)
74
+ self._setup(None, org_to_id, valid_genes, genelist)
84
75
 
85
- def _setup(self, org_to_id=None, valid_genes=[], genelist=[]):
76
+ def _setup(self, genedf=None, org_to_id=None, valid_genes=[], genelist=[]):
77
+ if genedf is None:
78
+ genedf = load_genes(self.organisms)
86
79
  self.org_to_id = org_to_id
87
80
  self.to_subset = {}
88
81
  self.accepted_genes = {}
@@ -92,14 +85,17 @@ class Collator:
92
85
  if org_to_id is not None
93
86
  else set(self.organisms)
94
87
  )
88
+ if len(valid_genes) > 0:
89
+ if len(set(valid_genes) - set(genedf.index)) > 0:
90
+ print("Some valid genes are not in the genedf!!!")
91
+ tot = genedf[genedf.index.isin(valid_genes)]
92
+ else:
93
+ tot = genedf
95
94
  for organism in self.organisms:
96
- ogenedf = self.genedf[self.genedf.organism == organism]
97
- if len(valid_genes) > 0:
98
- tot = self.genedf[self.genedf.index.isin(valid_genes)]
99
- else:
100
- tot = self.genedf
101
95
  org = org_to_id[organism] if org_to_id is not None else organism
102
96
  self.start_idx.update({org: np.where(tot.organism == organism)[0][0]})
97
+
98
+ ogenedf = genedf[genedf.organism == organism]
103
99
  if len(valid_genes) > 0:
104
100
  self.accepted_genes.update({org: ogenedf.index.isin(valid_genes)})
105
101
  if len(genelist) > 0:
@@ -148,7 +144,6 @@ class Collator:
148
144
  :, self.accepted_genes[organism_id]
149
145
  ]
150
146
  if self.how == "most expr":
151
- nnz_loc = np.where(expr > 0)[0]
152
147
  if "knn_cells" in elem:
153
148
  nnz_loc = np.where(expr + elem["knn_cells"].sum(0) > 0)[0]
154
149
  ma = self.max_len if self.max_len < len(nnz_loc) else len(nnz_loc)
@@ -161,14 +156,18 @@ class Collator:
161
156
  # loc = np.argsort(expr)[-(self.max_len) :][::-1]
162
157
  elif self.how == "random expr":
163
158
  nnz_loc = np.where(expr > 0)[0]
164
- loc = nnz_loc[
165
- np.random.choice(
166
- len(nnz_loc),
167
- self.max_len if self.max_len < len(nnz_loc) else len(nnz_loc),
168
- replace=False,
169
- # p=(expr.max() + (expr[nnz_loc])*19) / expr.max(), # 20 at most times more likely to be selected
170
- )
171
- ]
159
+ loc = (
160
+ nnz_loc[
161
+ np.random.choice(
162
+ len(nnz_loc),
163
+ self.max_len,
164
+ replace=False,
165
+ # p=(expr.max() + (expr[nnz_loc])*19) / expr.max(), # 20 at most times more likely to be selected
166
+ )
167
+ ]
168
+ if self.max_len < len(nnz_loc)
169
+ else nnz_loc
170
+ )
172
171
  elif self.how in ["all", "some"]:
173
172
  loc = np.arange(len(expr))
174
173
  else:
@@ -179,23 +178,19 @@ class Collator:
179
178
  "all",
180
179
  "some",
181
180
  ]:
181
+ ma = self.add_zero_genes + (
182
+ 0 if self.max_len < len(nnz_loc) else self.max_len - len(nnz_loc)
183
+ )
182
184
  if "knn_cells" in elem:
183
185
  # we complete with genes expressed in the knn
184
- nnz_loc = np.where(elem["knn_cells"].sum(0) > 0)[0]
185
- ma = self.max_len if self.max_len < len(nnz_loc) else len(nnz_loc)
186
186
  # which is not a zero_loc in this context
187
- zero_loc = np.argsort(elem["knn_cells"].sum(0))[-(ma):][::-1]
187
+ zero_loc = np.argsort(elem["knn_cells"].sum(0))[-ma:][::-1]
188
188
  else:
189
189
  zero_loc = np.where(expr == 0)[0]
190
190
  zero_loc = zero_loc[
191
191
  np.random.choice(
192
192
  len(zero_loc),
193
- self.add_zero_genes
194
- + (
195
- 0
196
- if self.max_len < len(nnz_loc)
197
- else self.max_len - len(nnz_loc)
198
- ),
193
+ ma,
199
194
  replace=False,
200
195
  )
201
196
  ]
@@ -255,13 +250,6 @@ class Collator:
255
250
  ret.update({"knn_cells": Tensor(knn_cells)})
256
251
  if len(dataset) > 0:
257
252
  ret.update({"dataset": Tensor(dataset).to(long)})
258
- if self.downsample is not None:
259
- ret["x"] = downsample_profile(ret["x"], self.downsample)
260
- if self.save_output is not None:
261
- with open(self.save_output, "a") as f:
262
- np.savetxt(f, ret["x"].numpy())
263
- with open(self.save_output + "_loc", "a") as f:
264
- np.savetxt(f, gene_locs)
265
253
  return ret
266
254
 
267
255
 
@@ -113,26 +113,34 @@ COARSE_ASSAY = {
113
113
 
114
114
 
115
115
  MAIN_HUMAN_MOUSE_DEV_STAGE_MAP = {
116
- "HsapDv:0010000": [
116
+ "HsapDv:0010000": [ # postnatal stage
117
117
  "MmusDv:0000092", # postnatal stage
118
118
  ],
119
- "HsapDv:0000258": [ # mature stage
119
+ "HsapDv:0000258": [ # mature stage >15
120
120
  "MmusDv:0000110", # mature stage
121
- "HsapDv:0000204", #
121
+ "HsapDv:0000204", #
122
122
  ],
123
- "HsapDv:0000227": [ # late adult stage
123
+ "HsapDv:0000087": [], # adult stage >19
124
+ "HsapDv:0000227": [ # late adult stage > 40
124
125
  "MmusDv:0000091", # 20 month-old stage
125
126
  "MmusDv:0000089", # 18 month-old stage
127
+ "HsapDv:0000091", # > 45
128
+ "HsapDv:0000093", # > 65
129
+ ],
130
+ "HsapDv:0000272": [ # 60-79 year-old stage
131
+ "HsapDv:0000094", # 60-79 year-old stage
126
132
  ],
127
- "HsapDv:0000272": [], # 60-79 year-old stage
128
133
  "HsapDv:0000095": [], # 80 year-old and over stage
129
- "HsapDv:0000267": [ # middle aged stage
134
+ "HsapDv:0000267": [ # middle aged stage >40 <60
130
135
  "MmusDv:0000087", # 16 month-old stage
131
136
  "UBERON:0018241", # prime adult stage
132
137
  "MmusDv:0000083", # 12 month-old stage
133
138
  "HsapDv:0000092", # same
134
139
  ],
135
- "HsapDv:0000266": [ # young adult stage
140
+ "HsapDv:0000266": [ # young adult stage <40
141
+ "HsapDv:0000088", # mature stage
142
+ "HsapDv:0000090", # 25 - 44
143
+ "HsapDv:0000086", # adolescent stage
136
144
  "MmusDv:0000050", # 6 weeks
137
145
  "HsapDv:0000089", # same
138
146
  "MmusDv:0000051", # 7 weeks
@@ -163,22 +171,30 @@ MAIN_HUMAN_MOUSE_DEV_STAGE_MAP = {
163
171
  "MmusDv:0000099", # 26 weeks
164
172
  "MmusDv:0000102", # 29 weeks
165
173
  ],
166
- "HsapDv:0000265": [], # child stage (1-4 yo)
174
+ "HsapDv:0000265": [ # child stage (1-4 yo)
175
+ "HsapDv:0000084", # 2-5 yo
176
+ ],
167
177
  "HsapDv:0000271": [ # juvenile stage (5-14 yo)
168
178
  "MmusDv:0000048", # 4 weeks
169
179
  "MmusDv:0000049", # 5 weeks
180
+ "HsapDv:0000081", # child
181
+ "HsapDv:0000085", # 6-11 yo
170
182
  ],
171
- "HsapDv:0000260": [ # infant stage
183
+ "HsapDv:0000260": [ # infant stage <2
172
184
  "MmusDv:0000046", # 2 weeks
173
185
  "MmusDv:0000045", # 1 week
174
186
  "MmusDv:0000047", # 3 weeks
175
187
  "HsapDv:0000083",
188
+ "HsapDv:0000256", # under 1 yo
176
189
  ],
177
190
  "HsapDv:0000262": [ # newborn stage (0-28 days)
178
191
  "MmusDv:0000036", # Theiler stage 27
179
192
  "MmusDv:0000037", # Theiler stage 28
180
193
  "MmusDv:0000113", # 4-7 days
194
+ "HsapDv:0000174", # 1 month-old stage
195
+ "HsapDv:0000082", # newborn stage
181
196
  ],
197
+ "HsapDv:0000002": [], # embryonic stage
182
198
  "HsapDv:0000007": [], # Carnegie stage 03
183
199
  "HsapDv:0000008": [], # Carnegie stage 04
184
200
  "HsapDv:0000009": [], # Carnegie stage 05