gengeneeval 0.2.0__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. {gengeneeval-0.2.0 → gengeneeval-0.3.0}/PKG-INFO +111 -4
  2. {gengeneeval-0.2.0 → gengeneeval-0.3.0}/README.md +107 -1
  3. {gengeneeval-0.2.0 → gengeneeval-0.3.0}/pyproject.toml +4 -3
  4. {gengeneeval-0.2.0 → gengeneeval-0.3.0}/src/geneval/__init__.py +52 -1
  5. {gengeneeval-0.2.0 → gengeneeval-0.3.0}/src/geneval/data/__init__.py +14 -0
  6. gengeneeval-0.3.0/src/geneval/data/lazy_loader.py +562 -0
  7. {gengeneeval-0.2.0 → gengeneeval-0.3.0}/src/geneval/evaluator.py +46 -0
  8. gengeneeval-0.3.0/src/geneval/lazy_evaluator.py +424 -0
  9. {gengeneeval-0.2.0 → gengeneeval-0.3.0}/src/geneval/metrics/__init__.py +25 -0
  10. gengeneeval-0.3.0/src/geneval/metrics/accelerated.py +857 -0
  11. {gengeneeval-0.2.0 → gengeneeval-0.3.0}/LICENSE +0 -0
  12. {gengeneeval-0.2.0 → gengeneeval-0.3.0}/src/geneval/cli.py +0 -0
  13. {gengeneeval-0.2.0 → gengeneeval-0.3.0}/src/geneval/config.py +0 -0
  14. {gengeneeval-0.2.0 → gengeneeval-0.3.0}/src/geneval/core.py +0 -0
  15. {gengeneeval-0.2.0 → gengeneeval-0.3.0}/src/geneval/data/gene_expression_datamodule.py +0 -0
  16. {gengeneeval-0.2.0 → gengeneeval-0.3.0}/src/geneval/data/loader.py +0 -0
  17. {gengeneeval-0.2.0 → gengeneeval-0.3.0}/src/geneval/evaluators/__init__.py +0 -0
  18. {gengeneeval-0.2.0 → gengeneeval-0.3.0}/src/geneval/evaluators/base_evaluator.py +0 -0
  19. {gengeneeval-0.2.0 → gengeneeval-0.3.0}/src/geneval/evaluators/gene_expression_evaluator.py +0 -0
  20. {gengeneeval-0.2.0 → gengeneeval-0.3.0}/src/geneval/metrics/base_metric.py +0 -0
  21. {gengeneeval-0.2.0 → gengeneeval-0.3.0}/src/geneval/metrics/correlation.py +0 -0
  22. {gengeneeval-0.2.0 → gengeneeval-0.3.0}/src/geneval/metrics/distances.py +0 -0
  23. {gengeneeval-0.2.0 → gengeneeval-0.3.0}/src/geneval/metrics/metrics.py +0 -0
  24. {gengeneeval-0.2.0 → gengeneeval-0.3.0}/src/geneval/metrics/reconstruction.py +0 -0
  25. {gengeneeval-0.2.0 → gengeneeval-0.3.0}/src/geneval/models/__init__.py +0 -0
  26. {gengeneeval-0.2.0 → gengeneeval-0.3.0}/src/geneval/models/base_model.py +0 -0
  27. {gengeneeval-0.2.0 → gengeneeval-0.3.0}/src/geneval/results.py +0 -0
  28. {gengeneeval-0.2.0 → gengeneeval-0.3.0}/src/geneval/testing.py +0 -0
  29. {gengeneeval-0.2.0 → gengeneeval-0.3.0}/src/geneval/utils/__init__.py +0 -0
  30. {gengeneeval-0.2.0 → gengeneeval-0.3.0}/src/geneval/utils/io.py +0 -0
  31. {gengeneeval-0.2.0 → gengeneeval-0.3.0}/src/geneval/utils/preprocessing.py +0 -0
  32. {gengeneeval-0.2.0 → gengeneeval-0.3.0}/src/geneval/visualization/__init__.py +0 -0
  33. {gengeneeval-0.2.0 → gengeneeval-0.3.0}/src/geneval/visualization/plots.py +0 -0
  34. {gengeneeval-0.2.0 → gengeneeval-0.3.0}/src/geneval/visualization/visualizer.py +0 -0
@@ -1,10 +1,10 @@
  Metadata-Version: 2.4
  Name: gengeneeval
- Version: 0.2.0
- Summary: Comprehensive evaluation of generated gene expression data. Computes metrics between real and generated datasets with support for condition matching, train/test splits, and publication-quality visualizations.
+ Version: 0.3.0
+ Summary: Comprehensive evaluation of generated gene expression data. Computes metrics between real and generated datasets with support for condition matching, train/test splits, memory-efficient lazy loading, CPU parallelization, GPU acceleration, and publication-quality visualizations.
  License: MIT
  License-File: LICENSE
- Keywords: gene expression,evaluation,metrics,single-cell,generative models,benchmarking
+ Keywords: gene expression,evaluation,metrics,single-cell,generative models,benchmarking,memory-efficient
  Author: GenEval Team
  Author-email: geneval@example.com
  Requires-Python: >=3.8,<4.0
@@ -24,6 +24,7 @@ Provides-Extra: full
  Provides-Extra: gpu
  Requires-Dist: anndata (>=0.8.0)
  Requires-Dist: geomloss (>=0.2.1) ; extra == "full" or extra == "gpu"
+ Requires-Dist: joblib (>=1.0.0)
  Requires-Dist: matplotlib (>=3.5.0)
  Requires-Dist: numpy (>=1.21.0)
  Requires-Dist: pandas (>=1.3.0)
@@ -46,7 +47,7 @@ Description-Content-Type: text/markdown

  **Comprehensive evaluation of generated gene expression data against real datasets.**

- GenEval is a modular, object-oriented Python framework for computing metrics between real and generated gene expression datasets stored in AnnData (h5ad) format. It supports condition-based matching, train/test splits, and generates publication-quality visualizations.
+ GenEval is a modular, object-oriented Python framework for computing metrics between real and generated gene expression datasets stored in AnnData (h5ad) format. It supports condition-based matching, train/test splits, memory-efficient lazy loading for large datasets, and generates publication-quality visualizations.

  ## Features

@@ -77,6 +78,10 @@ All metrics are computed **per-gene** (returning a vector) and **aggregated**:
  - ✅ Condition-based matching (perturbation, cell type, etc.)
  - ✅ Train/test split support
  - ✅ Per-gene and aggregate metrics
+ - ✅ **Memory-efficient lazy loading** for large datasets
+ - ✅ **Batched evaluation** to avoid OOM errors
+ - ✅ **CPU parallelization** via joblib (multi-core speedup)
+ - ✅ **GPU acceleration** via PyTorch (10-100x speedup)
  - ✅ Modular, extensible architecture
  - ✅ Command-line interface
  - ✅ Publication-quality visualizations
@@ -140,6 +145,108 @@ geneval --real real.h5ad --generated generated.h5ad \
  --output results/
  ```

+ ### Memory-Efficient Mode (for Large Datasets)
+
+ For datasets too large to fit in memory, use the lazy evaluation API:
+
+ ```python
+ from geneval import evaluate_lazy, load_data_lazy
+
+ # Memory-efficient evaluation (streams data one condition at a time)
+ results = evaluate_lazy(
+     real_path="large_real.h5ad",
+     generated_path="large_generated.h5ad",
+     condition_columns=["perturbation"],
+     batch_size=256,           # Process in batches
+     use_backed=True,          # Memory-mapped file access
+     output_dir="eval_output/",
+     save_per_condition=True,  # Save each condition to disk
+ )
+
+ # Get summary statistics
+ print(results.get_summary())
+
+ # Or use the lazy loader directly for custom workflows
+ with load_data_lazy("real.h5ad", "gen.h5ad", ["perturbation"]) as loader:
+     print(f"Memory estimate: {loader.estimate_memory_usage()}")
+
+     # Process one condition at a time
+     for key, real, gen, info in loader.iterate_conditions():
+         # Your custom evaluation logic
+         pass
+ ```
+
+ ### Accelerated Evaluation (CPU Parallelization & GPU)
+
+ GenEval supports CPU parallelization and GPU acceleration for significant speedups:
+
+ ```python
+ from geneval import evaluate, get_available_backends
+
+ # Check available backends
+ print(get_available_backends())
+ # {'joblib': True, 'torch': True, 'geomloss': True, 'cuda': True, 'mps': False}
+
+ # Parallel CPU evaluation (use all cores)
+ results = evaluate(
+     real_path="real.h5ad",
+     generated_path="generated.h5ad",
+     condition_columns=["perturbation"],
+     n_jobs=-1,  # Use all available CPU cores
+ )
+
+ # GPU-accelerated evaluation
+ results = evaluate(
+     real_path="real.h5ad",
+     generated_path="generated.h5ad",
+     condition_columns=["perturbation"],
+     device="cuda",  # Use NVIDIA GPU
+ )
+
+ # Combined: parallel CPU + auto device selection
+ results = evaluate(..., n_jobs=8, device="auto")
+ ```
+
+ #### Low-level Accelerated API
+
+ For custom workflows, use the accelerated metrics directly:
+
+ ```python
+ from geneval.metrics.accelerated import (
+     compute_metrics_accelerated,
+     GPUWasserstein1,
+     GPUMMD,
+     vectorized_wasserstein1,
+ )
+ import numpy as np
+
+ # Load your data
+ real = np.random.randn(1000, 5000)  # 1000 cells, 5000 genes
+ generated = np.random.randn(1000, 5000)
+
+ # Compute multiple metrics with acceleration
+ results = compute_metrics_accelerated(
+     real, generated,
+     metrics=["wasserstein_1", "wasserstein_2", "mmd", "energy"],
+     n_jobs=8,        # CPU parallelization
+     device="cuda",   # GPU acceleration
+     verbose=True,
+ )
+
+ # Access results
+ print(f"W1: {results['wasserstein_1'].aggregate_value:.4f}")
+ print(f"MMD: {results['mmd'].aggregate_value:.4f}")
+ ```
+
+ #### Performance Tips
+
+ | Optimization | Speedup | When to Use |
+ |--------------|---------|-------------|
+ | `n_jobs=-1` (all cores) | 4-16x | Always (if joblib available) |
+ | `device="cuda"` | 10-100x | Large datasets, NVIDIA GPU available |
+ | `device="mps"` | 5-20x | Apple Silicon Macs |
+ | Vectorized NumPy | 2-5x | Automatic fallback |
+
  ## Expected Data Format

  GenEval expects AnnData (h5ad) files with:
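The combined call at the end of the accelerated-evaluation example above is abbreviated with `...`. Below is a minimal sketch of what the full call could look like, reusing only the keyword arguments that appear in the two complete `evaluate()` examples in the README (`real_path`, `generated_path`, `condition_columns`, `n_jobs`, `device`); treat it as illustrative rather than the package's documented form.

```python
from geneval import evaluate

# Illustrative only: combines the CPU-parallel and GPU examples from the README
# above; argument names are taken from those examples, not verified against the
# 0.3.0 source.
results = evaluate(
    real_path="real.h5ad",
    generated_path="generated.h5ad",
    condition_columns=["perturbation"],
    n_jobs=8,        # CPU parallelization across 8 workers
    device="auto",   # per the README: picks CUDA/MPS when available, else CPU
)
```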
@@ -7,7 +7,7 @@

  **Comprehensive evaluation of generated gene expression data against real datasets.**

- GenEval is a modular, object-oriented Python framework for computing metrics between real and generated gene expression datasets stored in AnnData (h5ad) format. It supports condition-based matching, train/test splits, and generates publication-quality visualizations.
+ GenEval is a modular, object-oriented Python framework for computing metrics between real and generated gene expression datasets stored in AnnData (h5ad) format. It supports condition-based matching, train/test splits, memory-efficient lazy loading for large datasets, and generates publication-quality visualizations.

  ## Features

@@ -38,6 +38,10 @@ All metrics are computed **per-gene** (returning a vector) and **aggregated**:
  - ✅ Condition-based matching (perturbation, cell type, etc.)
  - ✅ Train/test split support
  - ✅ Per-gene and aggregate metrics
+ - ✅ **Memory-efficient lazy loading** for large datasets
+ - ✅ **Batched evaluation** to avoid OOM errors
+ - ✅ **CPU parallelization** via joblib (multi-core speedup)
+ - ✅ **GPU acceleration** via PyTorch (10-100x speedup)
  - ✅ Modular, extensible architecture
  - ✅ Command-line interface
  - ✅ Publication-quality visualizations
@@ -101,6 +105,108 @@ geneval --real real.h5ad --generated generated.h5ad \
  --output results/
  ```

+ ### Memory-Efficient Mode (for Large Datasets)
+
+ For datasets too large to fit in memory, use the lazy evaluation API:
+
+ ```python
+ from geneval import evaluate_lazy, load_data_lazy
+
+ # Memory-efficient evaluation (streams data one condition at a time)
+ results = evaluate_lazy(
+     real_path="large_real.h5ad",
+     generated_path="large_generated.h5ad",
+     condition_columns=["perturbation"],
+     batch_size=256,           # Process in batches
+     use_backed=True,          # Memory-mapped file access
+     output_dir="eval_output/",
+     save_per_condition=True,  # Save each condition to disk
+ )
+
+ # Get summary statistics
+ print(results.get_summary())
+
+ # Or use the lazy loader directly for custom workflows
+ with load_data_lazy("real.h5ad", "gen.h5ad", ["perturbation"]) as loader:
+     print(f"Memory estimate: {loader.estimate_memory_usage()}")
+
+     # Process one condition at a time
+     for key, real, gen, info in loader.iterate_conditions():
+         # Your custom evaluation logic
+         pass
+ ```
+
+ ### Accelerated Evaluation (CPU Parallelization & GPU)
+
+ GenEval supports CPU parallelization and GPU acceleration for significant speedups:
+
+ ```python
+ from geneval import evaluate, get_available_backends
+
+ # Check available backends
+ print(get_available_backends())
+ # {'joblib': True, 'torch': True, 'geomloss': True, 'cuda': True, 'mps': False}
+
+ # Parallel CPU evaluation (use all cores)
+ results = evaluate(
+     real_path="real.h5ad",
+     generated_path="generated.h5ad",
+     condition_columns=["perturbation"],
+     n_jobs=-1,  # Use all available CPU cores
+ )
+
+ # GPU-accelerated evaluation
+ results = evaluate(
+     real_path="real.h5ad",
+     generated_path="generated.h5ad",
+     condition_columns=["perturbation"],
+     device="cuda",  # Use NVIDIA GPU
+ )
+
+ # Combined: parallel CPU + auto device selection
+ results = evaluate(..., n_jobs=8, device="auto")
+ ```
+
+ #### Low-level Accelerated API
+
+ For custom workflows, use the accelerated metrics directly:
+
+ ```python
+ from geneval.metrics.accelerated import (
+     compute_metrics_accelerated,
+     GPUWasserstein1,
+     GPUMMD,
+     vectorized_wasserstein1,
+ )
+ import numpy as np
+
+ # Load your data
+ real = np.random.randn(1000, 5000)  # 1000 cells, 5000 genes
+ generated = np.random.randn(1000, 5000)
+
+ # Compute multiple metrics with acceleration
+ results = compute_metrics_accelerated(
+     real, generated,
+     metrics=["wasserstein_1", "wasserstein_2", "mmd", "energy"],
+     n_jobs=8,        # CPU parallelization
+     device="cuda",   # GPU acceleration
+     verbose=True,
+ )
+
+ # Access results
+ print(f"W1: {results['wasserstein_1'].aggregate_value:.4f}")
+ print(f"MMD: {results['mmd'].aggregate_value:.4f}")
+ ```
+
+ #### Performance Tips
+
+ | Optimization | Speedup | When to Use |
+ |--------------|---------|-------------|
+ | `n_jobs=-1` (all cores) | 4-16x | Always (if joblib available) |
+ | `device="cuda"` | 10-100x | Large datasets, NVIDIA GPU available |
+ | `device="mps"` | 5-20x | Apple Silicon Macs |
+ | Vectorized NumPy | 2-5x | Automatic fallback |
+
  ## Expected Data Format

  GenEval expects AnnData (h5ad) files with:
@@ -1,13 +1,13 @@
  [tool.poetry]
  name = "gengeneeval"
- version = "0.2.0"
- description = "Comprehensive evaluation of generated gene expression data. Computes metrics between real and generated datasets with support for condition matching, train/test splits, and publication-quality visualizations."
+ version = "0.3.0"
+ description = "Comprehensive evaluation of generated gene expression data. Computes metrics between real and generated datasets with support for condition matching, train/test splits, memory-efficient lazy loading, CPU parallelization, GPU acceleration, and publication-quality visualizations."
  authors = ["GenEval Team <geneval@example.com>"]
  license = "MIT"
  readme = "README.md"
  homepage = "https://github.com/AndreaRubbi/GenGeneEval"
  repository = "https://github.com/AndreaRubbi/GenGeneEval"
- keywords = ["gene expression", "evaluation", "metrics", "single-cell", "generative models", "benchmarking"]
+ keywords = ["gene expression", "evaluation", "metrics", "single-cell", "generative models", "benchmarking", "memory-efficient"]
  classifiers = [
      "Development Status :: 4 - Beta",
      "Intended Audience :: Science/Research",
@@ -29,6 +29,7 @@ scipy = ">=1.7.0"
  torch = ">=1.9.0"
  matplotlib = ">=3.5.0"
  seaborn = ">=0.11.0"
+ joblib = ">=1.0.0"
  geomloss = {version = ">=0.2.1", optional = true}
  pykeops = {version = ">=1.4.0", optional = true}
  umap-learn = {version = ">=0.5.0", optional = true}
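The dependency changes above make `joblib` a hard requirement while `geomloss` and `pykeops` remain optional extras, so acceleration availability still varies by environment. The sketch below gates `n_jobs` and `device` on `get_available_backends()`; the dict keys and the `compute_metrics_accelerated()` signature are taken from the README examples in this diff, and the `"cpu"` device string is an assumption not shown here.

```python
import numpy as np

from geneval import get_available_backends
from geneval.metrics.accelerated import compute_metrics_accelerated

# Keys mirror the README's example output:
# {'joblib': ..., 'torch': ..., 'geomloss': ..., 'cuda': ..., 'mps': ...}
backends = get_available_backends()

# Prefer CUDA, then MPS, following the ordering in the Performance Tips table;
# "cpu" as a fallback value is assumed, not documented in this diff.
if backends.get("cuda"):
    device = "cuda"
elif backends.get("mps"):
    device = "mps"
else:
    device = "cpu"

# joblib is now a required dependency, but guard anyway for older installs.
n_jobs = -1 if backends.get("joblib") else 1

real = np.random.randn(500, 2000)       # toy data: 500 cells, 2000 genes
generated = np.random.randn(500, 2000)

results = compute_metrics_accelerated(
    real, generated,
    metrics=["wasserstein_1", "mmd"],
    n_jobs=n_jobs,
    device=device,
)
```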
@@ -8,6 +8,7 @@ Features:
  - Multiple distance and correlation metrics (per-gene and aggregate)
  - Condition-based matching (perturbation, cell type, etc.)
  - Train/test split support
+ - Memory-efficient lazy loading for large datasets
  - Publication-quality visualizations
  - Command-line interface

@@ -20,12 +21,22 @@ Quick Start:
  ... output_dir="output/"
  ... )

+ Memory-Efficient Mode (for large datasets):
+ >>> from geneval import evaluate_lazy
+ >>> results = evaluate_lazy(
+ ...     real_path="real.h5ad",
+ ...     generated_path="generated.h5ad",
+ ...     condition_columns=["perturbation"],
+ ...     batch_size=256,
+ ...     use_backed=True,  # Memory-mapped access
+ ... )
+
  CLI Usage:
  $ geneval --real real.h5ad --generated generated.h5ad \\
      --conditions perturbation cell_type --output results/
  """

- __version__ = "0.2.0"
+ __version__ = "0.3.0"
  __author__ = "GenEval Team"

  # Main evaluation interface
@@ -35,12 +46,26 @@ from .evaluator import (
      MetricRegistry,
  )

+ # Memory-efficient evaluation
+ from .lazy_evaluator import (
+     evaluate_lazy,
+     MemoryEfficientEvaluator,
+     StreamingEvaluationResult,
+ )
+
  # Data loading
  from .data.loader import (
      GeneExpressionDataLoader,
      load_data,
  )

+ # Memory-efficient data loading
+ from .data.lazy_loader import (
+     LazyGeneExpressionDataLoader,
+     load_data_lazy,
+     ConditionBatch,
+ )
+
  # Results
  from .results import (
      EvaluationResult,
@@ -76,6 +101,14 @@ from .metrics.reconstruction import (
      R2Score,
  )

+ # Accelerated computation
+ from .metrics.accelerated import (
+     AccelerationConfig,
+     ParallelMetricComputer,
+     get_available_backends,
+     compute_metrics_accelerated,
+ )
+
  # Visualization
  from .visualization.visualizer import (
      EvaluationVisualizer,
@@ -99,9 +132,17 @@ __all__ = [
      "evaluate",
      "GeneEvalEvaluator",
      "MetricRegistry",
+     # Memory-efficient evaluation
+     "evaluate_lazy",
+     "MemoryEfficientEvaluator",
+     "StreamingEvaluationResult",
      # Data loading
      "GeneExpressionDataLoader",
      "load_data",
+     # Memory-efficient data loading
+     "LazyGeneExpressionDataLoader",
+     "load_data_lazy",
+     "ConditionBatch",
      # Results
      "EvaluationResult",
      "SplitResult",
@@ -123,6 +164,16 @@ __all__ = [
      "EnergyDistance",
      "MultivariateWasserstein",
      "MultivariateMMD",
+     # Reconstruction metrics
+     "MSEDistance",
+     "RMSEDistance",
+     "MAEDistance",
+     "R2Score",
+     # Acceleration
+     "AccelerationConfig",
+     "ParallelMetricComputer",
+     "get_available_backends",
+     "compute_metrics_accelerated",
      # Visualization
      "EvaluationVisualizer",
      "visualize",
@@ -2,6 +2,7 @@
  Data loading module for gene expression evaluation.

  Provides data loaders for paired real and generated datasets.
+ Includes both standard and memory-efficient lazy loading options.
  """

  from .loader import (
@@ -9,15 +10,28 @@ from .loader import (
      load_data,
      DataLoaderError,
  )
+ from .lazy_loader import (
+     LazyGeneExpressionDataLoader,
+     load_data_lazy,
+     LazyDataLoaderError,
+     ConditionBatch,
+ )
  from .gene_expression_datamodule import (
      GeneExpressionDataModule,
      DataModuleError,
  )

  __all__ = [
+     # Standard loader
      "GeneExpressionDataLoader",
      "load_data",
      "DataLoaderError",
+     # Lazy loader (memory-efficient)
+     "LazyGeneExpressionDataLoader",
+     "load_data_lazy",
+     "LazyDataLoaderError",
+     "ConditionBatch",
+     # DataModule
      "GeneExpressionDataModule",
      "DataModuleError",
  ]
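Taken together, the new `data.lazy_loader` and `metrics.accelerated` modules suggest a streaming workflow: iterate conditions from disk and score each one with the accelerated metrics. The sketch below is an illustration rather than code from the package; it assumes `iterate_conditions()` yields array-likes accepted by `compute_metrics_accelerated()` (as the README's custom-workflow example implies) and that per-metric results expose `aggregate_value` as shown there.

```python
import numpy as np

from geneval import load_data_lazy
from geneval.metrics.accelerated import compute_metrics_accelerated

per_condition = {}

# Stream one condition at a time to keep peak memory low, then score it with
# the parallelized metric kernels (n_jobs=-1 uses all CPU cores per the README).
with load_data_lazy("real.h5ad", "gen.h5ad", ["perturbation"]) as loader:
    for key, real, gen, info in loader.iterate_conditions():
        per_condition[key] = compute_metrics_accelerated(
            np.asarray(real), np.asarray(gen),
            metrics=["wasserstein_1", "mmd"],
            n_jobs=-1,
        )

# Report the aggregate Wasserstein-1 distance per condition.
for key, res in per_condition.items():
    print(key, f"{res['wasserstein_1'].aggregate_value:.4f}")
```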