gengeneeval 0.1.1__tar.gz → 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gengeneeval-0.1.1 → gengeneeval-0.2.1}/PKG-INFO +42 -5
- {gengeneeval-0.1.1 → gengeneeval-0.2.1}/README.md +39 -2
- {gengeneeval-0.1.1 → gengeneeval-0.2.1}/pyproject.toml +3 -3
- {gengeneeval-0.1.1 → gengeneeval-0.2.1}/src/geneval/__init__.py +45 -1
- {gengeneeval-0.1.1 → gengeneeval-0.2.1}/src/geneval/data/__init__.py +14 -0
- gengeneeval-0.2.1/src/geneval/data/lazy_loader.py +562 -0
- {gengeneeval-0.1.1 → gengeneeval-0.2.1}/src/geneval/evaluator.py +4 -0
- gengeneeval-0.2.1/src/geneval/lazy_evaluator.py +424 -0
- {gengeneeval-0.1.1 → gengeneeval-0.2.1}/src/geneval/metrics/__init__.py +19 -0
- gengeneeval-0.2.1/src/geneval/metrics/reconstruction.py +243 -0
- {gengeneeval-0.1.1 → gengeneeval-0.2.1}/LICENSE +0 -0
- {gengeneeval-0.1.1 → gengeneeval-0.2.1}/src/geneval/cli.py +0 -0
- {gengeneeval-0.1.1 → gengeneeval-0.2.1}/src/geneval/config.py +0 -0
- {gengeneeval-0.1.1 → gengeneeval-0.2.1}/src/geneval/core.py +0 -0
- {gengeneeval-0.1.1 → gengeneeval-0.2.1}/src/geneval/data/gene_expression_datamodule.py +0 -0
- {gengeneeval-0.1.1 → gengeneeval-0.2.1}/src/geneval/data/loader.py +0 -0
- {gengeneeval-0.1.1 → gengeneeval-0.2.1}/src/geneval/evaluators/__init__.py +0 -0
- {gengeneeval-0.1.1 → gengeneeval-0.2.1}/src/geneval/evaluators/base_evaluator.py +0 -0
- {gengeneeval-0.1.1 → gengeneeval-0.2.1}/src/geneval/evaluators/gene_expression_evaluator.py +0 -0
- {gengeneeval-0.1.1 → gengeneeval-0.2.1}/src/geneval/metrics/base_metric.py +0 -0
- {gengeneeval-0.1.1 → gengeneeval-0.2.1}/src/geneval/metrics/correlation.py +0 -0
- {gengeneeval-0.1.1 → gengeneeval-0.2.1}/src/geneval/metrics/distances.py +0 -0
- {gengeneeval-0.1.1 → gengeneeval-0.2.1}/src/geneval/metrics/metrics.py +0 -0
- {gengeneeval-0.1.1 → gengeneeval-0.2.1}/src/geneval/models/__init__.py +0 -0
- {gengeneeval-0.1.1 → gengeneeval-0.2.1}/src/geneval/models/base_model.py +0 -0
- {gengeneeval-0.1.1 → gengeneeval-0.2.1}/src/geneval/results.py +0 -0
- {gengeneeval-0.1.1 → gengeneeval-0.2.1}/src/geneval/testing.py +0 -0
- {gengeneeval-0.1.1 → gengeneeval-0.2.1}/src/geneval/utils/__init__.py +0 -0
- {gengeneeval-0.1.1 → gengeneeval-0.2.1}/src/geneval/utils/io.py +0 -0
- {gengeneeval-0.1.1 → gengeneeval-0.2.1}/src/geneval/utils/preprocessing.py +0 -0
- {gengeneeval-0.1.1 → gengeneeval-0.2.1}/src/geneval/visualization/__init__.py +0 -0
- {gengeneeval-0.1.1 → gengeneeval-0.2.1}/src/geneval/visualization/plots.py +0 -0
- {gengeneeval-0.1.1 → gengeneeval-0.2.1}/src/geneval/visualization/visualizer.py +0 -0
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: gengeneeval
|
|
3
|
-
Version: 0.1.1
|
|
4
|
-
Summary: Comprehensive evaluation of generated gene expression data. Computes metrics between real and generated datasets with support for condition matching, train/test splits, and publication-quality visualizations.
|
|
3
|
+
Version: 0.2.1
|
|
4
|
+
Summary: Comprehensive evaluation of generated gene expression data. Computes metrics between real and generated datasets with support for condition matching, train/test splits, memory-efficient lazy loading, and publication-quality visualizations.
|
|
5
5
|
License: MIT
|
|
6
6
|
License-File: LICENSE
|
|
7
|
-
Keywords: gene expression,evaluation,metrics,single-cell,generative models,benchmarking
|
|
7
|
+
Keywords: gene expression,evaluation,metrics,single-cell,generative models,benchmarking,memory-efficient
|
|
8
8
|
Author: GenEval Team
|
|
9
9
|
Author-email: geneval@example.com
|
|
10
10
|
Requires-Python: >=3.8,<4.0
|
|
@@ -42,11 +42,11 @@ Description-Content-Type: text/markdown
|
|
|
42
42
|
[PyPI version](https://badge.fury.io/py/gengeneeval)
|
|
43
43
|
[Python downloads](https://www.python.org/downloads/)
|
|
44
44
|
[License: MIT](https://opensource.org/licenses/MIT)
|
|
45
|
-
[GitHub Actions](https://github.com/AndreaRubbi/GenGeneEval/actions)
|
|
46
46
|
|
|
47
47
|
**Comprehensive evaluation of generated gene expression data against real datasets.**
|
|
48
48
|
|
|
49
|
-
GenEval is a modular, object-oriented Python framework for computing metrics between real and generated gene expression datasets stored in AnnData (h5ad) format. It supports condition-based matching, train/test splits, and generates publication-quality visualizations.
|
|
49
|
+
GenEval is a modular, object-oriented Python framework for computing metrics between real and generated gene expression datasets stored in AnnData (h5ad) format. It supports condition-based matching, train/test splits, memory-efficient lazy loading for large datasets, and generates publication-quality visualizations.
|
|
50
50
|
|
|
51
51
|
## Features
|
|
52
52
|
|
|
@@ -55,6 +55,10 @@ All metrics are computed **per-gene** (returning a vector) and **aggregated**:
|
|
|
55
55
|
|
|
56
56
|
| Metric | Description | Direction |
|
|
57
57
|
|--------|-------------|-----------|
|
|
58
|
+
| **MSE** | Mean Squared Error | Lower is better |
|
|
59
|
+
| **RMSE** | Root Mean Squared Error | Lower is better |
|
|
60
|
+
| **MAE** | Mean Absolute Error | Lower is better |
|
|
61
|
+
| **R²** | Coefficient of Determination | Higher is better |
|
|
58
62
|
| **Pearson Correlation** | Linear correlation between expression profiles | Higher is better |
|
|
59
63
|
| **Spearman Correlation** | Rank correlation (robust to outliers) | Higher is better |
|
|
60
64
|
| **Wasserstein-1** | Earth Mover's Distance (L1) | Lower is better |
|
|
@@ -73,6 +77,8 @@ All metrics are computed **per-gene** (returning a vector) and **aggregated**:
|
|
|
73
77
|
- ✅ Condition-based matching (perturbation, cell type, etc.)
|
|
74
78
|
- ✅ Train/test split support
|
|
75
79
|
- ✅ Per-gene and aggregate metrics
|
|
80
|
+
- ✅ **Memory-efficient lazy loading** for large datasets
|
|
81
|
+
- ✅ **Batched evaluation** to avoid OOM errors
|
|
76
82
|
- ✅ Modular, extensible architecture
|
|
77
83
|
- ✅ Command-line interface
|
|
78
84
|
- ✅ Publication-quality visualizations
|
|
@@ -136,6 +142,37 @@ geneval --real real.h5ad --generated generated.h5ad \
|
|
|
136
142
|
--output results/
|
|
137
143
|
```
|
|
138
144
|
|
|
145
|
+
### Memory-Efficient Mode (for Large Datasets)
|
|
146
|
+
|
|
147
|
+
For datasets too large to fit in memory, use the lazy evaluation API:
|
|
148
|
+
|
|
149
|
+
```python
|
|
150
|
+
from geneval import evaluate_lazy, load_data_lazy
|
|
151
|
+
|
|
152
|
+
# Memory-efficient evaluation (streams data one condition at a time)
|
|
153
|
+
results = evaluate_lazy(
|
|
154
|
+
real_path="large_real.h5ad",
|
|
155
|
+
generated_path="large_generated.h5ad",
|
|
156
|
+
condition_columns=["perturbation"],
|
|
157
|
+
batch_size=256, # Process in batches
|
|
158
|
+
use_backed=True, # Memory-mapped file access
|
|
159
|
+
output_dir="eval_output/",
|
|
160
|
+
save_per_condition=True, # Save each condition to disk
|
|
161
|
+
)
|
|
162
|
+
|
|
163
|
+
# Get summary statistics
|
|
164
|
+
print(results.get_summary())
|
|
165
|
+
|
|
166
|
+
# Or use the lazy loader directly for custom workflows
|
|
167
|
+
with load_data_lazy("real.h5ad", "gen.h5ad", ["perturbation"]) as loader:
|
|
168
|
+
print(f"Memory estimate: {loader.estimate_memory_usage()}")
|
|
169
|
+
|
|
170
|
+
# Process one condition at a time
|
|
171
|
+
for key, real, gen, info in loader.iterate_conditions():
|
|
172
|
+
# Your custom evaluation logic
|
|
173
|
+
pass
|
|
174
|
+
```
|
|
175
|
+
|
|
139
176
|
## Expected Data Format
|
|
140
177
|
|
|
141
178
|
GenEval expects AnnData (h5ad) files with:
|
|
@@ -3,11 +3,11 @@
|
|
|
3
3
|
[PyPI version](https://badge.fury.io/py/gengeneeval)
|
|
4
4
|
[Python downloads](https://www.python.org/downloads/)
|
|
5
5
|
[License: MIT](https://opensource.org/licenses/MIT)
|
|
6
|
-
[GitHub Actions](https://github.com/AndreaRubbi/GenGeneEval/actions)
|
|
7
7
|
|
|
8
8
|
**Comprehensive evaluation of generated gene expression data against real datasets.**
|
|
9
9
|
|
|
10
|
-
GenEval is a modular, object-oriented Python framework for computing metrics between real and generated gene expression datasets stored in AnnData (h5ad) format. It supports condition-based matching, train/test splits, and generates publication-quality visualizations.
|
|
10
|
+
GenEval is a modular, object-oriented Python framework for computing metrics between real and generated gene expression datasets stored in AnnData (h5ad) format. It supports condition-based matching, train/test splits, memory-efficient lazy loading for large datasets, and generates publication-quality visualizations.
|
|
11
11
|
|
|
12
12
|
## Features
|
|
13
13
|
|
|
@@ -16,6 +16,10 @@ All metrics are computed **per-gene** (returning a vector) and **aggregated**:
|
|
|
16
16
|
|
|
17
17
|
| Metric | Description | Direction |
|
|
18
18
|
|--------|-------------|-----------|
|
|
19
|
+
| **MSE** | Mean Squared Error | Lower is better |
|
|
20
|
+
| **RMSE** | Root Mean Squared Error | Lower is better |
|
|
21
|
+
| **MAE** | Mean Absolute Error | Lower is better |
|
|
22
|
+
| **R²** | Coefficient of Determination | Higher is better |
|
|
19
23
|
| **Pearson Correlation** | Linear correlation between expression profiles | Higher is better |
|
|
20
24
|
| **Spearman Correlation** | Rank correlation (robust to outliers) | Higher is better |
|
|
21
25
|
| **Wasserstein-1** | Earth Mover's Distance (L1) | Lower is better |
|
|
@@ -34,6 +38,8 @@ All metrics are computed **per-gene** (returning a vector) and **aggregated**:
|
|
|
34
38
|
- ✅ Condition-based matching (perturbation, cell type, etc.)
|
|
35
39
|
- ✅ Train/test split support
|
|
36
40
|
- ✅ Per-gene and aggregate metrics
|
|
41
|
+
- ✅ **Memory-efficient lazy loading** for large datasets
|
|
42
|
+
- ✅ **Batched evaluation** to avoid OOM errors
|
|
37
43
|
- ✅ Modular, extensible architecture
|
|
38
44
|
- ✅ Command-line interface
|
|
39
45
|
- ✅ Publication-quality visualizations
|
|
@@ -97,6 +103,37 @@ geneval --real real.h5ad --generated generated.h5ad \
|
|
|
97
103
|
--output results/
|
|
98
104
|
```
|
|
99
105
|
|
|
106
|
+
### Memory-Efficient Mode (for Large Datasets)
|
|
107
|
+
|
|
108
|
+
For datasets too large to fit in memory, use the lazy evaluation API:
|
|
109
|
+
|
|
110
|
+
```python
|
|
111
|
+
from geneval import evaluate_lazy, load_data_lazy
|
|
112
|
+
|
|
113
|
+
# Memory-efficient evaluation (streams data one condition at a time)
|
|
114
|
+
results = evaluate_lazy(
|
|
115
|
+
real_path="large_real.h5ad",
|
|
116
|
+
generated_path="large_generated.h5ad",
|
|
117
|
+
condition_columns=["perturbation"],
|
|
118
|
+
batch_size=256, # Process in batches
|
|
119
|
+
use_backed=True, # Memory-mapped file access
|
|
120
|
+
output_dir="eval_output/",
|
|
121
|
+
save_per_condition=True, # Save each condition to disk
|
|
122
|
+
)
|
|
123
|
+
|
|
124
|
+
# Get summary statistics
|
|
125
|
+
print(results.get_summary())
|
|
126
|
+
|
|
127
|
+
# Or use the lazy loader directly for custom workflows
|
|
128
|
+
with load_data_lazy("real.h5ad", "gen.h5ad", ["perturbation"]) as loader:
|
|
129
|
+
print(f"Memory estimate: {loader.estimate_memory_usage()}")
|
|
130
|
+
|
|
131
|
+
# Process one condition at a time
|
|
132
|
+
for key, real, gen, info in loader.iterate_conditions():
|
|
133
|
+
# Your custom evaluation logic
|
|
134
|
+
pass
|
|
135
|
+
```
|
|
136
|
+
|
|
100
137
|
## Expected Data Format
|
|
101
138
|
|
|
102
139
|
GenEval expects AnnData (h5ad) files with:
|
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "gengeneeval"
|
|
3
|
-
version = "0.1.1"
|
|
4
|
-
description = "Comprehensive evaluation of generated gene expression data. Computes metrics between real and generated datasets with support for condition matching, train/test splits, and publication-quality visualizations."
|
|
3
|
+
version = "0.2.1"
|
|
4
|
+
description = "Comprehensive evaluation of generated gene expression data. Computes metrics between real and generated datasets with support for condition matching, train/test splits, memory-efficient lazy loading, and publication-quality visualizations."
|
|
5
5
|
authors = ["GenEval Team <geneval@example.com>"]
|
|
6
6
|
license = "MIT"
|
|
7
7
|
readme = "README.md"
|
|
8
8
|
homepage = "https://github.com/AndreaRubbi/GenGeneEval"
|
|
9
9
|
repository = "https://github.com/AndreaRubbi/GenGeneEval"
|
|
10
|
-
keywords = ["gene expression", "evaluation", "metrics", "single-cell", "generative models", "benchmarking"]
|
|
10
|
+
keywords = ["gene expression", "evaluation", "metrics", "single-cell", "generative models", "benchmarking", "memory-efficient"]
|
|
11
11
|
classifiers = [
|
|
12
12
|
"Development Status :: 4 - Beta",
|
|
13
13
|
"Intended Audience :: Science/Research",
|
|
@@ -8,6 +8,7 @@ Features:
|
|
|
8
8
|
- Multiple distance and correlation metrics (per-gene and aggregate)
|
|
9
9
|
- Condition-based matching (perturbation, cell type, etc.)
|
|
10
10
|
- Train/test split support
|
|
11
|
+
- Memory-efficient lazy loading for large datasets
|
|
11
12
|
- Publication-quality visualizations
|
|
12
13
|
- Command-line interface
|
|
13
14
|
|
|
@@ -20,12 +21,22 @@ Quick Start:
|
|
|
20
21
|
... output_dir="output/"
|
|
21
22
|
... )
|
|
22
23
|
|
|
24
|
+
Memory-Efficient Mode (for large datasets):
|
|
25
|
+
>>> from geneval import evaluate_lazy
|
|
26
|
+
>>> results = evaluate_lazy(
|
|
27
|
+
... real_path="real.h5ad",
|
|
28
|
+
... generated_path="generated.h5ad",
|
|
29
|
+
... condition_columns=["perturbation"],
|
|
30
|
+
... batch_size=256,
|
|
31
|
+
... use_backed=True, # Memory-mapped access
|
|
32
|
+
... )
|
|
33
|
+
|
|
23
34
|
CLI Usage:
|
|
24
35
|
$ geneval --real real.h5ad --generated generated.h5ad \\
|
|
25
36
|
--conditions perturbation cell_type --output results/
|
|
26
37
|
"""
|
|
27
38
|
|
|
28
|
-
__version__ = "0.1.1"
|
|
39
|
+
__version__ = "0.2.1"
|
|
29
40
|
__author__ = "GenEval Team"
|
|
30
41
|
|
|
31
42
|
# Main evaluation interface
|
|
@@ -35,12 +46,26 @@ from .evaluator import (
|
|
|
35
46
|
MetricRegistry,
|
|
36
47
|
)
|
|
37
48
|
|
|
49
|
+
# Memory-efficient evaluation
|
|
50
|
+
from .lazy_evaluator import (
|
|
51
|
+
evaluate_lazy,
|
|
52
|
+
MemoryEfficientEvaluator,
|
|
53
|
+
StreamingEvaluationResult,
|
|
54
|
+
)
|
|
55
|
+
|
|
38
56
|
# Data loading
|
|
39
57
|
from .data.loader import (
|
|
40
58
|
GeneExpressionDataLoader,
|
|
41
59
|
load_data,
|
|
42
60
|
)
|
|
43
61
|
|
|
62
|
+
# Memory-efficient data loading
|
|
63
|
+
from .data.lazy_loader import (
|
|
64
|
+
LazyGeneExpressionDataLoader,
|
|
65
|
+
load_data_lazy,
|
|
66
|
+
ConditionBatch,
|
|
67
|
+
)
|
|
68
|
+
|
|
44
69
|
# Results
|
|
45
70
|
from .results import (
|
|
46
71
|
EvaluationResult,
|
|
@@ -69,6 +94,12 @@ from .metrics.distances import (
|
|
|
69
94
|
MultivariateWasserstein,
|
|
70
95
|
MultivariateMMD,
|
|
71
96
|
)
|
|
97
|
+
from .metrics.reconstruction import (
|
|
98
|
+
MSEDistance,
|
|
99
|
+
RMSEDistance,
|
|
100
|
+
MAEDistance,
|
|
101
|
+
R2Score,
|
|
102
|
+
)
|
|
72
103
|
|
|
73
104
|
# Visualization
|
|
74
105
|
from .visualization.visualizer import (
|
|
@@ -93,9 +124,17 @@ __all__ = [
|
|
|
93
124
|
"evaluate",
|
|
94
125
|
"GeneEvalEvaluator",
|
|
95
126
|
"MetricRegistry",
|
|
127
|
+
# Memory-efficient evaluation
|
|
128
|
+
"evaluate_lazy",
|
|
129
|
+
"MemoryEfficientEvaluator",
|
|
130
|
+
"StreamingEvaluationResult",
|
|
96
131
|
# Data loading
|
|
97
132
|
"GeneExpressionDataLoader",
|
|
98
133
|
"load_data",
|
|
134
|
+
# Memory-efficient data loading
|
|
135
|
+
"LazyGeneExpressionDataLoader",
|
|
136
|
+
"load_data_lazy",
|
|
137
|
+
"ConditionBatch",
|
|
99
138
|
# Results
|
|
100
139
|
"EvaluationResult",
|
|
101
140
|
"SplitResult",
|
|
@@ -117,6 +156,11 @@ __all__ = [
|
|
|
117
156
|
"EnergyDistance",
|
|
118
157
|
"MultivariateWasserstein",
|
|
119
158
|
"MultivariateMMD",
|
|
159
|
+
# Reconstruction metrics
|
|
160
|
+
"MSEDistance",
|
|
161
|
+
"RMSEDistance",
|
|
162
|
+
"MAEDistance",
|
|
163
|
+
"R2Score",
|
|
120
164
|
# Visualization
|
|
121
165
|
"EvaluationVisualizer",
|
|
122
166
|
"visualize",
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
Data loading module for gene expression evaluation.
|
|
3
3
|
|
|
4
4
|
Provides data loaders for paired real and generated datasets.
|
|
5
|
+
Includes both standard and memory-efficient lazy loading options.
|
|
5
6
|
"""
|
|
6
7
|
|
|
7
8
|
from .loader import (
|
|
@@ -9,15 +10,28 @@ from .loader import (
|
|
|
9
10
|
load_data,
|
|
10
11
|
DataLoaderError,
|
|
11
12
|
)
|
|
13
|
+
from .lazy_loader import (
|
|
14
|
+
LazyGeneExpressionDataLoader,
|
|
15
|
+
load_data_lazy,
|
|
16
|
+
LazyDataLoaderError,
|
|
17
|
+
ConditionBatch,
|
|
18
|
+
)
|
|
12
19
|
from .gene_expression_datamodule import (
|
|
13
20
|
GeneExpressionDataModule,
|
|
14
21
|
DataModuleError,
|
|
15
22
|
)
|
|
16
23
|
|
|
17
24
|
__all__ = [
|
|
25
|
+
# Standard loader
|
|
18
26
|
"GeneExpressionDataLoader",
|
|
19
27
|
"load_data",
|
|
20
28
|
"DataLoaderError",
|
|
29
|
+
# Lazy loader (memory-efficient)
|
|
30
|
+
"LazyGeneExpressionDataLoader",
|
|
31
|
+
"load_data_lazy",
|
|
32
|
+
"LazyDataLoaderError",
|
|
33
|
+
"ConditionBatch",
|
|
34
|
+
# DataModule
|
|
21
35
|
"GeneExpressionDataModule",
|
|
22
36
|
"DataModuleError",
|
|
23
37
|
]
|