gengeneeval 0.1.1__tar.gz → 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. {gengeneeval-0.1.1 → gengeneeval-0.2.1}/PKG-INFO +42 -5
  2. {gengeneeval-0.1.1 → gengeneeval-0.2.1}/README.md +39 -2
  3. {gengeneeval-0.1.1 → gengeneeval-0.2.1}/pyproject.toml +3 -3
  4. {gengeneeval-0.1.1 → gengeneeval-0.2.1}/src/geneval/__init__.py +45 -1
  5. {gengeneeval-0.1.1 → gengeneeval-0.2.1}/src/geneval/data/__init__.py +14 -0
  6. gengeneeval-0.2.1/src/geneval/data/lazy_loader.py +562 -0
  7. {gengeneeval-0.1.1 → gengeneeval-0.2.1}/src/geneval/evaluator.py +4 -0
  8. gengeneeval-0.2.1/src/geneval/lazy_evaluator.py +424 -0
  9. {gengeneeval-0.1.1 → gengeneeval-0.2.1}/src/geneval/metrics/__init__.py +19 -0
  10. gengeneeval-0.2.1/src/geneval/metrics/reconstruction.py +243 -0
  11. {gengeneeval-0.1.1 → gengeneeval-0.2.1}/LICENSE +0 -0
  12. {gengeneeval-0.1.1 → gengeneeval-0.2.1}/src/geneval/cli.py +0 -0
  13. {gengeneeval-0.1.1 → gengeneeval-0.2.1}/src/geneval/config.py +0 -0
  14. {gengeneeval-0.1.1 → gengeneeval-0.2.1}/src/geneval/core.py +0 -0
  15. {gengeneeval-0.1.1 → gengeneeval-0.2.1}/src/geneval/data/gene_expression_datamodule.py +0 -0
  16. {gengeneeval-0.1.1 → gengeneeval-0.2.1}/src/geneval/data/loader.py +0 -0
  17. {gengeneeval-0.1.1 → gengeneeval-0.2.1}/src/geneval/evaluators/__init__.py +0 -0
  18. {gengeneeval-0.1.1 → gengeneeval-0.2.1}/src/geneval/evaluators/base_evaluator.py +0 -0
  19. {gengeneeval-0.1.1 → gengeneeval-0.2.1}/src/geneval/evaluators/gene_expression_evaluator.py +0 -0
  20. {gengeneeval-0.1.1 → gengeneeval-0.2.1}/src/geneval/metrics/base_metric.py +0 -0
  21. {gengeneeval-0.1.1 → gengeneeval-0.2.1}/src/geneval/metrics/correlation.py +0 -0
  22. {gengeneeval-0.1.1 → gengeneeval-0.2.1}/src/geneval/metrics/distances.py +0 -0
  23. {gengeneeval-0.1.1 → gengeneeval-0.2.1}/src/geneval/metrics/metrics.py +0 -0
  24. {gengeneeval-0.1.1 → gengeneeval-0.2.1}/src/geneval/models/__init__.py +0 -0
  25. {gengeneeval-0.1.1 → gengeneeval-0.2.1}/src/geneval/models/base_model.py +0 -0
  26. {gengeneeval-0.1.1 → gengeneeval-0.2.1}/src/geneval/results.py +0 -0
  27. {gengeneeval-0.1.1 → gengeneeval-0.2.1}/src/geneval/testing.py +0 -0
  28. {gengeneeval-0.1.1 → gengeneeval-0.2.1}/src/geneval/utils/__init__.py +0 -0
  29. {gengeneeval-0.1.1 → gengeneeval-0.2.1}/src/geneval/utils/io.py +0 -0
  30. {gengeneeval-0.1.1 → gengeneeval-0.2.1}/src/geneval/utils/preprocessing.py +0 -0
  31. {gengeneeval-0.1.1 → gengeneeval-0.2.1}/src/geneval/visualization/__init__.py +0 -0
  32. {gengeneeval-0.1.1 → gengeneeval-0.2.1}/src/geneval/visualization/plots.py +0 -0
  33. {gengeneeval-0.1.1 → gengeneeval-0.2.1}/src/geneval/visualization/visualizer.py +0 -0
@@ -1,10 +1,10 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gengeneeval
3
- Version: 0.1.1
4
- Summary: Comprehensive evaluation of generated gene expression data. Computes metrics between real and generated datasets with support for condition matching, train/test splits, and publication-quality visualizations.
3
+ Version: 0.2.1
4
+ Summary: Comprehensive evaluation of generated gene expression data. Computes metrics between real and generated datasets with support for condition matching, train/test splits, memory-efficient lazy loading, and publication-quality visualizations.
5
5
  License: MIT
6
6
  License-File: LICENSE
7
- Keywords: gene expression,evaluation,metrics,single-cell,generative models,benchmarking
7
+ Keywords: gene expression,evaluation,metrics,single-cell,generative models,benchmarking,memory-efficient
8
8
  Author: GenEval Team
9
9
  Author-email: geneval@example.com
10
10
  Requires-Python: >=3.8,<4.0
@@ -42,11 +42,11 @@ Description-Content-Type: text/markdown
42
42
  [![PyPI version](https://badge.fury.io/py/gengeneeval.svg)](https://badge.fury.io/py/gengeneeval)
43
43
  [![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/)
44
44
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
45
- [![Tests](https://github.com/AndreaRubbi/GenGeneEval/actions/workflows/tests.yml/badge.svg)](https://github.com/AndreaRubbi/GenGeneEval/actions)
45
+ [![Tests](https://github.com/AndreaRubbi/GenGeneEval/actions/workflows/test.yml/badge.svg)](https://github.com/AndreaRubbi/GenGeneEval/actions)
46
46
 
47
47
  **Comprehensive evaluation of generated gene expression data against real datasets.**
48
48
 
49
- GenEval is a modular, object-oriented Python framework for computing metrics between real and generated gene expression datasets stored in AnnData (h5ad) format. It supports condition-based matching, train/test splits, and generates publication-quality visualizations.
49
+ GenEval is a modular, object-oriented Python framework for computing metrics between real and generated gene expression datasets stored in AnnData (h5ad) format. It supports condition-based matching, train/test splits, memory-efficient lazy loading for large datasets, and generates publication-quality visualizations.
50
50
 
51
51
  ## Features
52
52
 
@@ -55,6 +55,10 @@ All metrics are computed **per-gene** (returning a vector) and **aggregated**:
55
55
 
56
56
  | Metric | Description | Direction |
57
57
  |--------|-------------|-----------|
58
+ | **MSE** | Mean Squared Error | Lower is better |
59
+ | **RMSE** | Root Mean Squared Error | Lower is better |
60
+ | **MAE** | Mean Absolute Error | Lower is better |
61
+ | **R²** | Coefficient of Determination | Higher is better |
58
62
  | **Pearson Correlation** | Linear correlation between expression profiles | Higher is better |
59
63
  | **Spearman Correlation** | Rank correlation (robust to outliers) | Higher is better |
60
64
  | **Wasserstein-1** | Earth Mover's Distance (L1) | Lower is better |
@@ -73,6 +77,8 @@ All metrics are computed **per-gene** (returning a vector) and **aggregated**:
73
77
  - ✅ Condition-based matching (perturbation, cell type, etc.)
74
78
  - ✅ Train/test split support
75
79
  - ✅ Per-gene and aggregate metrics
80
+ - ✅ **Memory-efficient lazy loading** for large datasets
81
+ - ✅ **Batched evaluation** to avoid OOM errors
76
82
  - ✅ Modular, extensible architecture
77
83
  - ✅ Command-line interface
78
84
  - ✅ Publication-quality visualizations
@@ -136,6 +142,37 @@ geneval --real real.h5ad --generated generated.h5ad \
136
142
  --output results/
137
143
  ```
138
144
 
145
+ ### Memory-Efficient Mode (for Large Datasets)
146
+
147
+ For datasets too large to fit in memory, use the lazy evaluation API:
148
+
149
+ ```python
150
+ from geneval import evaluate_lazy, load_data_lazy
151
+
152
+ # Memory-efficient evaluation (streams data one condition at a time)
153
+ results = evaluate_lazy(
154
+     real_path="large_real.h5ad",
155
+     generated_path="large_generated.h5ad",
156
+     condition_columns=["perturbation"],
157
+     batch_size=256, # Process in batches
158
+     use_backed=True, # Memory-mapped file access
159
+     output_dir="eval_output/",
160
+     save_per_condition=True, # Save each condition to disk
161
+ )
162
+
163
+ # Get summary statistics
164
+ print(results.get_summary())
165
+
166
+ # Or use the lazy loader directly for custom workflows
167
+ with load_data_lazy("real.h5ad", "gen.h5ad", ["perturbation"]) as loader:
168
+     print(f"Memory estimate: {loader.estimate_memory_usage()}")
169
+
170
+     # Process one condition at a time
171
+     for key, real, gen, info in loader.iterate_conditions():
172
+         # Your custom evaluation logic
173
+         pass
174
+ ```
175
+
139
176
  ## Expected Data Format
140
177
 
141
178
  GenEval expects AnnData (h5ad) files with:
@@ -3,11 +3,11 @@
3
3
  [![PyPI version](https://badge.fury.io/py/gengeneeval.svg)](https://badge.fury.io/py/gengeneeval)
4
4
  [![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/)
5
5
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
6
- [![Tests](https://github.com/AndreaRubbi/GenGeneEval/actions/workflows/tests.yml/badge.svg)](https://github.com/AndreaRubbi/GenGeneEval/actions)
6
+ [![Tests](https://github.com/AndreaRubbi/GenGeneEval/actions/workflows/test.yml/badge.svg)](https://github.com/AndreaRubbi/GenGeneEval/actions)
7
7
 
8
8
  **Comprehensive evaluation of generated gene expression data against real datasets.**
9
9
 
10
- GenEval is a modular, object-oriented Python framework for computing metrics between real and generated gene expression datasets stored in AnnData (h5ad) format. It supports condition-based matching, train/test splits, and generates publication-quality visualizations.
10
+ GenEval is a modular, object-oriented Python framework for computing metrics between real and generated gene expression datasets stored in AnnData (h5ad) format. It supports condition-based matching, train/test splits, memory-efficient lazy loading for large datasets, and generates publication-quality visualizations.
11
11
 
12
12
  ## Features
13
13
 
@@ -16,6 +16,10 @@ All metrics are computed **per-gene** (returning a vector) and **aggregated**:
16
16
 
17
17
  | Metric | Description | Direction |
18
18
  |--------|-------------|-----------|
19
+ | **MSE** | Mean Squared Error | Lower is better |
20
+ | **RMSE** | Root Mean Squared Error | Lower is better |
21
+ | **MAE** | Mean Absolute Error | Lower is better |
22
+ | **R²** | Coefficient of Determination | Higher is better |
19
23
  | **Pearson Correlation** | Linear correlation between expression profiles | Higher is better |
20
24
  | **Spearman Correlation** | Rank correlation (robust to outliers) | Higher is better |
21
25
  | **Wasserstein-1** | Earth Mover's Distance (L1) | Lower is better |
@@ -34,6 +38,8 @@ All metrics are computed **per-gene** (returning a vector) and **aggregated**:
34
38
  - ✅ Condition-based matching (perturbation, cell type, etc.)
35
39
  - ✅ Train/test split support
36
40
  - ✅ Per-gene and aggregate metrics
41
+ - ✅ **Memory-efficient lazy loading** for large datasets
42
+ - ✅ **Batched evaluation** to avoid OOM errors
37
43
  - ✅ Modular, extensible architecture
38
44
  - ✅ Command-line interface
39
45
  - ✅ Publication-quality visualizations
@@ -97,6 +103,37 @@ geneval --real real.h5ad --generated generated.h5ad \
97
103
  --output results/
98
104
  ```
99
105
 
106
+ ### Memory-Efficient Mode (for Large Datasets)
107
+
108
+ For datasets too large to fit in memory, use the lazy evaluation API:
109
+
110
+ ```python
111
+ from geneval import evaluate_lazy, load_data_lazy
112
+
113
+ # Memory-efficient evaluation (streams data one condition at a time)
114
+ results = evaluate_lazy(
115
+     real_path="large_real.h5ad",
116
+     generated_path="large_generated.h5ad",
117
+     condition_columns=["perturbation"],
118
+     batch_size=256, # Process in batches
119
+     use_backed=True, # Memory-mapped file access
120
+     output_dir="eval_output/",
121
+     save_per_condition=True, # Save each condition to disk
122
+ )
123
+
124
+ # Get summary statistics
125
+ print(results.get_summary())
126
+
127
+ # Or use the lazy loader directly for custom workflows
128
+ with load_data_lazy("real.h5ad", "gen.h5ad", ["perturbation"]) as loader:
129
+     print(f"Memory estimate: {loader.estimate_memory_usage()}")
130
+
131
+     # Process one condition at a time
132
+     for key, real, gen, info in loader.iterate_conditions():
133
+         # Your custom evaluation logic
134
+         pass
135
+ ```
136
+
100
137
  ## Expected Data Format
101
138
 
102
139
  GenEval expects AnnData (h5ad) files with:
@@ -1,13 +1,13 @@
1
1
  [tool.poetry]
2
2
  name = "gengeneeval"
3
- version = "0.1.1"
4
- description = "Comprehensive evaluation of generated gene expression data. Computes metrics between real and generated datasets with support for condition matching, train/test splits, and publication-quality visualizations."
3
+ version = "0.2.1"
4
+ description = "Comprehensive evaluation of generated gene expression data. Computes metrics between real and generated datasets with support for condition matching, train/test splits, memory-efficient lazy loading, and publication-quality visualizations."
5
5
  authors = ["GenEval Team <geneval@example.com>"]
6
6
  license = "MIT"
7
7
  readme = "README.md"
8
8
  homepage = "https://github.com/AndreaRubbi/GenGeneEval"
9
9
  repository = "https://github.com/AndreaRubbi/GenGeneEval"
10
- keywords = ["gene expression", "evaluation", "metrics", "single-cell", "generative models", "benchmarking"]
10
+ keywords = ["gene expression", "evaluation", "metrics", "single-cell", "generative models", "benchmarking", "memory-efficient"]
11
11
  classifiers = [
12
12
  "Development Status :: 4 - Beta",
13
13
  "Intended Audience :: Science/Research",
@@ -8,6 +8,7 @@ Features:
8
8
  - Multiple distance and correlation metrics (per-gene and aggregate)
9
9
  - Condition-based matching (perturbation, cell type, etc.)
10
10
  - Train/test split support
11
+ - Memory-efficient lazy loading for large datasets
11
12
  - Publication-quality visualizations
12
13
  - Command-line interface
13
14
 
@@ -20,12 +21,22 @@ Quick Start:
20
21
  ... output_dir="output/"
21
22
  ... )
22
23
 
24
+ Memory-Efficient Mode (for large datasets):
25
+ >>> from geneval import evaluate_lazy
26
+ >>> results = evaluate_lazy(
27
+ ... real_path="real.h5ad",
28
+ ... generated_path="generated.h5ad",
29
+ ... condition_columns=["perturbation"],
30
+ ... batch_size=256,
31
+ ... use_backed=True, # Memory-mapped access
32
+ ... )
33
+
23
34
  CLI Usage:
24
35
  $ geneval --real real.h5ad --generated generated.h5ad \\
25
36
  --conditions perturbation cell_type --output results/
26
37
  """
27
38
 
28
- __version__ = "0.1.1"
39
+ __version__ = "0.2.1"
29
40
  __author__ = "GenEval Team"
30
41
 
31
42
  # Main evaluation interface
@@ -35,12 +46,26 @@ from .evaluator import (
35
46
  MetricRegistry,
36
47
  )
37
48
 
49
+ # Memory-efficient evaluation
50
+ from .lazy_evaluator import (
51
+ evaluate_lazy,
52
+ MemoryEfficientEvaluator,
53
+ StreamingEvaluationResult,
54
+ )
55
+
38
56
  # Data loading
39
57
  from .data.loader import (
40
58
  GeneExpressionDataLoader,
41
59
  load_data,
42
60
  )
43
61
 
62
+ # Memory-efficient data loading
63
+ from .data.lazy_loader import (
64
+ LazyGeneExpressionDataLoader,
65
+ load_data_lazy,
66
+ ConditionBatch,
67
+ )
68
+
44
69
  # Results
45
70
  from .results import (
46
71
  EvaluationResult,
@@ -69,6 +94,12 @@ from .metrics.distances import (
69
94
  MultivariateWasserstein,
70
95
  MultivariateMMD,
71
96
  )
97
+ from .metrics.reconstruction import (
98
+ MSEDistance,
99
+ RMSEDistance,
100
+ MAEDistance,
101
+ R2Score,
102
+ )
72
103
 
73
104
  # Visualization
74
105
  from .visualization.visualizer import (
@@ -93,9 +124,17 @@ __all__ = [
93
124
  "evaluate",
94
125
  "GeneEvalEvaluator",
95
126
  "MetricRegistry",
127
+ # Memory-efficient evaluation
128
+ "evaluate_lazy",
129
+ "MemoryEfficientEvaluator",
130
+ "StreamingEvaluationResult",
96
131
  # Data loading
97
132
  "GeneExpressionDataLoader",
98
133
  "load_data",
134
+ # Memory-efficient data loading
135
+ "LazyGeneExpressionDataLoader",
136
+ "load_data_lazy",
137
+ "ConditionBatch",
99
138
  # Results
100
139
  "EvaluationResult",
101
140
  "SplitResult",
@@ -117,6 +156,11 @@ __all__ = [
117
156
  "EnergyDistance",
118
157
  "MultivariateWasserstein",
119
158
  "MultivariateMMD",
159
+ # Reconstruction metrics
160
+ "MSEDistance",
161
+ "RMSEDistance",
162
+ "MAEDistance",
163
+ "R2Score",
120
164
  # Visualization
121
165
  "EvaluationVisualizer",
122
166
  "visualize",
@@ -2,6 +2,7 @@
2
2
  Data loading module for gene expression evaluation.
3
3
 
4
4
  Provides data loaders for paired real and generated datasets.
5
+ Includes both standard and memory-efficient lazy loading options.
5
6
  """
6
7
 
7
8
  from .loader import (
@@ -9,15 +10,28 @@ from .loader import (
9
10
  load_data,
10
11
  DataLoaderError,
11
12
  )
13
+ from .lazy_loader import (
14
+ LazyGeneExpressionDataLoader,
15
+ load_data_lazy,
16
+ LazyDataLoaderError,
17
+ ConditionBatch,
18
+ )
12
19
  from .gene_expression_datamodule import (
13
20
  GeneExpressionDataModule,
14
21
  DataModuleError,
15
22
  )
16
23
 
17
24
  __all__ = [
25
+ # Standard loader
18
26
  "GeneExpressionDataLoader",
19
27
  "load_data",
20
28
  "DataLoaderError",
29
+ # Lazy loader (memory-efficient)
30
+ "LazyGeneExpressionDataLoader",
31
+ "load_data_lazy",
32
+ "LazyDataLoaderError",
33
+ "ConditionBatch",
34
+ # DataModule
21
35
  "GeneExpressionDataModule",
22
36
  "DataModuleError",
23
37
  ]