gemss 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gemss/__init__.py +14 -0
- gemss/config/__init__.py +52 -0
- gemss/config/algorithm_settings.json +17 -0
- gemss/config/config.py +457 -0
- gemss/config/constants.py +26 -0
- gemss/config/generated_dataset_parameters.json +11 -0
- gemss/config/solution_postprocessing_settings.json +6 -0
- gemss/data_handling/__init__.py +16 -0
- gemss/data_handling/data_processing.py +269 -0
- gemss/data_handling/generate_artificial_dataset.py +338 -0
- gemss/diagnostics/__init__.py +19 -0
- gemss/diagnostics/performance_tests.py +700 -0
- gemss/diagnostics/recommendation_messages.py +153 -0
- gemss/diagnostics/recommendations.py +427 -0
- gemss/experiment_assessment/__init__.py +37 -0
- gemss/experiment_assessment/case_analysis.py +436 -0
- gemss/experiment_assessment/experiment_results_analysis.py +765 -0
- gemss/experiment_assessment/experiment_results_interactive.py +365 -0
- gemss/experiment_assessment/experiment_results_visualizations.py +764 -0
- gemss/feature_selection/__init__.py +36 -0
- gemss/feature_selection/inference.py +407 -0
- gemss/feature_selection/models.py +384 -0
- gemss/postprocessing/__init__.py +45 -0
- gemss/postprocessing/outliers.py +434 -0
- gemss/postprocessing/result_postprocessing.py +640 -0
- gemss/postprocessing/simple_regressions.py +600 -0
- gemss/postprocessing/tabpfn_evaluation.py +298 -0
- gemss/utils/__init__.py +53 -0
- gemss/utils/utils.py +830 -0
- gemss/utils/visualizations.py +940 -0
- gemss-1.0.1.dist-info/METADATA +285 -0
- gemss-1.0.1.dist-info/RECORD +34 -0
- gemss-1.0.1.dist-info/WHEEL +4 -0
- gemss-1.0.1.dist-info/licenses/LICENSE +21 -0
gemss/__init__.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
"""
|
|
2
|
+
GEMSS Package
|
|
3
|
+
|
|
4
|
+
GEMSS (Gaussian Ensemble for Multiple Sparse Solutions) is an algorithm for
|
|
5
|
+
feature selection in high-dimensional data. It is intended to be used during
|
|
6
|
+
dataset analysis to identify relevant features for predictive modeling.
|
|
7
|
+
|
|
8
|
+
GEMSS is a Bayesian variational method that approximates multimodal
|
|
9
|
+
posteriors by Gaussian mixtures to recover diversified sparse feature sets.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import importlib.metadata
|
|
13
|
+
|
|
14
|
+
# Resolve the version from the installed distribution metadata. When that
# metadata is missing (for example when the package is imported straight from
# a source tree that was never installed), fall back to a placeholder instead
# of making the whole package unimportable.
try:
    __version__ = importlib.metadata.version('gemss')
except importlib.metadata.PackageNotFoundError:
    __version__ = '0.0.0+unknown'
|
gemss/config/__init__.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Configuration package for GEMSS (Gaussian Ensemble for Multiple Sparse Solutions).
|
|
3
|
+
|
|
4
|
+
This package contains configuration loading functionality and JSON parameter files.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from gemss.config.config import (
|
|
8
|
+
BATCH_SIZE,
|
|
9
|
+
BINARIZE,
|
|
10
|
+
BINARY_RESPONSE_RATIO,
|
|
11
|
+
DATASET_SEED,
|
|
12
|
+
DESIRED_SPARSITY,
|
|
13
|
+
IS_REGULARIZED,
|
|
14
|
+
LAMBDA_JACCARD,
|
|
15
|
+
LEARNING_RATE,
|
|
16
|
+
MIN_MU_THRESHOLD,
|
|
17
|
+
N_CANDIDATE_SOLUTIONS,
|
|
18
|
+
N_FEATURES,
|
|
19
|
+
N_GENERATING_SOLUTIONS,
|
|
20
|
+
N_ITER,
|
|
21
|
+
N_SAMPLES,
|
|
22
|
+
NAN_RATIO,
|
|
23
|
+
NOISE_STD,
|
|
24
|
+
OUTLIER_DEVIATION_THRESHOLDS,
|
|
25
|
+
PRIOR_SPARSITY,
|
|
26
|
+
PRIOR_TYPE,
|
|
27
|
+
SAMPLE_MORE_PRIORS_COEFF,
|
|
28
|
+
SPARSITY,
|
|
29
|
+
STUDENT_DF,
|
|
30
|
+
STUDENT_SCALE,
|
|
31
|
+
USE_MEDIAN_FOR_OUTLIER_DETECTION,
|
|
32
|
+
VAR_SLAB,
|
|
33
|
+
VAR_SPIKE,
|
|
34
|
+
WEIGHT_SLAB,
|
|
35
|
+
WEIGHT_SPIKE,
|
|
36
|
+
ConfigurationManager,
|
|
37
|
+
as_dict,
|
|
38
|
+
check_sparsities,
|
|
39
|
+
display_current_config,
|
|
40
|
+
get_core_algorithm_params,
|
|
41
|
+
get_current_config,
|
|
42
|
+
get_params_by_category,
|
|
43
|
+
)
|
|
44
|
+
from gemss.config.constants import (
|
|
45
|
+
CONFIG_FILES,
|
|
46
|
+
CONFIG_PACKAGE_NAME,
|
|
47
|
+
DATA_DIR,
|
|
48
|
+
EXPERIMENT_RESULTS_DIR,
|
|
49
|
+
PROJECT_ABBREV,
|
|
50
|
+
PROJECT_NAME,
|
|
51
|
+
ROOT_DIR,
|
|
52
|
+
)
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
{
|
|
2
|
+
"N_ITER": 4000,
|
|
3
|
+
"SAMPLE_MORE_PRIORS_COEFF": 1,
|
|
4
|
+
"WEIGHT_SPIKE": 0.1,
|
|
5
|
+
"LAMBDA_JACCARD": 500,
|
|
6
|
+
"STUDENT_SCALE": 1,
|
|
7
|
+
"LEARNING_RATE": 0.002,
|
|
8
|
+
"IS_REGULARIZED": true,
|
|
9
|
+
"BATCH_SIZE": 32,
|
|
10
|
+
"VAR_SLAB": 100,
|
|
11
|
+
"VAR_SPIKE": 0.1,
|
|
12
|
+
"PRIOR_SPARSITY": 3,
|
|
13
|
+
"STUDENT_DF": 1,
|
|
14
|
+
"PRIOR_TYPE": "sss",
|
|
15
|
+
"WEIGHT_SLAB": 0.9,
|
|
16
|
+
"N_CANDIDATE_SOLUTIONS": 8
|
|
17
|
+
}
|
gemss/config/config.py
ADDED
|
@@ -0,0 +1,457 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Configuration loader for GEMSS (Gaussian Ensemble for Multiple Sparse Solutions).
|
|
3
|
+
|
|
4
|
+
This module provides efficient loading and management of experiment parameters
|
|
5
|
+
from JSON configuration files co-located with this module.
|
|
6
|
+
|
|
7
|
+
Parameter Categories:
|
|
8
|
+
- Artificial Dataset: Parameters for synthetic data generation (development/demo only)
|
|
9
|
+
- Algorithm: Core algorithm parameters (used for both synthetic and real data)
|
|
10
|
+
- Postprocessing: Solution recovery and analysis parameters
|
|
11
|
+
|
|
12
|
+
Features:
|
|
13
|
+
- Lazy loading with caching
|
|
14
|
+
- Comprehensive parameter validation
|
|
15
|
+
- Structured parameter access by category
|
|
16
|
+
- Rich display functionality for notebooks
|
|
17
|
+
- Efficient dictionary conversion for logging
|
|
18
|
+
|
|
19
|
+
Usage:
|
|
20
|
+
import gemss.config as config
|
|
21
|
+
# Core algorithm: config.N_CANDIDATE_SOLUTIONS, config.PRIOR_TYPE, etc.
|
|
22
|
+
# Access parameters: config.N_SAMPLES, config.N_CANDIDATE_SOLUTIONS, etc.
|
|
23
|
+
# Artificial data: config.N_SAMPLES, config.N_FEATURES, etc. (demo only)
|
|
24
|
+
# Display configuration: config.display_current_config(config.as_dict())
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
import json
|
|
28
|
+
from collections import OrderedDict
|
|
29
|
+
from functools import cache, lru_cache
|
|
30
|
+
from pathlib import Path
|
|
31
|
+
from typing import Any, Literal
|
|
32
|
+
|
|
33
|
+
from .constants import CONFIG_FILES
|
|
34
|
+
|
|
35
|
+
try:
|
|
36
|
+
from IPython.display import Markdown
|
|
37
|
+
from IPython.display import display as ipython_display
|
|
38
|
+
except ImportError:
|
|
39
|
+
Markdown = None # noqa: N816
|
|
40
|
+
ipython_display = None
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class ConfigurationManager:
    """
    Load GEMSS parameters from the JSON files located next to this module.

    Parameters are grouped into three categories (artificial dataset, algorithm,
    postprocessing); the class-level ``*_PARAMS`` lists define both membership
    and display order of each category.

    Each JSON file is parsed at most once per manager instance and memoized in
    ``self._cache``. Every getter returns a newly built dictionary, so callers
    may add, remove or reassign keys without affecting later lookups. The copy
    is shallow: mutable values such as lists are still shared with the cache.
    """

    # Parameter category definitions
    ARTIFICIAL_DATASET_PARAMS = [
        'N_SAMPLES',
        'N_FEATURES',
        'N_GENERATING_SOLUTIONS',
        'SPARSITY',
        'NOISE_STD',
        'NAN_RATIO',
        'BINARIZE',
        'BINARY_RESPONSE_RATIO',
        'DATASET_SEED',
    ]

    ALGORITHM_PARAMS = [
        'N_CANDIDATE_SOLUTIONS',
        'N_ITER',
        'PRIOR_TYPE',
        'PRIOR_SPARSITY',
        'SAMPLE_MORE_PRIORS_COEFF',
        'STUDENT_DF',
        'STUDENT_SCALE',
        'VAR_SLAB',
        'VAR_SPIKE',
        'WEIGHT_SLAB',
        'WEIGHT_SPIKE',
        'IS_REGULARIZED',
        'LAMBDA_JACCARD',
        'BATCH_SIZE',
        'LEARNING_RATE',
    ]

    POSTPROCESSING_PARAMS = [
        'DESIRED_SPARSITY',
        'MIN_MU_THRESHOLD',
        'USE_MEDIAN_FOR_OUTLIER_DETECTION',
        'OUTLIER_DEVIATION_THRESHOLDS',
    ]

    # Parameter descriptions for display
    PARAM_DESCRIPTIONS = {
        # Artificial dataset generation (development/demo only)
        'N_SAMPLES': 'Number of samples (rows) in the synthetic dataset.',
        'N_FEATURES': 'Number of features (columns) in the synthetic dataset.',
        'N_GENERATING_SOLUTIONS': "Number of distinct sparse solutions ('true' supports).",
        'SPARSITY': 'Number of nonzero features per solution (support size).',
        'NOISE_STD': 'Standard deviation of noise added to synthetic data.',
        'NAN_RATIO': 'Proportion of missing values (NaNs) in the synthetic dataset.',
        'BINARIZE': 'Whether to binarize the synthetic response variable.',
        'BINARY_RESPONSE_RATIO': 'Proportion of synthetic samples assigned label 1.',
        'DATASET_SEED': 'Random seed for synthetic data reproducibility.',
        # Algorithm settings
        'N_CANDIDATE_SOLUTIONS': (
            'Desired number of candidate solutions (components of the Gaussian mixture '
            'approximating the variational posterior). Set to 2-3x expected true solutions.'
        ),
        'N_ITER': 'Number of optimization iterations.',
        'PRIOR_TYPE': "Prior type ('ss', 'sss', or 'student')",
        'PRIOR_SPARSITY': (
            "Expected number of nonzero features per component. Used only in 'sss' prior"
        ),
        'SAMPLE_MORE_PRIORS_COEFF': (
            'Coefficient for increased support sampling. Experimental use only.'
        ),
        'STUDENT_DF': (
            "Degrees of freedom for the Student-t prior. Used only if PRIOR_TYPE is 'student'."
        ),
        'STUDENT_SCALE': (
            "Scale parameter for the Student-t prior. Used only if PRIOR_TYPE is 'student'."
        ),
        'VAR_SLAB': ("Variance of the 'slab' in 'ss' or 'sss' prior. Ignored for 'student' prior."),
        'VAR_SPIKE': (
            "Variance of the 'spike' in 'ss' or 'sss' prior. Ignored for 'student' prior."
        ),
        'WEIGHT_SLAB': ("Weight of the 'slab' in the 'ss' prior. Ignored for other priors."),
        'WEIGHT_SPIKE': ("Weight of the 'spike' in the 'ss' prior. Ignored for other priors."),
        'IS_REGULARIZED': 'Whether to use Jaccard similarity penalty.',
        'LAMBDA_JACCARD': (
            'Regularization strength for Jaccard penalty. Applies only if IS_REGULARIZED is True.'
        ),
        'BATCH_SIZE': 'Minibatch size for stochastic updates in the SGD optimization.',
        'LEARNING_RATE': 'Learning rate for the Adam optimizer.',
        # Postprocessing
        'DESIRED_SPARSITY': 'Desired number of features in final solution.',
        'MIN_MU_THRESHOLD': (
            'Minimum mu threshold for feature selection. Specific for each dataset.'
        ),
        'USE_MEDIAN_FOR_OUTLIER_DETECTION': (
            'Whether to use median and MAD or mean and STD when selecting features '
            'by outlier detection.'
        ),
        'OUTLIER_DEVIATION_THRESHOLDS': (
            'List of thresholding values of MAD or STD to define outliers.'
        ),
    }

    def __init__(self):
        # Directory holding the JSON configuration files (same directory as this module).
        self._config_dir = Path(__file__).parent
        # Per-instance memo: file name -> parsed JSON content.
        self._cache = {}

    def _load_json_file(self, filename: str) -> dict[str, Any]:
        """
        Parse a JSON file from the configuration directory, reading it at most once.

        Parameters
        ----------
        filename : str
            File name relative to the configuration directory.

        Returns
        -------
        dict[str, Any]
            Parsed file content. This is the memoized object itself; public
            getters copy out of it rather than hand it to callers.

        Raises
        ------
        FileNotFoundError
            If the file does not exist.
        ValueError
            If the file does not contain valid JSON.
        """
        if filename not in self._cache:
            file_path = self._config_dir / filename
            try:
                # JSON is UTF-8 by specification; do not depend on the locale encoding.
                with file_path.open(encoding='utf-8') as f:
                    self._cache[filename] = json.load(f)
            except FileNotFoundError as e:
                raise FileNotFoundError(f'Configuration file not found: {file_path}') from e
            except json.JSONDecodeError as e:
                raise ValueError(f'Invalid JSON in {file_path}: {e}') from e
        return self._cache[filename]

    def _select_params(self, filename: str, names: list[str]) -> dict[str, Any]:
        """Return a new dict with the entries of *filename* listed in *names*, in that order."""
        loaded = self._load_json_file(filename)
        # Names missing from the file are skipped silently rather than raising.
        return {name: loaded[name] for name in names if name in loaded}

    def get_artificial_dataset_params(self) -> dict[str, Any]:
        """Get artificial dataset generation parameters (for development/demo only)."""
        return self._select_params(
            CONFIG_FILES['ARTIFICIAL_DATASET'], self.ARTIFICIAL_DATASET_PARAMS
        )

    def get_algorithm_params(self) -> dict[str, Any]:
        """Get algorithm parameters."""
        return self._select_params(CONFIG_FILES['ALGORITHM'], self.ALGORITHM_PARAMS)

    def get_postprocessing_params(self) -> dict[str, Any]:
        """Get postprocessing parameters."""
        return self._select_params(CONFIG_FILES['POSTPROCESSING'], self.POSTPROCESSING_PARAMS)

    def get_all_params(self) -> dict[str, Any]:
        """
        Get all parameters in a single dictionary, preserving fixed order.

        The order is: artificial dataset parameters, then algorithm parameters,
        then postprocessing parameters, each in the order of its ``*_PARAMS`` list.
        """
        return {
            **self.get_artificial_dataset_params(),
            **self.get_algorithm_params(),
            **self.get_postprocessing_params(),
        }

    def get_params_by_category(self, category: str) -> dict[str, Any]:
        """
        Get parameters filtered by category.

        Parameters
        ----------
        category : str
            One of 'artificial_dataset' (alias 'dataset'), 'algorithm',
            'postprocessing' or 'all'.

        Returns
        -------
        dict[str, Any]
            Parameters of the requested category.

        Raises
        ------
        ValueError
            If the category name is not recognized.
        """
        if category in ('artificial_dataset', 'dataset'):
            return self.get_artificial_dataset_params()
        if category == 'algorithm':
            return self.get_algorithm_params()
        if category == 'postprocessing':
            return self.get_postprocessing_params()
        if category == 'all':
            return self.get_all_params()
        raise ValueError(
            f'Unknown category: {category}. '
            "Valid: 'artificial_dataset', 'algorithm', 'postprocessing', 'all'"
        )
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
# Single manager instance shared by the module-level helper functions.
_config_manager = ConfigurationManager()

# Snapshot of every parameter, taken once at import time. The constants below
# are plain module attributes kept for backward compatibility.
_all_params = _config_manager.get_all_params()

# --- Artificial dataset (synthetic data generation; development/demo only) ---
N_SAMPLES = _all_params['N_SAMPLES']
N_FEATURES = _all_params['N_FEATURES']
N_GENERATING_SOLUTIONS = _all_params['N_GENERATING_SOLUTIONS']
SPARSITY = _all_params['SPARSITY']
NOISE_STD = _all_params['NOISE_STD']
NAN_RATIO = _all_params['NAN_RATIO']
BINARIZE = _all_params['BINARIZE']
BINARY_RESPONSE_RATIO = _all_params['BINARY_RESPONSE_RATIO']
DATASET_SEED = _all_params['DATASET_SEED']

# --- Algorithm ---
N_CANDIDATE_SOLUTIONS = _all_params['N_CANDIDATE_SOLUTIONS']
N_ITER = _all_params['N_ITER']
PRIOR_TYPE = _all_params['PRIOR_TYPE']
# These two are looked up with .get(): a missing key yields None / 1.0
# instead of raising KeyError at import time.
PRIOR_SPARSITY = _all_params.get('PRIOR_SPARSITY')
SAMPLE_MORE_PRIORS_COEFF = _all_params.get('SAMPLE_MORE_PRIORS_COEFF', 1.0)
STUDENT_DF = _all_params['STUDENT_DF']
STUDENT_SCALE = _all_params['STUDENT_SCALE']
VAR_SLAB = _all_params['VAR_SLAB']
VAR_SPIKE = _all_params['VAR_SPIKE']
WEIGHT_SLAB = _all_params['WEIGHT_SLAB']
WEIGHT_SPIKE = _all_params['WEIGHT_SPIKE']
IS_REGULARIZED = _all_params['IS_REGULARIZED']
LAMBDA_JACCARD = _all_params['LAMBDA_JACCARD']
BATCH_SIZE = _all_params['BATCH_SIZE']
LEARNING_RATE = _all_params['LEARNING_RATE']

# --- Postprocessing ---
DESIRED_SPARSITY = _all_params['DESIRED_SPARSITY']
MIN_MU_THRESHOLD = _all_params['MIN_MU_THRESHOLD']
USE_MEDIAN_FOR_OUTLIER_DETECTION = _all_params['USE_MEDIAN_FOR_OUTLIER_DETECTION']
OUTLIER_DEVIATION_THRESHOLDS = _all_params['OUTLIER_DEVIATION_THRESHOLDS']
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
def check_sparsities(artificial_dataset: bool = True) -> None:
    """
    Print the sparsity-related settings so they can be compared at a glance.

    Parameters
    ----------
    artificial_dataset : bool
        If True, the true sparsity of the artificial dataset is printed as well.
        Only applicable when synthetic data is used.
    """
    report = ['Sparsity settings:']
    if artificial_dataset:
        report.append(f' - True sparsity of artificial dataset: {SPARSITY}')
    report.append(f' - Prior sparsity: {PRIOR_SPARSITY}')
    report.append(f' - Desired sparsity: {DESIRED_SPARSITY}')
    print('\n'.join(report))
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
def as_dict() -> dict[str, Any]:
    """Return a new dictionary holding every configuration parameter."""
    every_param = _config_manager.get_all_params()
    return dict(every_param)
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
def get_core_algorithm_params() -> dict[str, Any]:
    """
    Get the algorithm and postprocessing parameters, without the artificial dataset ones.

    These are the parameters needed to run the algorithm on real user
    datasets, i.e. when no synthetic data generation is involved.

    Returns
    -------
    dict[str, Any]
        Algorithm parameters followed by postprocessing parameters, each group
        in the order defined by the corresponding ``ConfigurationManager`` list.
    """
    sections = (
        (
            ConfigurationManager.ALGORITHM_PARAMS,
            _config_manager.get_params_by_category('algorithm'),
        ),
        (
            ConfigurationManager.POSTPROCESSING_PARAMS,
            _config_manager.get_params_by_category('postprocessing'),
        ),
    )

    merged: dict[str, Any] = {}
    for ordered_names, values in sections:
        # Names absent from the loaded values are skipped.
        merged.update((name, values[name]) for name in ordered_names if name in values)
    return merged
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
def get_params_by_category(category: str) -> dict[str, Any]:
    """
    Look up one category of parameters through the module-level manager.

    Parameters
    ----------
    category : str
        'artificial_dataset' (or its alias 'dataset'), 'algorithm',
        'postprocessing', or 'all'.

    Returns
    -------
    dict[str, Any]
        The parameters belonging to the requested category.

    Raises
    ------
    ValueError
        If the category name is not one of the accepted values.
    """
    selected = _config_manager.get_params_by_category(category)
    return selected
|
|
341
|
+
|
|
342
|
+
|
|
343
|
+
def get_current_config(
|
|
344
|
+
constants: dict[str, Any] | None = None,
|
|
345
|
+
constant_type: Literal[
|
|
346
|
+
'algorithm',
|
|
347
|
+
'postprocessing',
|
|
348
|
+
'algorithm_and_postprocessing',
|
|
349
|
+
'dataset',
|
|
350
|
+
'all',
|
|
351
|
+
] = 'all',
|
|
352
|
+
) -> str:
|
|
353
|
+
"""
|
|
354
|
+
Get configuration parameters in a formatted table.
|
|
355
|
+
|
|
356
|
+
Parameters
|
|
357
|
+
----------
|
|
358
|
+
constants : dict[str, Any] | None, optional
|
|
359
|
+
Configuration parameters to display. If None, uses current config.
|
|
360
|
+
constant_type : str
|
|
361
|
+
Parameter category to display:
|
|
362
|
+
'algorithm', 'postprocessing', 'algorithm_and_postprocessing', 'dataset', 'all'
|
|
363
|
+
|
|
364
|
+
Returns
|
|
365
|
+
-------
|
|
366
|
+
str
|
|
367
|
+
Formatted table of configuration parameters.
|
|
368
|
+
"""
|
|
369
|
+
if constants is None:
|
|
370
|
+
constants = as_dict()
|
|
371
|
+
|
|
372
|
+
# Map legacy category names and handle special cases
|
|
373
|
+
if constant_type in ('artificial_data', 'dataset'):
|
|
374
|
+
category = 'artificial_dataset'
|
|
375
|
+
elif constant_type == 'algorithm_and_postprocessing':
|
|
376
|
+
# Special case: combine algorithm and postprocessing parameters
|
|
377
|
+
algo_params = get_params_by_category('algorithm')
|
|
378
|
+
post_params = get_params_by_category('postprocessing')
|
|
379
|
+
filtered_constants = {**algo_params, **post_params}
|
|
380
|
+
constants = {k: v for k, v in constants.items() if k in filtered_constants}
|
|
381
|
+
else:
|
|
382
|
+
category = constant_type
|
|
383
|
+
|
|
384
|
+
if constant_type != 'algorithm_and_postprocessing':
|
|
385
|
+
if category != 'all':
|
|
386
|
+
filtered_constants = get_params_by_category(category)
|
|
387
|
+
constants = {k: v for k, v in constants.items() if k in filtered_constants}
|
|
388
|
+
|
|
389
|
+
if not constants:
|
|
390
|
+
return 'No parameters to display.'
|
|
391
|
+
|
|
392
|
+
# Create formatted table
|
|
393
|
+
table_lines = [
|
|
394
|
+
'| Parameter | Current Value | Description |',
|
|
395
|
+
'|-----------|---------------|-------------|',
|
|
396
|
+
]
|
|
397
|
+
|
|
398
|
+
for param_name in constants.keys():
|
|
399
|
+
param_value = constants[param_name]
|
|
400
|
+
description = ConfigurationManager.PARAM_DESCRIPTIONS.get(
|
|
401
|
+
param_name, 'Configuration parameter'
|
|
402
|
+
)
|
|
403
|
+
|
|
404
|
+
# Format value based on type
|
|
405
|
+
if isinstance(param_value, float):
|
|
406
|
+
formatted_value = f'{param_value:.6g}'
|
|
407
|
+
elif isinstance(param_value, str):
|
|
408
|
+
formatted_value = f'"{param_value}"'
|
|
409
|
+
else:
|
|
410
|
+
formatted_value = str(param_value)
|
|
411
|
+
|
|
412
|
+
table_lines.append(f'| `{param_name}` | {formatted_value} | {description} |')
|
|
413
|
+
|
|
414
|
+
return '\n'.join(table_lines)
|
|
415
|
+
|
|
416
|
+
|
|
417
|
+
def display_current_config(
    constants: dict[str, Any] | None = None,
    constant_type: Literal[
        'algorithm',
        'postprocessing',
        'algorithm_and_postprocessing',
        'dataset',
        'all',
    ] = 'all',
) -> None:
    """
    Show configuration parameters as a Markdown heading and table via IPython.

    If IPython could not be imported, a short notice is printed and nothing
    else happens.

    Parameters
    ----------
    constants : dict[str, Any] | None, optional
        Parameters to show. If None, the current configuration is used.
    constant_type : str
        Which category to show:
        'algorithm', 'postprocessing', 'algorithm_and_postprocessing', 'dataset', 'all'
    """
    if ipython_display is None:
        print('IPython not available. Cannot display formatted configuration.')
        return

    table = get_current_config(constants=constants, constant_type=constant_type)

    # Categories whose heading differs from the plain "<type> parameters" form.
    special_titles = {
        'artificial_data': 'artificial dataset parameters',
        'dataset': 'artificial dataset parameters',
        'algorithm_and_postprocessing': 'algorithm and postprocessing parameters',
    }
    section_title = special_titles.get(constant_type, f'{constant_type} parameters')

    ipython_display(Markdown(f'## Configuration: {section_title}'))
    ipython_display(Markdown(table))
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Project constants for GEMSS (Gaussian Ensemble for Multiple Sparse Solutions).
|
|
3
|
+
|
|
4
|
+
This module contains essential project-related constants including file names,
|
|
5
|
+
paths, and project metadata used throughout the configuration system.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Final
|
|
10
|
+
|
|
11
|
+
# Project metadata
PROJECT_NAME: Final = 'Gaussian Ensemble for Multiple Sparse Solutions'
PROJECT_ABBREV: Final = 'GEMSS'
CONFIG_PACKAGE_NAME: Final = 'gemss.config'

# JSON file name for each parameter category
CONFIG_FILES: Final = {
    'ARTIFICIAL_DATASET': 'generated_dataset_parameters.json',
    'ALGORITHM': 'algorithm_settings.json',
    'POSTPROCESSING': 'solution_postprocessing_settings.json',
}

# Directory three levels above this file.
# NOTE(review): presumably the repository root in a source checkout; for an
# installed wheel this would be the directory containing the `gemss` package
# (e.g. site-packages) - confirm the paths below are only used from a checkout.
ROOT_DIR = Path(__file__).resolve().parent.parent.parent
DATA_DIR = ROOT_DIR / 'data'

# Experiment results directory
EXPERIMENT_RESULTS_DIR: Final = ROOT_DIR / 'scripts' / 'results'
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Data handling utilities for GEMSS experiments.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from gemss.data_handling.data_processing import (
|
|
6
|
+
get_df_from_X,
|
|
7
|
+
get_feature_name_mapping,
|
|
8
|
+
load_data,
|
|
9
|
+
preprocess_features,
|
|
10
|
+
preprocess_non_numeric_features,
|
|
11
|
+
)
|
|
12
|
+
from gemss.data_handling.generate_artificial_dataset import (
|
|
13
|
+
generate_artificial_dataset,
|
|
14
|
+
generate_multi_solution_data,
|
|
15
|
+
show_overview_of_generated_data,
|
|
16
|
+
)
|