gemss 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. gemss/__init__.py +14 -0
  2. gemss/config/__init__.py +52 -0
  3. gemss/config/algorithm_settings.json +17 -0
  4. gemss/config/config.py +457 -0
  5. gemss/config/constants.py +26 -0
  6. gemss/config/generated_dataset_parameters.json +11 -0
  7. gemss/config/solution_postprocessing_settings.json +6 -0
  8. gemss/data_handling/__init__.py +16 -0
  9. gemss/data_handling/data_processing.py +269 -0
  10. gemss/data_handling/generate_artificial_dataset.py +338 -0
  11. gemss/diagnostics/__init__.py +19 -0
  12. gemss/diagnostics/performance_tests.py +700 -0
  13. gemss/diagnostics/recommendation_messages.py +153 -0
  14. gemss/diagnostics/recommendations.py +427 -0
  15. gemss/experiment_assessment/__init__.py +37 -0
  16. gemss/experiment_assessment/case_analysis.py +436 -0
  17. gemss/experiment_assessment/experiment_results_analysis.py +765 -0
  18. gemss/experiment_assessment/experiment_results_interactive.py +365 -0
  19. gemss/experiment_assessment/experiment_results_visualizations.py +764 -0
  20. gemss/feature_selection/__init__.py +36 -0
  21. gemss/feature_selection/inference.py +407 -0
  22. gemss/feature_selection/models.py +384 -0
  23. gemss/postprocessing/__init__.py +45 -0
  24. gemss/postprocessing/outliers.py +434 -0
  25. gemss/postprocessing/result_postprocessing.py +640 -0
  26. gemss/postprocessing/simple_regressions.py +600 -0
  27. gemss/postprocessing/tabpfn_evaluation.py +298 -0
  28. gemss/utils/__init__.py +53 -0
  29. gemss/utils/utils.py +830 -0
  30. gemss/utils/visualizations.py +940 -0
  31. gemss-1.0.1.dist-info/METADATA +285 -0
  32. gemss-1.0.1.dist-info/RECORD +34 -0
  33. gemss-1.0.1.dist-info/WHEEL +4 -0
  34. gemss-1.0.1.dist-info/licenses/LICENSE +21 -0
gemss/__init__.py ADDED
@@ -0,0 +1,14 @@
1
+ """
2
+ GEMSS Package
3
+
4
+ GEMSS (Gaussian Ensemble for Multiple Sparse Solutions) is an algorithm for
5
+ feature selection in high-dimensional data. It is intended to be used during
6
+ dataset analysis to identify relevant features for predictive modeling.
7
+
8
+ GEMSS is a Bayesian variational method that approximates multimodal
9
+ posteriors by Gaussian mixtures to recover diversified sparse feature sets.
10
+ """
11
+
import importlib.metadata

# Resolve the version from the installed distribution metadata. When the package is
# imported without being installed (e.g. straight from a source checkout on sys.path),
# no metadata exists and the lookup raises; fall back to a placeholder instead of making
# `import gemss` fail.
try:
    __version__ = importlib.metadata.version('gemss')
except importlib.metadata.PackageNotFoundError:
    __version__ = '0.0.0+unknown'
gemss/config/__init__.py ADDED
@@ -0,0 +1,52 @@
1
+ """
2
+ Configuration package for GEMSS (Gaussian Ensemble for Multiple Sparse Solutions).
3
+
4
+ This package contains configuration loading functionality and JSON parameter files.
5
+ """
6
+
7
+ from gemss.config.config import (
8
+ BATCH_SIZE,
9
+ BINARIZE,
10
+ BINARY_RESPONSE_RATIO,
11
+ DATASET_SEED,
12
+ DESIRED_SPARSITY,
13
+ IS_REGULARIZED,
14
+ LAMBDA_JACCARD,
15
+ LEARNING_RATE,
16
+ MIN_MU_THRESHOLD,
17
+ N_CANDIDATE_SOLUTIONS,
18
+ N_FEATURES,
19
+ N_GENERATING_SOLUTIONS,
20
+ N_ITER,
21
+ N_SAMPLES,
22
+ NAN_RATIO,
23
+ NOISE_STD,
24
+ OUTLIER_DEVIATION_THRESHOLDS,
25
+ PRIOR_SPARSITY,
26
+ PRIOR_TYPE,
27
+ SAMPLE_MORE_PRIORS_COEFF,
28
+ SPARSITY,
29
+ STUDENT_DF,
30
+ STUDENT_SCALE,
31
+ USE_MEDIAN_FOR_OUTLIER_DETECTION,
32
+ VAR_SLAB,
33
+ VAR_SPIKE,
34
+ WEIGHT_SLAB,
35
+ WEIGHT_SPIKE,
36
+ ConfigurationManager,
37
+ as_dict,
38
+ check_sparsities,
39
+ display_current_config,
40
+ get_core_algorithm_params,
41
+ get_current_config,
42
+ get_params_by_category,
43
+ )
44
+ from gemss.config.constants import (
45
+ CONFIG_FILES,
46
+ CONFIG_PACKAGE_NAME,
47
+ DATA_DIR,
48
+ EXPERIMENT_RESULTS_DIR,
49
+ PROJECT_ABBREV,
50
+ PROJECT_NAME,
51
+ ROOT_DIR,
52
+ )
gemss/config/algorithm_settings.json ADDED
@@ -0,0 +1,17 @@
1
+ {
2
+ "N_ITER": 4000,
3
+ "SAMPLE_MORE_PRIORS_COEFF": 1,
4
+ "WEIGHT_SPIKE": 0.1,
5
+ "LAMBDA_JACCARD": 500,
6
+ "STUDENT_SCALE": 1,
7
+ "LEARNING_RATE": 0.002,
8
+ "IS_REGULARIZED": true,
9
+ "BATCH_SIZE": 32,
10
+ "VAR_SLAB": 100,
11
+ "VAR_SPIKE": 0.1,
12
+ "PRIOR_SPARSITY": 3,
13
+ "STUDENT_DF": 1,
14
+ "PRIOR_TYPE": "sss",
15
+ "WEIGHT_SLAB": 0.9,
16
+ "N_CANDIDATE_SOLUTIONS": 8
17
+ }
gemss/config/config.py ADDED
@@ -0,0 +1,457 @@
1
+ """
2
+ Configuration loader for GEMSS (Gaussian Ensemble for Multiple Sparse Solutions).
3
+
4
+ This module provides efficient loading and management of experiment parameters
5
+ from JSON configuration files co-located with this module.
6
+
7
+ Parameter Categories:
8
+ - Artificial Dataset: Parameters for synthetic data generation (development/demo only)
9
+ - Algorithm: Core algorithm parameters (used for both synthetic and real data)
10
+ - Postprocessing: Solution recovery and analysis parameters
11
+
12
+ Features:
13
+ - Lazy loading with caching
14
+ - Comprehensive parameter validation
15
+ - Structured parameter access by category
16
+ - Rich display functionality for notebooks
17
+ - Efficient dictionary conversion for logging
18
+
19
+ Usage:
20
+ import gemss.config as config
21
+ # Core algorithm: config.N_CANDIDATE_SOLUTIONS, config.PRIOR_TYPE, etc.
22
+ # Access parameters: config.N_SAMPLES, config.N_CANDIDATE_SOLUTIONS, etc.
23
+ # Artificial data: config.N_SAMPLES, config.N_FEATURES, etc. (demo only)
24
+ # Display configuration: config.display_current_config(config.as_dict())
25
+ """
26
+
27
+ import json
28
+ from collections import OrderedDict
29
+ from functools import cache, lru_cache
30
+ from pathlib import Path
31
+ from typing import Any, Literal
32
+
33
+ from .constants import CONFIG_FILES
34
+
35
+ try:
36
+ from IPython.display import Markdown
37
+ from IPython.display import display as ipython_display
38
+ except ImportError:
39
+ Markdown = None # noqa: N816
40
+ ipython_display = None
41
+
42
+
class ConfigurationManager:
    """
    Load GEMSS parameters from the JSON files next to this module, grouped by category.

    Each configuration file is read from disk lazily and at most once per successfully
    loaded file and instance: the raw JSON text is kept in ``self._cache`` and parsed
    again on every access. Callers therefore always receive a fresh dictionary and cannot
    alter the cached configuration by mutating a returned value. The cache lives on the
    instance (not in a ``functools.cache`` wrapper around the methods), so it is released
    together with the instance.
    """

    # Parameter category definitions; the list order is the display/return order.
    ARTIFICIAL_DATASET_PARAMS = [
        'N_SAMPLES',
        'N_FEATURES',
        'N_GENERATING_SOLUTIONS',
        'SPARSITY',
        'NOISE_STD',
        'NAN_RATIO',
        'BINARIZE',
        'BINARY_RESPONSE_RATIO',
        'DATASET_SEED',
    ]

    ALGORITHM_PARAMS = [
        'N_CANDIDATE_SOLUTIONS',
        'N_ITER',
        'PRIOR_TYPE',
        'PRIOR_SPARSITY',
        'SAMPLE_MORE_PRIORS_COEFF',
        'STUDENT_DF',
        'STUDENT_SCALE',
        'VAR_SLAB',
        'VAR_SPIKE',
        'WEIGHT_SLAB',
        'WEIGHT_SPIKE',
        'IS_REGULARIZED',
        'LAMBDA_JACCARD',
        'BATCH_SIZE',
        'LEARNING_RATE',
    ]

    POSTPROCESSING_PARAMS = [
        'DESIRED_SPARSITY',
        'MIN_MU_THRESHOLD',
        'USE_MEDIAN_FOR_OUTLIER_DETECTION',
        'OUTLIER_DEVIATION_THRESHOLDS',
    ]

    # Parameter descriptions for display
    PARAM_DESCRIPTIONS = {
        # Artificial dataset generation (development/demo only)
        'N_SAMPLES': 'Number of samples (rows) in the synthetic dataset.',
        'N_FEATURES': 'Number of features (columns) in the synthetic dataset.',
        'N_GENERATING_SOLUTIONS': "Number of distinct sparse solutions ('true' supports).",
        'SPARSITY': 'Number of nonzero features per solution (support size).',
        'NOISE_STD': 'Standard deviation of noise added to synthetic data.',
        'NAN_RATIO': 'Proportion of missing values (NaNs) in the synthetic dataset.',
        'BINARIZE': 'Whether to binarize the synthetic response variable.',
        'BINARY_RESPONSE_RATIO': 'Proportion of synthetic samples assigned label 1.',
        'DATASET_SEED': 'Random seed for synthetic data reproducibility.',
        # Algorithm settings
        'N_CANDIDATE_SOLUTIONS': (
            'Desired number of candidate solutions (components of the Gaussian mixture '
            'approximating the variational posterior). Set to 2-3x expected true solutions.'
        ),
        'N_ITER': 'Number of optimization iterations.',
        'PRIOR_TYPE': "Prior type ('ss', 'sss', or 'student')",
        'PRIOR_SPARSITY': (
            "Expected number of nonzero features per component. Used only in 'sss' prior"
        ),
        'SAMPLE_MORE_PRIORS_COEFF': (
            'Coefficient for increased support sampling. Experimental use only.'
        ),
        'STUDENT_DF': (
            "Degrees of freedom for the Student-t prior. Used only if PRIOR_TYPE is 'student'."
        ),
        'STUDENT_SCALE': (
            "Scale parameter for the Student-t prior. Used only if PRIOR_TYPE is 'student'."
        ),
        'VAR_SLAB': ("Variance of the 'slab' in 'ss' or 'sss' prior. Ignored for 'student' prior."),
        'VAR_SPIKE': (
            "Variance of the 'spike' in 'ss' or 'sss' prior. Ignored for 'student' prior."
        ),
        'WEIGHT_SLAB': ("Weight of the 'slab' in the 'ss' prior. Ignored for other priors."),
        'WEIGHT_SPIKE': ("Weight of the 'spike' in the 'ss' prior. Ignored for other priors."),
        'IS_REGULARIZED': 'Whether to use Jaccard similarity penalty.',
        'LAMBDA_JACCARD': (
            'Regularization strength for Jaccard penalty. Applies only if IS_REGULARIZED is True.'
        ),
        'BATCH_SIZE': 'Minibatch size for stochastic updates in the SGD optimization.',
        'LEARNING_RATE': 'Learning rate for the Adam optimizer.',
        # Postprocessing
        'DESIRED_SPARSITY': 'Desired number of features in final solution.',
        'MIN_MU_THRESHOLD': (
            'Minimum mu threshold for feature selection. Specific for each dataset.'
        ),
        'USE_MEDIAN_FOR_OUTLIER_DETECTION': (
            'Whether to use median and MAD or mean and STD when selecting features '
            'by outlier detection.'
        ),
        'OUTLIER_DEVIATION_THRESHOLDS': (
            'List of thresholding values of MAD or STD to define outliers.'
        ),
    }

    def __init__(self) -> None:
        # Directory holding the JSON configuration files (the directory of this module).
        self._config_dir = Path(__file__).parent
        # File name -> raw JSON text of every file loaded so far. Text is immutable, so
        # sharing it between calls is safe; it is parsed anew for each caller.
        self._cache: dict[str, str] = {}

    def _load_json_file(self, filename: str) -> dict[str, Any]:
        """
        Return the parsed contents of a JSON file located in the configuration directory.

        Parameters
        ----------
        filename : str
            Name of the file, relative to the configuration directory.

        Returns
        -------
        dict[str, Any]
            Freshly parsed file contents; mutating it does not affect later calls.

        Raises
        ------
        FileNotFoundError
            If the file does not exist.
        ValueError
            If the file does not contain valid JSON.
        """
        file_path = self._config_dir / filename
        text = self._cache.get(filename)
        if text is None:
            try:
                text = file_path.read_text(encoding='utf-8')
            except FileNotFoundError as e:
                raise FileNotFoundError(f'Configuration file not found: {file_path}') from e
        try:
            parsed = json.loads(text)
        except json.JSONDecodeError as e:
            raise ValueError(f'Invalid JSON in {file_path}: {e}') from e
        # Remember the text only after it parsed successfully, so a broken file is
        # re-read (and may have been repaired) on the next attempt.
        self._cache[filename] = text
        return parsed

    def _select_params(self, config_key: str, names: list[str]) -> dict[str, Any]:
        """Load the file registered under ``config_key`` and keep ``names``, in that order."""
        loaded = self._load_json_file(CONFIG_FILES[config_key])
        # Keys absent from the file are skipped; keys not listed in ``names`` are dropped.
        return {name: loaded[name] for name in names if name in loaded}

    def get_artificial_dataset_params(self) -> dict[str, Any]:
        """Get artificial dataset generation parameters (for development/demo only)."""
        return self._select_params('ARTIFICIAL_DATASET', self.ARTIFICIAL_DATASET_PARAMS)

    def get_algorithm_params(self) -> dict[str, Any]:
        """Get algorithm parameters."""
        return self._select_params('ALGORITHM', self.ALGORITHM_PARAMS)

    def get_postprocessing_params(self) -> dict[str, Any]:
        """Get postprocessing parameters."""
        return self._select_params('POSTPROCESSING', self.POSTPROCESSING_PARAMS)

    def get_all_params(self) -> dict[str, Any]:
        """
        Get all parameters in a single dictionary, preserving fixed order.

        The order is: artificial dataset, algorithm, postprocessing parameters, each group
        in the order of its category list.
        """
        return {
            **self.get_artificial_dataset_params(),
            **self.get_algorithm_params(),
            **self.get_postprocessing_params(),
        }

    def get_params_by_category(self, category: str) -> dict[str, Any]:
        """
        Get parameters filtered by category.

        Parameters
        ----------
        category : str
            'artificial_dataset' (or its alias 'dataset'), 'algorithm', 'postprocessing'
            or 'all'.

        Returns
        -------
        dict[str, Any]
            Parameters of the requested category.

        Raises
        ------
        ValueError
            If ``category`` is not one of the names above.
        """
        if category in ('artificial_dataset', 'dataset'):
            return self.get_artificial_dataset_params()
        if category == 'algorithm':
            return self.get_algorithm_params()
        if category == 'postprocessing':
            return self.get_postprocessing_params()
        if category == 'all':
            return self.get_all_params()
        raise ValueError(
            f'Unknown category: {category}. '
            "Valid: 'artificial_dataset', 'algorithm', 'postprocessing', 'all'"
        )
234
+
235
+
# Single shared manager backing the module-level constants and helper functions below.
_config_manager = ConfigurationManager()

# All parameters are read once, at import time, and exposed as plain module attributes
# (kept for backward compatibility with code that does `config.N_ITER` etc.).
_all_params = _config_manager.get_all_params()

# --- Artificial dataset (synthetic data generation; development/demo only) ---
N_SAMPLES = _all_params['N_SAMPLES']
N_FEATURES = _all_params['N_FEATURES']
N_GENERATING_SOLUTIONS = _all_params['N_GENERATING_SOLUTIONS']
SPARSITY = _all_params['SPARSITY']
NOISE_STD = _all_params['NOISE_STD']
NAN_RATIO = _all_params['NAN_RATIO']
BINARIZE = _all_params['BINARIZE']
BINARY_RESPONSE_RATIO = _all_params['BINARY_RESPONSE_RATIO']
DATASET_SEED = _all_params['DATASET_SEED']

# --- Algorithm ---
N_CANDIDATE_SOLUTIONS = _all_params['N_CANDIDATE_SOLUTIONS']
N_ITER = _all_params['N_ITER']
PRIOR_TYPE = _all_params['PRIOR_TYPE']
# The next two keys may be absent from the JSON file: PRIOR_SPARSITY then becomes None
# and SAMPLE_MORE_PRIORS_COEFF defaults to 1.0. Every other key is required.
PRIOR_SPARSITY = _all_params.get('PRIOR_SPARSITY')
SAMPLE_MORE_PRIORS_COEFF = _all_params.get('SAMPLE_MORE_PRIORS_COEFF', 1.0)
STUDENT_DF = _all_params['STUDENT_DF']
STUDENT_SCALE = _all_params['STUDENT_SCALE']
VAR_SLAB = _all_params['VAR_SLAB']
VAR_SPIKE = _all_params['VAR_SPIKE']
WEIGHT_SLAB = _all_params['WEIGHT_SLAB']
WEIGHT_SPIKE = _all_params['WEIGHT_SPIKE']
IS_REGULARIZED = _all_params['IS_REGULARIZED']
LAMBDA_JACCARD = _all_params['LAMBDA_JACCARD']
BATCH_SIZE = _all_params['BATCH_SIZE']
LEARNING_RATE = _all_params['LEARNING_RATE']

# --- Postprocessing ---
DESIRED_SPARSITY = _all_params['DESIRED_SPARSITY']
MIN_MU_THRESHOLD = _all_params['MIN_MU_THRESHOLD']
USE_MEDIAN_FOR_OUTLIER_DETECTION = _all_params['USE_MEDIAN_FOR_OUTLIER_DETECTION']
OUTLIER_DEVIATION_THRESHOLDS = _all_params['OUTLIER_DEVIATION_THRESHOLDS']
275
+
276
+
def check_sparsities(artificial_dataset: bool = True) -> None:
    """
    Print the sparsity-related settings so they can be compared at a glance.

    Parameters
    ----------
    artificial_dataset : bool
        If True, the true sparsity of the artificial dataset is printed as well.
        Only meaningful when synthetic data is used.
    """
    report = ['Sparsity settings:']
    if artificial_dataset:
        report.append(f' - True sparsity of artificial dataset: {SPARSITY}')
    report.append(f' - Prior sparsity: {PRIOR_SPARSITY}')
    report.append(f' - Desired sparsity: {DESIRED_SPARSITY}')
    # One line per entry, exactly as separate print() calls would produce.
    print('\n'.join(report))
291
+
292
+
def as_dict() -> dict[str, Any]:
    """Return a shallow copy of the dictionary holding all configuration parameters."""
    every_param = _config_manager.get_all_params()
    return dict(every_param)
296
+
297
+
def get_core_algorithm_params() -> dict[str, Any]:
    """
    Return the algorithm and postprocessing parameters, without the artificial dataset ones.

    These are the parameters needed to run the algorithm on a real user dataset,
    i.e. when no synthetic data is generated.

    Returns
    -------
    dict[str, Any]
        Algorithm parameters followed by postprocessing parameters, each group in the
        order of the corresponding ``ConfigurationManager`` category list.
    """
    # (ordered names, loaded values) for each category, in output order.
    groups = (
        (
            ConfigurationManager.ALGORITHM_PARAMS,
            _config_manager.get_params_by_category('algorithm'),
        ),
        (
            ConfigurationManager.POSTPROCESSING_PARAMS,
            _config_manager.get_params_by_category('postprocessing'),
        ),
    )
    return {name: values[name] for names, values in groups for name in names if name in values}
323
+
324
+
def get_params_by_category(category: str) -> dict[str, Any]:
    """
    Return the parameters of one category from the shared configuration manager.

    Parameters
    ----------
    category : str
        One of 'artificial_dataset' (alias 'dataset', kept for compatibility),
        'algorithm', 'postprocessing' or 'all'.

    Returns
    -------
    dict[str, Any]
        Parameters belonging to the requested category.
    """
    params_for_category = _config_manager.get_params_by_category(category)
    return params_for_category
341
+
342
+
def get_current_config(
    constants: dict[str, Any] | None = None,
    constant_type: Literal[
        'algorithm',
        'postprocessing',
        'algorithm_and_postprocessing',
        'dataset',
        'all',
    ] = 'all',
) -> str:
    """
    Render configuration parameters as a Markdown table.

    Parameters
    ----------
    constants : dict[str, Any] | None, optional
        Parameters to render. If None, the current configuration is used.
    constant_type : str
        Which parameters to keep: 'algorithm', 'postprocessing',
        'algorithm_and_postprocessing', 'dataset' or 'all'.

    Returns
    -------
    str
        The table (one row per parameter: name, value, description), or
        'No parameters to display.' when nothing is left after filtering.
    """
    shown = as_dict() if constants is None else constants

    # Work out which parameter names survive the filter; None means "keep everything".
    if constant_type == 'algorithm_and_postprocessing':
        allowed = {
            **get_params_by_category('algorithm'),
            **get_params_by_category('postprocessing'),
        }
    elif constant_type == 'all':
        allowed = None
    elif constant_type in ('artificial_data', 'dataset'):
        # Legacy names for the artificial dataset category.
        allowed = get_params_by_category('artificial_dataset')
    else:
        allowed = get_params_by_category(constant_type)

    if allowed is not None:
        shown = {name: value for name, value in shown.items() if name in allowed}

    if not shown:
        return 'No parameters to display.'

    rows = [
        '| Parameter | Current Value | Description |',
        '|-----------|---------------|-------------|',
    ]
    for name, value in shown.items():
        description = ConfigurationManager.PARAM_DESCRIPTIONS.get(name, 'Configuration parameter')

        # Floats are shortened to 6 significant digits, strings are quoted.
        if isinstance(value, float):
            rendered = f'{value:.6g}'
        elif isinstance(value, str):
            rendered = f'"{value}"'
        else:
            rendered = str(value)

        rows.append(f'| `{name}` | {rendered} | {description} |')

    return '\n'.join(rows)
415
+
416
+
def display_current_config(
    constants: dict[str, Any] | None = None,
    constant_type: Literal[
        'algorithm',
        'postprocessing',
        'algorithm_and_postprocessing',
        'dataset',
        'all',
    ] = 'all',
) -> None:
    """
    Display configuration parameters as a formatted table in an IPython environment.

    If IPython could not be imported, a notice is printed and nothing is displayed.

    Parameters
    ----------
    constants : dict[str, Any] | None, optional
        Configuration parameters to display. If None, uses current config.
    constant_type : str
        Parameter category to display:
        'algorithm', 'postprocessing', 'algorithm_and_postprocessing', 'dataset', 'all'
    """
    if ipython_display is None:
        print('IPython not available. Cannot display formatted configuration.')
        return

    table = get_current_config(
        constants=constants,
        constant_type=constant_type,
    )

    # Human-readable heading. 'artificial_dataset' is the canonical category name and
    # 'artificial_data' / 'dataset' are its aliases; all three get the same heading
    # (previously the canonical name was rendered verbatim, underscore included).
    if constant_type in ('artificial_data', 'artificial_dataset', 'dataset'):
        section_title = 'artificial dataset parameters'
    elif constant_type == 'algorithm_and_postprocessing':
        section_title = 'algorithm and postprocessing parameters'
    else:
        section_title = f'{constant_type} parameters'

    ipython_display(Markdown(f'## Configuration: {section_title}'))
    ipython_display(Markdown(table))
gemss/config/constants.py ADDED
@@ -0,0 +1,26 @@
1
+ """
2
+ Project constants for GEMSS (Gaussian Ensemble for Multiple Sparse Solutions).
3
+
4
+ This module contains essential project-related constants including file names,
5
+ paths, and project metadata used throughout the configuration system.
6
+ """
7
+
8
+ from pathlib import Path
9
+ from typing import Final
10
+
# Configuration file names
CONFIG_FILES: Final[dict[str, str]] = {
    'ARTIFICIAL_DATASET': 'generated_dataset_parameters.json',
    'ALGORITHM': 'algorithm_settings.json',
    'POSTPROCESSING': 'solution_postprocessing_settings.json',
}

# Directory three levels above this file, i.e. the directory that contains the `gemss`
# package. NOTE(review): in a source checkout this is presumably the repository root; for
# an installed wheel it is the install location (e.g. site-packages), so DATA_DIR and
# EXPERIMENT_RESULTS_DIR may not exist there - confirm before relying on them.
ROOT_DIR: Final[Path] = Path(__file__).resolve().parent.parent.parent
DATA_DIR: Final[Path] = ROOT_DIR / 'data'

# Project metadata
PROJECT_NAME: Final[str] = 'Gaussian Ensemble for Multiple Sparse Solutions'
PROJECT_ABBREV: Final[str] = 'GEMSS'
CONFIG_PACKAGE_NAME: Final[str] = 'gemss.config'

# Experiment results directory
EXPERIMENT_RESULTS_DIR: Final[Path] = ROOT_DIR / 'scripts' / 'results'
gemss/config/generated_dataset_parameters.json ADDED
@@ -0,0 +1,11 @@
1
+ {
2
+ "N_FEATURES": 500,
3
+ "N_SAMPLES": 100,
4
+ "NOISE_STD": 0.1,
5
+ "NAN_RATIO": 0.0,
6
+ "N_GENERATING_SOLUTIONS": 3,
7
+ "DATASET_SEED": 42,
8
+ "BINARIZE": true,
9
+ "SPARSITY": 3,
10
+ "BINARY_RESPONSE_RATIO": 0.5
11
+ }
gemss/config/solution_postprocessing_settings.json ADDED
@@ -0,0 +1,6 @@
1
+ {
2
+ "DESIRED_SPARSITY": 3,
3
+ "MIN_MU_THRESHOLD": 0.2,
4
+ "USE_MEDIAN_FOR_OUTLIER_DETECTION": false,
5
+ "OUTLIER_DEVIATION_THRESHOLDS": [2.5, 3.0]
6
+ }
gemss/data_handling/__init__.py ADDED
@@ -0,0 +1,16 @@
1
+ """
2
+ Data handling utilities for GEMSS experiments.
3
+ """
4
+
5
+ from gemss.data_handling.data_processing import (
6
+ get_df_from_X,
7
+ get_feature_name_mapping,
8
+ load_data,
9
+ preprocess_features,
10
+ preprocess_non_numeric_features,
11
+ )
12
+ from gemss.data_handling.generate_artificial_dataset import (
13
+ generate_artificial_dataset,
14
+ generate_multi_solution_data,
15
+ show_overview_of_generated_data,
16
+ )