ftir-prep 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. ftir_prep-0.1.0/LICENSE +21 -0
  2. ftir_prep-0.1.0/MANIFEST.in +40 -0
  3. ftir_prep-0.1.0/PKG-INFO +316 -0
  4. ftir_prep-0.1.0/README.md +264 -0
  5. ftir_prep-0.1.0/dataset/absorbance.dat +159 -0
  6. ftir_prep-0.1.0/dataset/wavenumbers.dat +1867 -0
  7. ftir_prep-0.1.0/examples/README.md +131 -0
  8. ftir_prep-0.1.0/examples/compare_pipelines/compare_pipelines.py +99 -0
  9. ftir_prep-0.1.0/examples/create_pipeline/direct_configuration.py +79 -0
  10. ftir_prep-0.1.0/examples/create_pipeline/pipeline_builder.py +81 -0
  11. ftir_prep-0.1.0/examples/explainer_analysis/explainer_example.py +145 -0
  12. ftir_prep-0.1.0/examples/pipeline_search/pipeline_search.py +164 -0
  13. ftir_prep-0.1.0/examples/pipeline_search/pipeline_search_nested.py +138 -0
  14. ftir_prep-0.1.0/examples/read_pipeline_from_file/read_pipeline_file.py +67 -0
  15. ftir_prep-0.1.0/pyproject.toml +112 -0
  16. ftir_prep-0.1.0/setup.cfg +4 -0
  17. ftir_prep-0.1.0/setup.py +72 -0
  18. ftir_prep-0.1.0/src/ftir_framework/__init__.py +87 -0
  19. ftir_prep-0.1.0/src/ftir_framework/config/__init__.py +25 -0
  20. ftir_prep-0.1.0/src/ftir_framework/config/settings.py +97 -0
  21. ftir_prep-0.1.0/src/ftir_framework/core/__init__.py +15 -0
  22. ftir_prep-0.1.0/src/ftir_framework/core/evaluator.py +692 -0
  23. ftir_prep-0.1.0/src/ftir_framework/core/explainer.py +329 -0
  24. ftir_prep-0.1.0/src/ftir_framework/core/pipeline.py +262 -0
  25. ftir_prep-0.1.0/src/ftir_framework/optimization/__init__.py +9 -0
  26. ftir_prep-0.1.0/src/ftir_framework/optimization/optuna_optimizer.py +1103 -0
  27. ftir_prep-0.1.0/src/ftir_framework/preprocessing/__init__.py +67 -0
  28. ftir_prep-0.1.0/src/ftir_framework/preprocessing/baseline.py +656 -0
  29. ftir_prep-0.1.0/src/ftir_framework/preprocessing/derivatives.py +98 -0
  30. ftir_prep-0.1.0/src/ftir_framework/preprocessing/normalization.py +333 -0
  31. ftir_prep-0.1.0/src/ftir_framework/preprocessing/smoothing.py +602 -0
  32. ftir_prep-0.1.0/src/ftir_framework/preprocessing/truncation.py +158 -0
  33. ftir_prep-0.1.0/src/ftir_framework/utils/__init__.py +9 -0
  34. ftir_prep-0.1.0/src/ftir_framework/utils/data_loader.py +160 -0
  35. ftir_prep-0.1.0/src/ftir_prep.egg-info/PKG-INFO +316 -0
  36. ftir_prep-0.1.0/src/ftir_prep.egg-info/SOURCES.txt +38 -0
  37. ftir_prep-0.1.0/src/ftir_prep.egg-info/dependency_links.txt +1 -0
  38. ftir_prep-0.1.0/src/ftir_prep.egg-info/not-zip-safe +1 -0
  39. ftir_prep-0.1.0/src/ftir_prep.egg-info/requires.txt +22 -0
  40. ftir_prep-0.1.0/src/ftir_prep.egg-info/top_level.txt +1 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Lucas Mendonça
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,40 @@
1
+ # Include the README and license files
2
+ include README.md
3
+ include LICENSE
4
+
5
+ # Include configuration files
6
+ include pyproject.toml
7
+ include setup.py
8
+
9
+ # Include examples directory
10
+ recursive-include examples *.py
11
+ include examples/README.md
12
+
13
+ # Include dataset for examples
14
+ recursive-include dataset *.dat
15
+
16
+ # Exclude unnecessary files
17
+ global-exclude *.pyc
18
+ global-exclude *.pyo
19
+ global-exclude *.pyd
20
+ global-exclude __pycache__
21
+ global-exclude .git*
22
+ global-exclude .DS_Store
23
+ global-exclude *.so
24
+
25
+ # Exclude build and distribution directories
26
+ prune build
27
+ prune dist
28
+ prune *.egg-info
29
+ prune venv
30
+ prune .venv
31
+
32
+ # Exclude test files from distribution
33
+ prune tests
34
+
35
+ # Exclude development files
36
+ exclude .gitignore
37
+ exclude *.ods
38
+ exclude metodos_a_avaliar.txt
39
+ exclude optimization_metadata.json
40
+ exclude best_pipeline_found.json
@@ -0,0 +1,316 @@
1
+ Metadata-Version: 2.4
2
+ Name: ftir-prep
3
+ Version: 0.1.0
4
+ Summary: A framework for designing and evaluating optimal preprocessing pipelines for FTIR spectral data used in classification tasks. It provides modular implementations of common preprocessing techniques and allows automated exploration of preprocessing combinations to enhance model performance.
5
+ Home-page: https://github.com/lucas-mendonca-andrade/FTIR-Prep
6
+ Author: Lucas Mendonça
7
+ Author-email: Lucas Mendonça <lucas.mendonca@example.com>
8
+ Maintainer-email: Lucas Mendonça <lucas.mendonca@example.com>
9
+ License-Expression: MIT
10
+ Project-URL: Homepage, https://github.com/lucas-mendonca-andrade/FTIR-Prep
11
+ Project-URL: Repository, https://github.com/lucas-mendonca-andrade/FTIR-Prep
12
+ Project-URL: Bug Tracker, https://github.com/lucas-mendonca-andrade/FTIR-Prep/issues
13
+ Project-URL: Documentation, https://github.com/lucas-mendonca-andrade/FTIR-Prep/docs
14
+ Keywords: ftir,spectroscopy,preprocessing,machine-learning,optimization,bioinformatics
15
+ Classifier: Development Status :: 3 - Alpha
16
+ Classifier: Intended Audience :: Science/Research
17
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
18
+ Classifier: Topic :: Scientific/Engineering :: Chemistry
19
+ Classifier: Programming Language :: Python :: 3
20
+ Classifier: Programming Language :: Python :: 3.8
21
+ Classifier: Programming Language :: Python :: 3.9
22
+ Classifier: Programming Language :: Python :: 3.10
23
+ Classifier: Programming Language :: Python :: 3.11
24
+ Classifier: Programming Language :: Python :: 3.12
25
+ Requires-Python: >=3.8
26
+ Description-Content-Type: text/markdown
27
+ License-File: LICENSE
28
+ Requires-Dist: numpy>=1.20.0
29
+ Requires-Dist: scikit-learn>=1.0.0
30
+ Requires-Dist: optuna>=3.0.0
31
+ Requires-Dist: rampy>=0.1.0
32
+ Requires-Dist: PyWavelets>=1.9.0
33
+ Requires-Dist: statsmodels>=0.13.0
34
+ Requires-Dist: pandas>=1.3.0
35
+ Requires-Dist: matplotlib>=3.5.0
36
+ Requires-Dist: shap>=0.41.0
37
+ Requires-Dist: openpyxl>=3.0.0
38
+ Provides-Extra: dev
39
+ Requires-Dist: pytest>=6.0; extra == "dev"
40
+ Requires-Dist: pytest-cov>=3.0; extra == "dev"
41
+ Requires-Dist: black>=22.0; extra == "dev"
42
+ Requires-Dist: flake8>=4.0; extra == "dev"
43
+ Requires-Dist: mypy>=0.950; extra == "dev"
44
+ Provides-Extra: docs
45
+ Requires-Dist: sphinx>=4.0; extra == "docs"
46
+ Requires-Dist: sphinx-rtd-theme>=1.0; extra == "docs"
47
+ Requires-Dist: myst-parser>=0.17; extra == "docs"
48
+ Dynamic: author
49
+ Dynamic: home-page
50
+ Dynamic: license-file
51
+ Dynamic: requires-python
52
+
53
+ # FTIR-Prep: FTIR Preprocessing Framework
54
+
55
+ [![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/)
56
+ [![Version](https://img.shields.io/badge/version-0.1.0-orange.svg)](https://github.com/lucas-mendonca-andrade/FTIR-Prep)
57
+
58
+ A modular and extensible framework for optimizing FTIR preprocessing pipelines for disease diagnosis.
59
+
60
+ ## 🚀 Features
61
+
62
+ - **Modular**: Component-based reusable architecture
63
+ - **Extensible**: Easy addition of new preprocessing techniques
64
+ - **Automatic Optimization**: Optuna integration for optimal preprocessing pipeline search
65
+ - **Robust Validation**: Support for group-based cross-validation
66
+ - **Configurable**: Flexible pipeline configuration system
67
+ - **Documented**: Complete documentation with practical examples
68
+
69
+ ## 📋 Supported Preprocessing Techniques
70
+
71
+ ### 🔧 Baseline Correction
72
+ - **Rubberband**: Automatic correction using rubberband algorithm
73
+ - **Polynomial**: Correction using configurable order polynomials (1-6)
74
+ - **Whittaker**: Penalized least squares smoothing with lambda parameter
75
+ - **ALS**: Asymmetric Least Squares with lambda and p parameters
76
+ - **ArPLS**: Adaptive reweighted penalized least squares
77
+ - **DrPLS**: Doubly reweighted penalized least squares
78
+ - **GCV Spline**: Generalized cross-validation spline smoothing
79
+ - **Gaussian Process**: Baseline correction using Gaussian processes
80
+
81
+ ### 📊 Normalization
82
+ - **Min-Max**: Individual Min-Max spectrum normalization
83
+ - **Vector**: L1, L2, or maximum normalization
84
+ - **Amide I**: Normalization based on amide I band peak (1600-1700 cm⁻¹)
85
+ - **Area**: Area under curve normalization
86
+
87
+ ### 🎯 Smoothing
88
+ - **Savitzky-Golay**: Polynomial filter with configurable parameters
89
+ - **Wavelets**: Denoising using Daubechies wavelets (db2, db3, db4)
90
+ - **Local Polynomial**: LOWESS smoothing with configurable bandwidth
91
+ - **Whittaker**: Penalized least squares smoothing
92
+ - **GCV Spline**: Generalized cross-validation spline smoothing
93
+ - **Flat**: Flat window convolution smoothing
94
+ - **Hanning**: Hanning window convolution smoothing
95
+
96
+ ### 📈 Derivatives
97
+ - **First Derivative**: First derivative calculation via Savitzky-Golay (order 1)
98
+ - **Second Derivative**: Second derivative calculation via Savitzky-Golay (order 2)
99
+
100
+ ### ✂️ Wavelength Truncation
101
+ - **Fingerprint Region**: Keep only fingerprint region (900-1800 cm⁻¹)
102
+ - **Fingerprint + Amide**: Keep fingerprint and amide regions (900-1800, 2800-3050 cm⁻¹)
103
+
104
+ ### 🔍 Model Explainability
105
+ - **SHAP Analysis**: Feature importance analysis using SHAP values
106
+
107
+ ## 🏗️ Architecture
108
+
109
+ ```
110
+ ftir_framework/
111
+ ├── core/ # Core functionalities
112
+ │ ├── pipeline.py # Preprocessing pipeline
113
+ │ ├── evaluator.py # Pipeline evaluation
114
+ │ └── explainer.py # SHAP explainability analysis
115
+ ├── preprocessing/ # Preprocessing techniques
116
+ │ ├── baseline.py # Baseline correction
117
+ │ ├── normalization.py # Normalization
118
+ │ ├── smoothing.py # Smoothing
119
+ │ ├── derivatives.py # Derivative calculation
120
+ │ └── truncation.py # Wavelength truncation
121
+ ├── optimization/ # Automatic optimization
122
+ │ └── optuna_optimizer.py # Optuna integration
123
+ ├── utils/ # Utilities
124
+ │ └── data_loader.py # Data loading
125
+ └── config/ # Configurations
126
+ └── settings.py # Default parameters
127
+ ```
128
+
129
+ ## 🚀 Installation
130
+
131
+ ### Requirements
132
+ - Python 3.8+
133
+ - pip (usually included with Python)
134
+
135
+ ### Installation via PyPI (Recommended - Simplest)
136
+
137
+
138
+ ```bash
139
+ pip install ftir-prep
140
+ ```
141
+
142
+
143
+
144
+ ## 📖 Basic Usage
145
+
146
+ ### 1. Data Loading
147
+ #### 1.1 Separates into groups to guarantee that data from the same patient will be in the same fold in a future classification task
148
+ ```python
149
+ from ftir_framework import FTIRDataLoader
150
+
151
+ # Load FTIR data
152
+ data_loader = FTIRDataLoader(
153
+ data_path="ftir_data.dat",
154
+ wavenumbers_path="wavenumbers.dat"
155
+ )
156
+
157
+ X, y, wavenumbers = data_loader.load_data()
158
+
159
+ # Create groups var that will be used in classification task to indicate that patient's data must be in the same fold
160
+ groups = data_loader.create_groups(instances_per_group=3)
161
+ ```
162
+
163
+ #### 1.2 Slices the data to use only one spectrum per patient. Data must be ordered by patient
164
+ ```python
165
+ from ftir_framework import FTIRDataLoader
166
+
167
+ # Load FTIR data
168
+ data_loader = FTIRDataLoader(
169
+ data_path="ftir_data.dat",
170
+ wavenumbers_path="wavenumbers.dat"
171
+ )
172
+
173
+ X, y, wavenumbers = data_loader.load_data(slice_size = 3) #use one of the triplicated spectra per patient
174
+ ```
175
+
176
+ ### 2. Pipeline Creation
177
+ ```python
178
+ from ftir_framework import FTIRPipeline, PipelineBuilder
179
+
180
+ # Using direct configuration
181
+ pipeline = FTIRPipeline()
182
+ pipeline.add_step('truncation', 'fingerprint_amide')
183
+ pipeline.add_step('baseline', 'polynomial', polynomial_order=2)
184
+ pipeline.add_step('normalization', 'vector')
185
+
186
+
187
+ # Using PipelineBuilder (Fluent API)
188
+ pipeline = (PipelineBuilder()
189
+ .add_truncation('fingerprint_amide')
190
+ .add_baseline('rubberband')
191
+ .add_normalization('minmax')
192
+ .add_smoothing('savgol', polyorder=2)
193
+ .add_derivative('savgol',order=1)
194
+ .build())
195
+ ```
196
+
197
+ ### 3. Execution and Evaluation
198
+ ```python
199
+ from ftir_framework import PipelineEvaluator
200
+
201
+ # Process data
202
+ X_processed, wavenumbers_processed = pipeline.process(X, wavenumbers)
203
+
204
+ # Evaluate pipeline
205
+ evaluator = PipelineEvaluator(classifier=None, # use default Random Forest
206
+ cv_method='StratifiedGroupKFold', # cross-validation strategy
207
+ cv_params={'n_splits': 3, #folds
208
+ 'shuffle': False,
209
+ 'random_state': 42})
210
+ results = evaluator.evaluate_pipeline(pipeline,
211
+ X, y,
212
+ groups, # groups var created previously
213
+ wavenumbers=wavenumbers
214
+ )
215
+
216
+ print(f"Accuracy: {results['mean_accuracy']:.4f} ± {results['std_accuracy']:.4f}")
217
+ ```
218
+
219
+ ### 4. Automatic Optimization
220
+ ```python
221
+ from ftir_framework import OptunaPipelineOptimizer
222
+
223
+ # Automatically optimize parameters
224
+ optimizer = OptunaPipelineOptimizer(X, y,
225
+ wavenumbers,
226
+ groups,
227
+ evaluator=evaluator, # previously configured PipelineEvaluator object
228
+ metric='f1_macro')
229
+ study = optimizer.optimize(n_trials=30)
230
+
231
+ best_pipeline = optimizer.best_pipeline
232
+ best_pipeline.save_pipeline("best_pipeline_found.json") # Saves the best pipeline found in a json file
233
+ print("Best pipeline saved to 'best_pipeline_found.json'")
234
+
235
+ # Save optimization metadata
236
+ metadata = optimizer.get_metadata()
237
+ metadata.to_csv("optimization_metadata.csv")
238
+
239
+ ```
240
+
241
+ ### 5. Model Explainability
242
+ ```python
243
+ from ftir_framework import FTIRExplainer
244
+
245
+ # Create explainer
246
+ explainer = FTIRExplainer(classifier=your_classifier)
247
+
248
+ # Analyze feature importance with SHAP
249
+ # It will save in output_dir a csv and a png with feature importance data
250
+ results = explainer.explain_model(
251
+ X_processed, y, groups,
252
+ split_method='stratified_group',
253
+ feature_names=wavenumbers_processed,
254
+ output_dir="shap_analysis"
255
+ )
256
+ ```
257
+
258
+ ## 🔬 Practical Examples
259
+
260
+ ### Pipeline Creation Examples
261
+ ```bash
262
+ # Direct configuration example
263
+ python3 examples/create_pipeline/direct_configuration.py
264
+
265
+ # PipelineBuilder (Fluent API) example
266
+ python3 examples/create_pipeline/pipeline_builder.py
267
+ ```
268
+
269
+ ### Pipeline Comparison Example
270
+ ```bash
271
+ # Compare different preprocessing strategies
272
+ python3 examples/compare_pipelines/compare_pipelines.py
273
+ ```
274
+
275
+ ### Pipeline Optimization Example
276
+ ```bash
277
+ # Automatic pipeline optimization
278
+ python3 examples/pipeline_search/pipeline_search.py
279
+ ```
280
+
281
+ ### Pipeline Loading Example
282
+ ```bash
283
+ # Load and use saved pipelines
284
+ python3 examples/read_pipeline_from_file/read_pipeline_file.py
285
+ ```
286
+
287
+ ### SHAP Explainability Example
288
+ ```bash
289
+ # Feature importance analysis with SHAP
290
+ python3 examples/explainer_analysis/explainer_example.py
291
+ ```
292
+
293
+ ## 🎯 Use Cases
294
+
295
+ ### Disease Diagnosis
296
+ - Analysis of FTIR spectra from biological samples
297
+ - Biomarker identification
298
+ - Automatic sample classification
299
+
300
+ ### Scientific Research
301
+ - Methodology comparison
302
+ - Protocol optimization
303
+ - Result validation
304
+
305
+
306
+ ## 📚 Documentation
307
+
308
+ - **Docstrings**: Complete inline documentation
309
+ - **Examples**: Functional example code
310
+
311
+ ## 👥 Authors
312
+
313
+ - **Lucas Mendonça** - *Initial development* - [GitHub](https://github.com/lucas-mendonca-andrade)
314
+
315
+
316
+ ⭐ If this project was useful to you, consider giving it a star on GitHub!
@@ -0,0 +1,264 @@
1
+ # FTIR-Prep: FTIR Preprocessing Framework
2
+
3
+ [![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/)
4
+ [![Version](https://img.shields.io/badge/version-0.1.0-orange.svg)](https://github.com/lucas-mendonca-andrade/FTIR-Prep)
5
+
6
+ A modular and extensible framework for optimizing FTIR preprocessing pipelines for disease diagnosis.
7
+
8
+ ## 🚀 Features
9
+
10
+ - **Modular**: Component-based reusable architecture
11
+ - **Extensible**: Easy addition of new preprocessing techniques
12
+ - **Automatic Optimization**: Optuna integration for optimal preprocessing pipeline search
13
+ - **Robust Validation**: Support for group-based cross-validation
14
+ - **Configurable**: Flexible pipeline configuration system
15
+ - **Documented**: Complete documentation with practical examples
16
+
17
+ ## 📋 Supported Preprocessing Techniques
18
+
19
+ ### 🔧 Baseline Correction
20
+ - **Rubberband**: Automatic correction using rubberband algorithm
21
+ - **Polynomial**: Correction using configurable order polynomials (1-6)
22
+ - **Whittaker**: Penalized least squares smoothing with lambda parameter
23
+ - **ALS**: Asymmetric Least Squares with lambda and p parameters
24
+ - **ArPLS**: Adaptive reweighted penalized least squares
25
+ - **DrPLS**: Doubly reweighted penalized least squares
26
+ - **GCV Spline**: Generalized cross-validation spline smoothing
27
+ - **Gaussian Process**: Baseline correction using Gaussian processes
28
+
29
+ ### 📊 Normalization
30
+ - **Min-Max**: Individual Min-Max spectrum normalization
31
+ - **Vector**: L1, L2, or maximum normalization
32
+ - **Amide I**: Normalization based on amide I band peak (1600-1700 cm⁻¹)
33
+ - **Area**: Area under curve normalization
34
+
35
+ ### 🎯 Smoothing
36
+ - **Savitzky-Golay**: Polynomial filter with configurable parameters
37
+ - **Wavelets**: Denoising using Daubechies wavelets (db2, db3, db4)
38
+ - **Local Polynomial**: LOWESS smoothing with configurable bandwidth
39
+ - **Whittaker**: Penalized least squares smoothing
40
+ - **GCV Spline**: Generalized cross-validation spline smoothing
41
+ - **Flat**: Flat window convolution smoothing
42
+ - **Hanning**: Hanning window convolution smoothing
43
+
44
+ ### 📈 Derivatives
45
+ - **First Derivative**: First derivative calculation via Savitzky-Golay (order 1)
46
+ - **Second Derivative**: Second derivative calculation via Savitzky-Golay (order 2)
47
+
48
+ ### ✂️ Wavelength Truncation
49
+ - **Fingerprint Region**: Keep only fingerprint region (900-1800 cm⁻¹)
50
+ - **Fingerprint + Amide**: Keep fingerprint and amide regions (900-1800, 2800-3050 cm⁻¹)
51
+
52
+ ### 🔍 Model Explainability
53
+ - **SHAP Analysis**: Feature importance analysis using SHAP values
54
+
55
+ ## 🏗️ Architecture
56
+
57
+ ```
58
+ ftir_framework/
59
+ ├── core/ # Core functionalities
60
+ │ ├── pipeline.py # Preprocessing pipeline
61
+ │ ├── evaluator.py # Pipeline evaluation
62
+ │ └── explainer.py # SHAP explainability analysis
63
+ ├── preprocessing/ # Preprocessing techniques
64
+ │ ├── baseline.py # Baseline correction
65
+ │ ├── normalization.py # Normalization
66
+ │ ├── smoothing.py # Smoothing
67
+ │ ├── derivatives.py # Derivative calculation
68
+ │ └── truncation.py # Wavelength truncation
69
+ ├── optimization/ # Automatic optimization
70
+ │ └── optuna_optimizer.py # Optuna integration
71
+ ├── utils/ # Utilities
72
+ │ └── data_loader.py # Data loading
73
+ └── config/ # Configurations
74
+ └── settings.py # Default parameters
75
+ ```
76
+
77
+ ## 🚀 Installation
78
+
79
+ ### Requirements
80
+ - Python 3.8+
81
+ - pip (usually included with Python)
82
+
83
+ ### Installation via PyPI (Recommended - Simplest)
84
+
85
+
86
+ ```bash
87
+ pip install ftir-prep
88
+ ```
89
+
90
+
91
+
92
+ ## 📖 Basic Usage
93
+
94
+ ### 1. Data Loading
95
+ #### 1.1 Separates into groups to guarantee that data from the same patient will be in the same fold in a future classification task
96
+ ```python
97
+ from ftir_framework import FTIRDataLoader
98
+
99
+ # Load FTIR data
100
+ data_loader = FTIRDataLoader(
101
+ data_path="ftir_data.dat",
102
+ wavenumbers_path="wavenumbers.dat"
103
+ )
104
+
105
+ X, y, wavenumbers = data_loader.load_data()
106
+
107
+ # Create groups var that will be used in classification task to indicate that patient's data must be in the same fold
108
+ groups = data_loader.create_groups(instances_per_group=3)
109
+ ```
110
+
111
+ #### 1.2 Slices the data to use only one spectrum per patient. Data must be ordered by patient
112
+ ```python
113
+ from ftir_framework import FTIRDataLoader
114
+
115
+ # Load FTIR data
116
+ data_loader = FTIRDataLoader(
117
+ data_path="ftir_data.dat",
118
+ wavenumbers_path="wavenumbers.dat"
119
+ )
120
+
121
+ X, y, wavenumbers = data_loader.load_data(slice_size = 3) #use one of the triplicated spectra per patient
122
+ ```
123
+
124
+ ### 2. Pipeline Creation
125
+ ```python
126
+ from ftir_framework import FTIRPipeline, PipelineBuilder
127
+
128
+ # Using direct configuration
129
+ pipeline = FTIRPipeline()
130
+ pipeline.add_step('truncation', 'fingerprint_amide')
131
+ pipeline.add_step('baseline', 'polynomial', polynomial_order=2)
132
+ pipeline.add_step('normalization', 'vector')
133
+
134
+
135
+ # Using PipelineBuilder (Fluent API)
136
+ pipeline = (PipelineBuilder()
137
+ .add_truncation('fingerprint_amide')
138
+ .add_baseline('rubberband')
139
+ .add_normalization('minmax')
140
+ .add_smoothing('savgol', polyorder=2)
141
+ .add_derivative('savgol',order=1)
142
+ .build())
143
+ ```
144
+
145
+ ### 3. Execution and Evaluation
146
+ ```python
147
+ from ftir_framework import PipelineEvaluator
148
+
149
+ # Process data
150
+ X_processed, wavenumbers_processed = pipeline.process(X, wavenumbers)
151
+
152
+ # Evaluate pipeline
153
+ evaluator = PipelineEvaluator(classifier=None, # use default Random Forest
154
+ cv_method='StratifiedGroupKFold', # cross-validation strategy
155
+ cv_params={'n_splits': 3, #folds
156
+ 'shuffle': False,
157
+ 'random_state': 42})
158
+ results = evaluator.evaluate_pipeline(pipeline,
159
+ X, y,
160
+ groups, # groups var created previously
161
+ wavenumbers=wavenumbers
162
+ )
163
+
164
+ print(f"Accuracy: {results['mean_accuracy']:.4f} ± {results['std_accuracy']:.4f}")
165
+ ```
166
+
167
+ ### 4. Automatic Optimization
168
+ ```python
169
+ from ftir_framework import OptunaPipelineOptimizer
170
+
171
+ # Automatically optimize parameters
172
+ optimizer = OptunaPipelineOptimizer(X, y,
173
+ wavenumbers,
174
+ groups,
175
+ evaluator=evaluator, # previously configured PipelineEvaluator object
176
+ metric='f1_macro')
177
+ study = optimizer.optimize(n_trials=30)
178
+
179
+ best_pipeline = optimizer.best_pipeline
180
+ best_pipeline.save_pipeline("best_pipeline_found.json") # Saves the best pipeline found in a json file
181
+ print("Best pipeline saved to 'best_pipeline_found.json'")
182
+
183
+ # Save optimization metadata
184
+ metadata = optimizer.get_metadata()
185
+ metadata.to_csv("optimization_metadata.csv")
186
+
187
+ ```
188
+
189
+ ### 5. Model Explainability
190
+ ```python
191
+ from ftir_framework import FTIRExplainer
192
+
193
+ # Create explainer
194
+ explainer = FTIRExplainer(classifier=your_classifier)
195
+
196
+ # Analyze feature importance with SHAP
197
+ # It will save in output_dir a csv and a png with feature importance data
198
+ results = explainer.explain_model(
199
+ X_processed, y, groups,
200
+ split_method='stratified_group',
201
+ feature_names=wavenumbers_processed,
202
+ output_dir="shap_analysis"
203
+ )
204
+ ```
205
+
206
+ ## 🔬 Practical Examples
207
+
208
+ ### Pipeline Creation Examples
209
+ ```bash
210
+ # Direct configuration example
211
+ python3 examples/create_pipeline/direct_configuration.py
212
+
213
+ # PipelineBuilder (Fluent API) example
214
+ python3 examples/create_pipeline/pipeline_builder.py
215
+ ```
216
+
217
+ ### Pipeline Comparison Example
218
+ ```bash
219
+ # Compare different preprocessing strategies
220
+ python3 examples/compare_pipelines/compare_pipelines.py
221
+ ```
222
+
223
+ ### Pipeline Optimization Example
224
+ ```bash
225
+ # Automatic pipeline optimization
226
+ python3 examples/pipeline_search/pipeline_search.py
227
+ ```
228
+
229
+ ### Pipeline Loading Example
230
+ ```bash
231
+ # Load and use saved pipelines
232
+ python3 examples/read_pipeline_from_file/read_pipeline_file.py
233
+ ```
234
+
235
+ ### SHAP Explainability Example
236
+ ```bash
237
+ # Feature importance analysis with SHAP
238
+ python3 examples/explainer_analysis/explainer_example.py
239
+ ```
240
+
241
+ ## 🎯 Use Cases
242
+
243
+ ### Disease Diagnosis
244
+ - Analysis of FTIR spectra from biological samples
245
+ - Biomarker identification
246
+ - Automatic sample classification
247
+
248
+ ### Scientific Research
249
+ - Methodology comparison
250
+ - Protocol optimization
251
+ - Result validation
252
+
253
+
254
+ ## 📚 Documentation
255
+
256
+ - **Docstrings**: Complete inline documentation
257
+ - **Examples**: Functional example code
258
+
259
+ ## 👥 Authors
260
+
261
+ - **Lucas Mendonça** - *Initial development* - [GitHub](https://github.com/lucas-mendonca-andrade)
262
+
263
+
264
+ ⭐ If this project was useful to you, consider giving it a star on GitHub!