spectral-model-explainer 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spectral_model_explainer-0.1.0/LICENSE +21 -0
- spectral_model_explainer-0.1.0/PKG-INFO +202 -0
- spectral_model_explainer-0.1.0/README.md +171 -0
- spectral_model_explainer-0.1.0/pyproject.toml +39 -0
- spectral_model_explainer-0.1.0/smx/__init__.py +68 -0
- spectral_model_explainer-0.1.0/smx/_version.py +1 -0
- spectral_model_explainer-0.1.0/smx/datasets/__init__.py +3 -0
- spectral_model_explainer-0.1.0/smx/datasets/synthetic.py +246 -0
- spectral_model_explainer-0.1.0/smx/graph/__init__.py +16 -0
- spectral_model_explainer-0.1.0/smx/graph/builder.py +196 -0
- spectral_model_explainer-0.1.0/smx/graph/centrality.py +123 -0
- spectral_model_explainer-0.1.0/smx/graph/interpretation.py +191 -0
- spectral_model_explainer-0.1.0/smx/pipeline.py +332 -0
- spectral_model_explainer-0.1.0/smx/plotting/__init__.py +3 -0
- spectral_model_explainer-0.1.0/smx/plotting/threshold.py +132 -0
- spectral_model_explainer-0.1.0/smx/predicates/__init__.py +11 -0
- spectral_model_explainer-0.1.0/smx/predicates/bagging.py +167 -0
- spectral_model_explainer-0.1.0/smx/predicates/generation.py +176 -0
- spectral_model_explainer-0.1.0/smx/predicates/metrics.py +537 -0
- spectral_model_explainer-0.1.0/smx/zones/__init__.py +4 -0
- spectral_model_explainer-0.1.0/smx/zones/aggregation.py +172 -0
- spectral_model_explainer-0.1.0/smx/zones/extraction.py +109 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 RIBEIRO JOSE VINICIUS
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: spectral-model-explainer
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A python library for spectral-zone-level explanations in machine learning models trained on spectral data (XRF, GRS, Raman, etc.)
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Keywords: spectroscopy,explainability,machine-learning,eXplainable-AI
|
|
8
|
+
Author: Ribeiro Jose Vinicius
|
|
9
|
+
Author-email: ribeirojosevinicius@gmail.com
|
|
10
|
+
Requires-Python: >=3.9
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
18
|
+
Provides-Extra: dev
|
|
19
|
+
Provides-Extra: plotting
|
|
20
|
+
Requires-Dist: networkx (>=3.0)
|
|
21
|
+
Requires-Dist: numpy (>=1.24)
|
|
22
|
+
Requires-Dist: pandas (>=2.0)
|
|
23
|
+
Requires-Dist: plotly (>=5.0) ; extra == "dev"
|
|
24
|
+
Requires-Dist: plotly (>=5.0) ; extra == "plotting"
|
|
25
|
+
Requires-Dist: pytest (>=7.0) ; extra == "dev"
|
|
26
|
+
Requires-Dist: scikit-learn (>=1.3)
|
|
27
|
+
Project-URL: Homepage, https://github.com/joseviniciusr/SMX
|
|
28
|
+
Project-URL: Repository, https://github.com/joseviniciusr/SMX
|
|
29
|
+
Description-Content-Type: text/markdown
|
|
30
|
+
|
|
31
|
+
# SMX
|
|
32
|
+
|
|
33
|
+

|
|
34
|
+
|
|
35
|
+
This is the official repository for the `spectral-model-explainer` (SMX) library, an eXplainable AI tool designed to provide explanations for machine learning models trained on spectral data (*e.g.*, XRF, GRS, Raman, and related modalities).
|
|
36
|
+
|
|
37
|
+
SMX is a post-hoc, global, model-agnostic framework that explains spectral-based ML classifiers directly in terms of expert-informed spectral zones. It aggregates each zone via PCA, formulates quantile-based logical predicates, estimates their relevance through perturbation experiments within stochastic subsamples, and integrates the results into a directed weighted graph whose global structure is summarized by Local Reaching Centrality. A distinctive feature is threshold spectrum reconstruction, which back-projects each predicate's decision boundary into the original spectral domain in natural measurement units, enabling practitioners to visually compare their spectra against the model-related boundaries.
|
|
38
|
+
|
|
39
|
+
## Method Overview in the Library
|
|
40
|
+
|
|
41
|
+
The high-level workflow is implemented in the `SMX` pipeline class and can also be executed component-by-component through the public API:
|
|
42
|
+
|
|
43
|
+
1. spectral zone extraction
|
|
44
|
+
2. zone aggregation (typically PCA-based)
|
|
45
|
+
3. predicate generation from quantiles
|
|
46
|
+
4. bagging-based robustness evaluation
|
|
47
|
+
5. predicate relevance scoring
|
|
48
|
+
6. directed graph construction
|
|
49
|
+
7. centrality-based ranking and optional mapping back to natural scale
|
|
50
|
+
|
|
51
|
+
This implementation allows both:
|
|
52
|
+
|
|
53
|
+
- end-to-end execution through a single pipeline object
|
|
54
|
+
- advanced control through direct use of dedicated classes/functions
|
|
55
|
+
|
|
56
|
+
## Spectral Zone Construction
|
|
57
|
+
|
|
58
|
+
The method starts by partitioning the spectral axis into zones using `extract_spectral_zones`. Input spectra are expected as a DataFrame in which columns represent numeric spectral positions (energies, wavelengths, channels, etc.).
|
|
59
|
+
|
|
60
|
+
### How zones must be provided
|
|
61
|
+
|
|
62
|
+
The `cuts` argument accepts multiple valid formats:
|
|
63
|
+
|
|
64
|
+
- `(start, end)`
|
|
65
|
+
- `(name, start, end)`
|
|
66
|
+
- `(name, start, end, group)`
|
|
67
|
+
- `{name, start, end}`
|
|
68
|
+
- `{name, start, end, group}`
|
|
69
|
+
|
|
70
|
+
Important behavior:
|
|
71
|
+
|
|
72
|
+
- boundaries are interpreted numerically and inclusively
|
|
73
|
+
- if `start > end`, the library automatically reorders them
|
|
74
|
+
- grouped cuts (same `group`) are concatenated into one merged zone
|
|
75
|
+
- non-grouped cuts are kept as independent zones
|
|
76
|
+
|
|
77
|
+
This flexibility enables both physically meaningful elemental regions and composite regions such as aggregated background segments.
|
|
78
|
+
|
|
79
|
+
## Predicate Construction from Zone Scores
|
|
80
|
+
|
|
81
|
+
After extraction, each zone is transformed into one scalar score per sample (default strategy: PC1 score via `ZoneAggregator(method="pca")`). These zone-level summaries are the basis for predicate generation.
|
|
82
|
+
|
|
83
|
+
`PredicateGenerator` creates binary threshold predicates from a user-defined set of quantiles. For each zone and each quantile value `q`, two complementary predicates are produced:
|
|
84
|
+
|
|
85
|
+
- `zone <= threshold(q)`
|
|
86
|
+
- `zone > threshold(q)`
|
|
87
|
+
|
|
88
|
+
Therefore, if `k` quantiles are provided, the initial candidate set is `2k` predicates per zone (before duplicate removal). Duplicate rules are automatically removed when quantiles collapse to identical threshold values.
|
|
89
|
+
|
|
90
|
+
## Bagging and Robustness Hyperparameters
|
|
91
|
+
|
|
92
|
+
SMX estimates predicate robustness through repeated bagging cycles. In the high-level pipeline, this is controlled primarily by:
|
|
93
|
+
|
|
94
|
+
- `n_bags`: number of bags generated per repetition (seed)
|
|
95
|
+
- `n_repetitions`: number of independent repetitions (seed loop)
|
|
96
|
+
- `n_samples_fraction`: fraction of samples drawn in each bag
|
|
97
|
+
- `quantiles`: quantile grid that defines predicate thresholds
|
|
98
|
+
|
|
99
|
+
Operationally:
|
|
100
|
+
|
|
101
|
+
- each repetition creates a new random context for bag generation
|
|
102
|
+
- each bag evaluates which predicates are sufficiently supported by sampled data
|
|
103
|
+
- predicates with very low support in a bag are discarded for that bag
|
|
104
|
+
- final rankings are aggregated across valid repetitions to reduce seed sensitivity
|
|
105
|
+
|
|
106
|
+
This design makes the explanation less dependent on a single random split and more representative of stable decision behavior.
|
|
107
|
+
|
|
108
|
+
## Predicate Relevance and Graph Construction
|
|
109
|
+
|
|
110
|
+
Within each bag, predicates are ranked by an importance metric based on perturbation experiments:
|
|
111
|
+
|
|
112
|
+
- perturbation-based relevance (`PerturbationMetric`), using a fitted estimator
|
|
113
|
+
|
|
114
|
+
`PredicateGraphBuilder` then constructs a directed graph from ranked predicates:
|
|
115
|
+
|
|
116
|
+
- consecutive predicates in a ranking induce directed edges
|
|
117
|
+
- edge weights are accumulated across bags
|
|
118
|
+
- terminal class nodes are linked from last predicates in each path
|
|
119
|
+
- bidirectional conflicts are resolved by keeping the stronger direction (ties are randomized)
|
|
120
|
+
- edge weighting can incorporate zone-level explained variance from PCA (`var_exp=True`), which constrains the graph structure to reflect both predictive relevance and variance importance of zones
|
|
121
|
+
|
|
122
|
+
Finally, the graph is summarized through Local Reaching Centrality (LRC), producing a ranked list of influential predicates/zones. Accordingly, the final output is a DataFrame with predicates ranked by their LRC scores, along with their corresponding natural-scale thresholds and zone information. This allows practitioners to identify which spectral zones and thresholds are most influential in the model's decision-making process, providing insights into the underlying spectral features driving predictions. Beyond identifying relevant zones, the predicate's threshold values themselves live in PCA space and are back-projected to the original domain as per-zone multivariate thresholds that can be overlaid on measured spectra, translating an abstract condition into a physically readable boundary. Thus, SMX goes beyond numerical importances by delivering condition-aware, subset-aware explanations that support validation, hypothesis generation, and more actionable domain conclusions.
|
|
123
|
+
|
|
124
|
+
## Model Compatibility Note
|
|
125
|
+
|
|
126
|
+
At the current stage, SMX is primarily designed for use with scikit-learn-style estimators. In practical terms, this means that when the perturbation-based relevance strategy is employed, the estimator passed to the pipeline is expected to be already fitted and to expose the standard prediction interface required by the selected perturbation metric.
|
|
127
|
+
|
|
128
|
+
More specifically, the minimum requirement is a valid `predict` method. In addition, some perturbation metrics require richer interfaces: `probability_shift` requires `predict_proba`, while `decision_function_shift` requires `decision_function`. Consequently, any model class that follows this contract can be integrated in a technically consistent manner, independently of the specific learning algorithm (for example, SVMs, tree ensembles, linear models, and related scikit-learn-compatible estimators).
|
|
129
|
+
|
|
130
|
+
Ongoing development is focused on extending this compatibility layer beyond the current scikit-learn-centric workflow, with the objective of supporting additional model ecosystems and API styles in Python while preserving methodological consistency and interpretability guarantees.
|
|
131
|
+
|
|
132
|
+
## Installation and Optional Plotting Dependency
|
|
133
|
+
|
|
134
|
+
SMX is intentionally distributed with a lightweight core dependency set, where visualization is treated as an optional capability rather than a mandatory runtime requirement. This design ensures that users interested exclusively in methodological analysis (zone extraction, predicate construction, bagging, graph construction, and centrality-based ranking) can install and execute the framework without incurring additional graphical dependencies.
|
|
135
|
+
|
|
136
|
+
Base installation:
|
|
137
|
+
|
|
138
|
+
```bash
|
|
139
|
+
pip install spectral-model-explainer
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
Installation with plotting support:
|
|
143
|
+
|
|
144
|
+
```bash
|
|
145
|
+
pip install "spectral-model-explainer[plotting]"
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
In practical terms, the plotting extra enables functions that generate interactive visual outputs (for example, threshold-spectrum overlays used to inspect reconstructed multivariate decision boundaries in the natural spectral domain). The analytical SMX pipeline remains fully functional without this extra.
|
|
149
|
+
|
|
150
|
+
If plotting routines are invoked in an environment where the plotting extra has not been installed, SMX raises an explicit import-related error with installation guidance. This behavior is intentional: it preserves minimal installation overhead for non-visual workflows while providing clear and immediate feedback when visualization features are requested.
|
|
151
|
+
|
|
152
|
+
## Easy Usage
|
|
153
|
+
|
|
154
|
+
```python
|
|
155
|
+
import pandas as pd
|
|
156
|
+
from sklearn.svm import SVC
|
|
157
|
+
from smx import SMX
|
|
158
|
+
|
|
159
|
+
# X_cal_prep: preprocessed calibration spectra (DataFrame)
|
|
160
|
+
# X_cal_natural: original calibration spectra before preprocessing (DataFrame)
|
|
161
|
+
# y_cal_labels: class labels for calibration samples (Series)
|
|
162
|
+
|
|
163
|
+
spectral_cuts = [
|
|
164
|
+
("F1", 1.0, 100.0),
|
|
165
|
+
("background", 100.0, 200.0, "background_group"),
|
|
166
|
+
("F2", 200.0, 300.0),
|
|
167
|
+
]
|
|
168
|
+
|
|
169
|
+
model = SVC(kernel="rbf", probability=True, random_state=42)
|
|
170
|
+
model.fit(X_cal_prep, y_cal_labels)
|
|
171
|
+
|
|
172
|
+
# Example: probability of the first class as continuous output
|
|
173
|
+
y_pred_cal = model.predict_proba(X_cal_prep)[:, 0]
|
|
174
|
+
|
|
175
|
+
smx = SMX(
|
|
176
|
+
spectral_cuts=spectral_cuts,
|
|
177
|
+
quantiles=[0.25, 0.50, 0.75],
|
|
178
|
+
n_repetitions=4,
|
|
179
|
+
n_bags=10,
|
|
180
|
+
n_samples_fraction=0.8,
|
|
181
|
+
replace=False,
|
|
182
|
+
metric="perturbation",
|
|
183
|
+
estimator=model,
|
|
184
|
+
perturbation_mode="median",
|
|
185
|
+
perturbation_metric="probability_shift",
|
|
186
|
+
)
|
|
187
|
+
|
|
188
|
+
smx.fit(X_cal_prep, y_pred_cal, X_cal_natural=X_cal_natural)
|
|
189
|
+
|
|
190
|
+
# Main result (ranked predicates with natural-scale thresholds)
|
|
191
|
+
results = smx.lrc_natural_
|
|
192
|
+
print(results.head())
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
For a complete, executable walkthrough with synthetic data and visualization outputs, see the quickstart notebook:
|
|
196
|
+
|
|
197
|
+
[examples/quickstart.ipynb](examples/quickstart.ipynb)
|
|
198
|
+
|
|
199
|
+
## License
|
|
200
|
+
|
|
201
|
+
This project is licensed under the MIT License. See the LICENSE file for details.
|
|
202
|
+
|
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
# SMX
|
|
2
|
+
|
|
3
|
+

|
|
4
|
+
|
|
5
|
+
This is the official repository for the `spectral-model-explainer` (SMX) library, an eXplainable AI tool designed to provide explanations for machine learning models trained on spectral data (*e.g.*, XRF, GRS, Raman, and related modalities).
|
|
6
|
+
|
|
7
|
+
SMX is a post-hoc, global, model-agnostic framework that explains spectral-based ML classifiers directly in terms of expert-informed spectral zones. It aggregates each zone via PCA, formulates quantile-based logical predicates, estimates their relevance through perturbation experiments within stochastic subsamples, and integrates the results into a directed weighted graph whose global structure is summarized by Local Reaching Centrality. A distinctive feature is threshold spectrum reconstruction, which back-projects each predicate's decision boundary into the original spectral domain in natural measurement units, enabling practitioners to visually compare their spectra against the model-related boundaries.
|
|
8
|
+
|
|
9
|
+
## Method Overview in the Library
|
|
10
|
+
|
|
11
|
+
The high-level workflow is implemented in the `SMX` pipeline class and can also be executed component-by-component through the public API:
|
|
12
|
+
|
|
13
|
+
1. spectral zone extraction
|
|
14
|
+
2. zone aggregation (typically PCA-based)
|
|
15
|
+
3. predicate generation from quantiles
|
|
16
|
+
4. bagging-based robustness evaluation
|
|
17
|
+
5. predicate relevance scoring
|
|
18
|
+
6. directed graph construction
|
|
19
|
+
7. centrality-based ranking and optional mapping back to natural scale
|
|
20
|
+
|
|
21
|
+
This implementation allows both:
|
|
22
|
+
|
|
23
|
+
- end-to-end execution through a single pipeline object
|
|
24
|
+
- advanced control through direct use of dedicated classes/functions
|
|
25
|
+
|
|
26
|
+
## Spectral Zone Construction
|
|
27
|
+
|
|
28
|
+
The method starts by partitioning the spectral axis into zones using `extract_spectral_zones`. Input spectra are expected as a DataFrame in which columns represent numeric spectral positions (energies, wavelengths, channels, etc.).
|
|
29
|
+
|
|
30
|
+
### How zones must be provided
|
|
31
|
+
|
|
32
|
+
The `cuts` argument accepts multiple valid formats:
|
|
33
|
+
|
|
34
|
+
- `(start, end)`
|
|
35
|
+
- `(name, start, end)`
|
|
36
|
+
- `(name, start, end, group)`
|
|
37
|
+
- `{name, start, end}`
|
|
38
|
+
- `{name, start, end, group}`
|
|
39
|
+
|
|
40
|
+
Important behavior:
|
|
41
|
+
|
|
42
|
+
- boundaries are interpreted numerically and inclusively
|
|
43
|
+
- if `start > end`, the library automatically reorders them
|
|
44
|
+
- grouped cuts (same `group`) are concatenated into one merged zone
|
|
45
|
+
- non-grouped cuts are kept as independent zones
|
|
46
|
+
|
|
47
|
+
This flexibility enables both physically meaningful elemental regions and composite regions such as aggregated background segments.
|
|
48
|
+
|
|
49
|
+
## Predicate Construction from Zone Scores
|
|
50
|
+
|
|
51
|
+
After extraction, each zone is transformed into one scalar score per sample (default strategy: PC1 score via `ZoneAggregator(method="pca")`). These zone-level summaries are the basis for predicate generation.
|
|
52
|
+
|
|
53
|
+
`PredicateGenerator` creates binary threshold predicates from a user-defined set of quantiles. For each zone and each quantile value `q`, two complementary predicates are produced:
|
|
54
|
+
|
|
55
|
+
- `zone <= threshold(q)`
|
|
56
|
+
- `zone > threshold(q)`
|
|
57
|
+
|
|
58
|
+
Therefore, if `k` quantiles are provided, the initial candidate set is `2k` predicates per zone (before duplicate removal). Duplicate rules are automatically removed when quantiles collapse to identical threshold values.
|
|
59
|
+
|
|
60
|
+
## Bagging and Robustness Hyperparameters
|
|
61
|
+
|
|
62
|
+
SMX estimates predicate robustness through repeated bagging cycles. In the high-level pipeline, this is controlled primarily by:
|
|
63
|
+
|
|
64
|
+
- `n_bags`: number of bags generated per repetition (seed)
|
|
65
|
+
- `n_repetitions`: number of independent repetitions (seed loop)
|
|
66
|
+
- `n_samples_fraction`: fraction of samples drawn in each bag
|
|
67
|
+
- `quantiles`: quantile grid that defines predicate thresholds
|
|
68
|
+
|
|
69
|
+
Operationally:
|
|
70
|
+
|
|
71
|
+
- each repetition creates a new random context for bag generation
|
|
72
|
+
- each bag evaluates which predicates are sufficiently supported by sampled data
|
|
73
|
+
- predicates with very low support in a bag are discarded for that bag
|
|
74
|
+
- final rankings are aggregated across valid repetitions to reduce seed sensitivity
|
|
75
|
+
|
|
76
|
+
This design makes the explanation less dependent on a single random split and more representative of stable decision behavior.
|
|
77
|
+
|
|
78
|
+
## Predicate Relevance and Graph Construction
|
|
79
|
+
|
|
80
|
+
Within each bag, predicates are ranked by an importance metric based on perturbation experiments:
|
|
81
|
+
|
|
82
|
+
- perturbation-based relevance (`PerturbationMetric`), using a fitted estimator
|
|
83
|
+
|
|
84
|
+
`PredicateGraphBuilder` then constructs a directed graph from ranked predicates:
|
|
85
|
+
|
|
86
|
+
- consecutive predicates in a ranking induce directed edges
|
|
87
|
+
- edge weights are accumulated across bags
|
|
88
|
+
- terminal class nodes are linked from last predicates in each path
|
|
89
|
+
- bidirectional conflicts are resolved by keeping the stronger direction (ties are randomized)
|
|
90
|
+
- edge weighting can incorporate zone-level explained variance from PCA (`var_exp=True`), which constrains the graph structure to reflect both predictive relevance and variance importance of zones
|
|
91
|
+
|
|
92
|
+
Finally, the graph is summarized through Local Reaching Centrality (LRC), producing a ranked list of influential predicates/zones. Accordingly, the final output is a DataFrame with predicates ranked by their LRC scores, along with their corresponding natural-scale thresholds and zone information. This allows practitioners to identify which spectral zones and thresholds are most influential in the model's decision-making process, providing insights into the underlying spectral features driving predictions. Beyond identifying relevant zones, the predicate's threshold values themselves live in PCA space and are back-projected to the original domain as per-zone multivariate thresholds that can be overlaid on measured spectra, translating an abstract condition into a physically readable boundary. Thus, SMX goes beyond numerical importances by delivering condition-aware, subset-aware explanations that support validation, hypothesis generation, and more actionable domain conclusions.
|
|
93
|
+
|
|
94
|
+
## Model Compatibility Note
|
|
95
|
+
|
|
96
|
+
At the current stage, SMX is primarily designed for use with scikit-learn-style estimators. In practical terms, this means that when the perturbation-based relevance strategy is employed, the estimator passed to the pipeline is expected to be already fitted and to expose the standard prediction interface required by the selected perturbation metric.
|
|
97
|
+
|
|
98
|
+
More specifically, the minimum requirement is a valid `predict` method. In addition, some perturbation metrics require richer interfaces: `probability_shift` requires `predict_proba`, while `decision_function_shift` requires `decision_function`. Consequently, any model class that follows this contract can be integrated in a technically consistent manner, independently of the specific learning algorithm (for example, SVMs, tree ensembles, linear models, and related scikit-learn-compatible estimators).
|
|
99
|
+
|
|
100
|
+
Ongoing development is focused on extending this compatibility layer beyond the current scikit-learn-centric workflow, with the objective of supporting additional model ecosystems and API styles in Python while preserving methodological consistency and interpretability guarantees.
|
|
101
|
+
|
|
102
|
+
## Installation and Optional Plotting Dependency
|
|
103
|
+
|
|
104
|
+
SMX is intentionally distributed with a lightweight core dependency set, where visualization is treated as an optional capability rather than a mandatory runtime requirement. This design ensures that users interested exclusively in methodological analysis (zone extraction, predicate construction, bagging, graph construction, and centrality-based ranking) can install and execute the framework without incurring additional graphical dependencies.
|
|
105
|
+
|
|
106
|
+
Base installation:
|
|
107
|
+
|
|
108
|
+
```bash
|
|
109
|
+
pip install spectral-model-explainer
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
Installation with plotting support:
|
|
113
|
+
|
|
114
|
+
```bash
|
|
115
|
+
pip install "spectral-model-explainer[plotting]"
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
In practical terms, the plotting extra enables functions that generate interactive visual outputs (for example, threshold-spectrum overlays used to inspect reconstructed multivariate decision boundaries in the natural spectral domain). The analytical SMX pipeline remains fully functional without this extra.
|
|
119
|
+
|
|
120
|
+
If plotting routines are invoked in an environment where the plotting extra has not been installed, SMX raises an explicit import-related error with installation guidance. This behavior is intentional: it preserves minimal installation overhead for non-visual workflows while providing clear and immediate feedback when visualization features are requested.
|
|
121
|
+
|
|
122
|
+
## Easy Usage
|
|
123
|
+
|
|
124
|
+
```python
|
|
125
|
+
import pandas as pd
|
|
126
|
+
from sklearn.svm import SVC
|
|
127
|
+
from smx import SMX
|
|
128
|
+
|
|
129
|
+
# X_cal_prep: preprocessed calibration spectra (DataFrame)
|
|
130
|
+
# X_cal_natural: original calibration spectra before preprocessing (DataFrame)
|
|
131
|
+
# y_cal_labels: class labels for calibration samples (Series)
|
|
132
|
+
|
|
133
|
+
spectral_cuts = [
|
|
134
|
+
("F1", 1.0, 100.0),
|
|
135
|
+
("background", 100.0, 200.0, "background_group"),
|
|
136
|
+
("F2", 200.0, 300.0),
|
|
137
|
+
]
|
|
138
|
+
|
|
139
|
+
model = SVC(kernel="rbf", probability=True, random_state=42)
|
|
140
|
+
model.fit(X_cal_prep, y_cal_labels)
|
|
141
|
+
|
|
142
|
+
# Example: probability of the first class as continuous output
|
|
143
|
+
y_pred_cal = model.predict_proba(X_cal_prep)[:, 0]
|
|
144
|
+
|
|
145
|
+
smx = SMX(
|
|
146
|
+
spectral_cuts=spectral_cuts,
|
|
147
|
+
quantiles=[0.25, 0.50, 0.75],
|
|
148
|
+
n_repetitions=4,
|
|
149
|
+
n_bags=10,
|
|
150
|
+
n_samples_fraction=0.8,
|
|
151
|
+
replace=False,
|
|
152
|
+
metric="perturbation",
|
|
153
|
+
estimator=model,
|
|
154
|
+
perturbation_mode="median",
|
|
155
|
+
perturbation_metric="probability_shift",
|
|
156
|
+
)
|
|
157
|
+
|
|
158
|
+
smx.fit(X_cal_prep, y_pred_cal, X_cal_natural=X_cal_natural)
|
|
159
|
+
|
|
160
|
+
# Main result (ranked predicates with natural-scale thresholds)
|
|
161
|
+
results = smx.lrc_natural_
|
|
162
|
+
print(results.head())
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
For a complete, executable walkthrough with synthetic data and visualization outputs, see the quickstart notebook:
|
|
166
|
+
|
|
167
|
+
[examples/quickstart.ipynb](examples/quickstart.ipynb)
|
|
168
|
+
|
|
169
|
+
## License
|
|
170
|
+
|
|
171
|
+
This project is licensed under the MIT License. See the LICENSE file for details.
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["poetry-core>=1.9.0"]
|
|
3
|
+
build-backend = "poetry.core.masonry.api"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "spectral-model-explainer"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "A python library for spectral-zone-level explanations in machine learning models trained on spectral data (XRF, GRS, Raman, etc.)"
|
|
9
|
+
authors = [
|
|
10
|
+
{ name = "Ribeiro Jose Vinicius", email = "ribeirojosevinicius@gmail.com" },
|
|
11
|
+
{ name = "Goncalves Rafael Figueira" },
|
|
12
|
+
{ name = "Barbon Junior Sylvio", email = "sylvio.barbonjunior@units.it" },
|
|
13
|
+
]
|
|
14
|
+
readme = "README.md"
|
|
15
|
+
requires-python = ">=3.9"
|
|
16
|
+
license = "MIT"
|
|
17
|
+
license-files = ["LICENSE"]
|
|
18
|
+
keywords = ["spectroscopy", "explainability", "machine-learning", "eXplainable-AI"]
|
|
19
|
+
|
|
20
|
+
dependencies = [
|
|
21
|
+
"numpy>=1.24",
|
|
22
|
+
"pandas>=2.0",
|
|
23
|
+
"scikit-learn>=1.3",
|
|
24
|
+
"networkx>=3.0",
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
[project.optional-dependencies]
|
|
28
|
+
plotting = ["plotly>=5.0"]
|
|
29
|
+
dev = [
|
|
30
|
+
"pytest>=7.0",
|
|
31
|
+
"plotly>=5.0",
|
|
32
|
+
]
|
|
33
|
+
|
|
34
|
+
[project.urls]
|
|
35
|
+
Homepage = "https://github.com/joseviniciusr/SMX"
|
|
36
|
+
Repository = "https://github.com/joseviniciusr/SMX"
|
|
37
|
+
|
|
38
|
+
[tool.poetry]
|
|
39
|
+
packages = [{ include = "smx" }]
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
"""
|
|
2
|
+
SMX — Spectral Model Explanation
|
|
3
|
+
=================================
|
|
4
|
+
Public API for the SMX core algorithm library.
|
|
5
|
+
|
|
6
|
+
Typical usage
|
|
7
|
+
-------------
|
|
8
|
+
>>> import smx
|
|
9
|
+
>>> zones = smx.extract_spectral_zones(Xcal, cuts)
|
|
10
|
+
>>> agg = smx.ZoneAggregator(method='pca')
|
|
11
|
+
>>> scores_df = agg.fit_transform(zones)
|
|
12
|
+
>>> gen = smx.PredicateGenerator(quantiles=[0.25, 0.5, 0.75])
|
|
13
|
+
>>> gen.fit(scores_df)
|
|
14
|
+
>>> bagger = smx.PredicateBagger()
|
|
15
|
+
>>> bags = bagger.run(scores_df, y_pred, gen.predicates_df_)
|
|
16
|
+
>>> metric = smx.CovarianceMetric(threshold=0.01)
|
|
17
|
+
>>> rankings = metric.compute(bags)
|
|
18
|
+
>>> builder = smx.PredicateGraphBuilder()
|
|
19
|
+
>>> graph = builder.build(bags, rankings, metric_column='Covariance')
|
|
20
|
+
>>> lrc_df = smx.compute_lrc(graph, gen.predicates_df_)
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from smx._version import __version__ # noqa: F401
|
|
24
|
+
|
|
25
|
+
from smx.pipeline import SMX
|
|
26
|
+
from smx.zones.extraction import extract_spectral_zones
|
|
27
|
+
from smx.zones.aggregation import ZoneAggregator
|
|
28
|
+
from smx.predicates.generation import PredicateGenerator
|
|
29
|
+
from smx.predicates.bagging import PredicateBagger
|
|
30
|
+
from smx.predicates.metrics import (
|
|
31
|
+
BasePredicateMetric,
|
|
32
|
+
CovarianceMetric,
|
|
33
|
+
PerturbationMetric,
|
|
34
|
+
)
|
|
35
|
+
from smx.graph.builder import PredicateGraphBuilder
|
|
36
|
+
from smx.graph.centrality import compute_lrc, aggregate_lrc_across_seeds
|
|
37
|
+
from smx.graph.interpretation import (
|
|
38
|
+
map_thresholds_to_natural,
|
|
39
|
+
reconstruct_threshold_to_spectrum,
|
|
40
|
+
extract_predicate_info,
|
|
41
|
+
)
|
|
42
|
+
from smx.datasets.synthetic import generate_synthetic_spectral_data
|
|
43
|
+
|
|
44
|
+
__all__ = [
|
|
45
|
+
"__version__",
|
|
46
|
+
# pipeline (high-level facade)
|
|
47
|
+
"SMX",
|
|
48
|
+
# zones
|
|
49
|
+
"extract_spectral_zones",
|
|
50
|
+
"ZoneAggregator",
|
|
51
|
+
# predicates
|
|
52
|
+
"PredicateGenerator",
|
|
53
|
+
"PredicateBagger",
|
|
54
|
+
# metrics
|
|
55
|
+
"BasePredicateMetric",
|
|
56
|
+
"CovarianceMetric",
|
|
57
|
+
"PerturbationMetric",
|
|
58
|
+
# graph
|
|
59
|
+
"PredicateGraphBuilder",
|
|
60
|
+
"compute_lrc",
|
|
61
|
+
"aggregate_lrc_across_seeds",
|
|
62
|
+
# interpretation
|
|
63
|
+
"map_thresholds_to_natural",
|
|
64
|
+
"reconstruct_threshold_to_spectrum",
|
|
65
|
+
"extract_predicate_info",
|
|
66
|
+
# datasets
|
|
67
|
+
"generate_synthetic_spectral_data",
|
|
68
|
+
]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.1.0"
|