easeai 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- easeai-0.1.0/LICENSE +21 -0
- easeai-0.1.0/PKG-INFO +138 -0
- easeai-0.1.0/README.md +106 -0
- easeai-0.1.0/easeai/__init__.py +4 -0
- easeai-0.1.0/easeai/data.py +71 -0
- easeai-0.1.0/easeai/explain.py +43 -0
- easeai-0.1.0/easeai/model.py +118 -0
- easeai-0.1.0/easeai/plotting.py +77 -0
- easeai-0.1.0/easeai/workflow.py +108 -0
- easeai-0.1.0/easeai.egg-info/PKG-INFO +138 -0
- easeai-0.1.0/easeai.egg-info/SOURCES.txt +16 -0
- easeai-0.1.0/easeai.egg-info/dependency_links.txt +1 -0
- easeai-0.1.0/easeai.egg-info/requires.txt +10 -0
- easeai-0.1.0/easeai.egg-info/top_level.txt +1 -0
- easeai-0.1.0/pyproject.toml +49 -0
- easeai-0.1.0/setup.cfg +4 -0
- easeai-0.1.0/tests/test_basic.py +26 -0
- easeai-0.1.0/tests/test_import.py +5 -0
easeai-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Ria
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
easeai-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: easeai
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Reusable XGBoost + SHAP workflow for tabular regression explainability
|
|
5
|
+
Author: Ria A. Martins
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/martinsria7/easeai
|
|
8
|
+
Project-URL: Repository, https://github.com/martinsria7/easeai
|
|
9
|
+
Project-URL: Issues, https://github.com/martinsria7/easeai/issues
|
|
10
|
+
Keywords: xgboost,shap,tabular,xai,regression
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Intended Audience :: Science/Research
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
19
|
+
Requires-Python: >=3.9
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
License-File: LICENSE
|
|
22
|
+
Requires-Dist: numpy>=1.24
|
|
23
|
+
Requires-Dist: pandas>=2.0
|
|
24
|
+
Requires-Dist: scikit-learn>=1.3
|
|
25
|
+
Requires-Dist: xgboost>=2.0
|
|
26
|
+
Requires-Dist: shap>=0.45
|
|
27
|
+
Requires-Dist: matplotlib>=3.8
|
|
28
|
+
Requires-Dist: Pillow>=10.0
|
|
29
|
+
Provides-Extra: dev
|
|
30
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
31
|
+
Dynamic: license-file
|
|
32
|
+
|
|
33
|
+
# EASEai
|
|
34
|
+
|
|
35
|
+
<p align="center">
|
|
36
|
+
<a href="https://www.python.org/"><img alt="Python" src="https://img.shields.io/badge/python-3.9%2B-blue.svg"></a>
|
|
37
|
+
<a href="./LICENSE"><img alt="License: MIT" src="https://img.shields.io/badge/license-MIT-green.svg"></a>
|
|
38
|
+
<img alt="Status" src="https://img.shields.io/badge/status-alpha-orange.svg">
|
|
39
|
+
<img alt="XGBoost" src="https://img.shields.io/badge/model-XGBoost-5C8A3D.svg">
|
|
40
|
+
<img alt="SHAP" src="https://img.shields.io/badge/explainability-SHAP-7A3E9D.svg">
|
|
41
|
+
</p>
|
|
42
|
+
|
|
43
|
+
<p align="center"><strong>Explainable AI for epidemiology and public health.</strong></p>
|
|
44
|
+
|
|
45
|
+
`easeai` is a lightweight Python package for **tabular regression explainability** built around a practical research workflow:
|
|
46
|
+
|
|
47
|
+
- XGBoost hyperparameter tuning
|
|
48
|
+
- iterative feature elimination based on XGBoost importance
|
|
49
|
+
- cross-validated RMSE and R²
|
|
50
|
+
- SHAP feature ranking
|
|
51
|
+
- partial dependence plots
|
|
52
|
+
- row-level dominant driver extraction
|
|
53
|
+
|
|
54
|
+
It was extracted from a county-level environmental health workflow, but the package is intentionally general so others can use it on any tabular regression dataset.
|
|
55
|
+
|
|
56
|
+
## Why use EASEai?
|
|
57
|
+
|
|
58
|
+
Many research notebooks mix together preprocessing, model tuning, feature selection, evaluation, and explainability in one place. `easeai` turns that into a reusable package for researchers who want:
|
|
59
|
+
|
|
60
|
+
- a quick XGBoost + SHAP baseline
|
|
61
|
+
- interpretable feature ranking
|
|
62
|
+
- reproducible artifact export
|
|
63
|
+
- a cleaner starting point for GitHub or publication-oriented workflows
|
|
64
|
+
|
|
65
|
+
## Installation
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
pip install -e .
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
Or after publication:
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
pip install easeai
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
## Minimal example
|
|
78
|
+
|
|
79
|
+
```python
|
|
80
|
+
import pandas as pd
|
|
81
|
+
import easeai as ea
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
df = pd.read_csv("Alzheimer_merged1.csv", encoding="ISO-8859-1")
|
|
85
|
+
|
|
86
|
+
workflow = ea.TabularXAIRegressor(
|
|
87
|
+
target="AD_PREV_MEAN",
|
|
88
|
+
drop_columns=["Counties", "FIPS"],
|
|
89
|
+
target_n_features=15,
|
|
90
|
+
)
|
|
91
|
+
|
|
92
|
+
workflow.fit(df)
|
|
93
|
+
results = workflow.summarize(id_column="FIPS", name_column="Counties")
|
|
94
|
+
|
|
95
|
+
print(results.selected_features)
|
|
96
|
+
print(results.metrics)
|
|
97
|
+
print(results.shap_importance.head())
|
|
98
|
+
|
|
99
|
+
results.top_drivers[["id", "name", "top_driver"]].to_csv("county_top_drivers.csv", index=False)
|
|
100
|
+
workflow.export_artifacts("artifacts")
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
## Package structure
|
|
104
|
+
|
|
105
|
+
```text
|
|
106
|
+
easeai/
|
|
107
|
+
data.py # preprocessing helpers
|
|
108
|
+
model.py # tuning, CV, recursive elimination
|
|
109
|
+
explain.py # SHAP summaries and top-driver extraction
|
|
110
|
+
plotting.py # SHAP and PDP export helpers
|
|
111
|
+
workflow.py # end-to-end workflow class
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
## Suggested GitHub topics
|
|
115
|
+
|
|
116
|
+
`xgboost`, `shap`, `explainable-ai`, `tabular-data`, `epidemiology`, `public-health`, `machine-learning`, `python`
|
|
117
|
+
|
|
118
|
+
## Roadmap
|
|
119
|
+
|
|
120
|
+
- add classification support
|
|
121
|
+
- add permutation importance
|
|
122
|
+
- add bootstrap confidence intervals
|
|
123
|
+
- add optional map-ready exports
|
|
124
|
+
- publish on PyPI
|
|
125
|
+
|
|
126
|
+
## Development
|
|
127
|
+
|
|
128
|
+
```bash
|
|
129
|
+
pytest
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
## License
|
|
133
|
+
|
|
134
|
+
MIT
|
|
135
|
+
|
|
136
|
+
## Author
|
|
137
|
+
|
|
138
|
+
Ria A. Martins
|
easeai-0.1.0/README.md
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
# EASEai
|
|
2
|
+
|
|
3
|
+
<p align="center">
|
|
4
|
+
<a href="https://www.python.org/"><img alt="Python" src="https://img.shields.io/badge/python-3.9%2B-blue.svg"></a>
|
|
5
|
+
<a href="./LICENSE"><img alt="License: MIT" src="https://img.shields.io/badge/license-MIT-green.svg"></a>
|
|
6
|
+
<img alt="Status" src="https://img.shields.io/badge/status-alpha-orange.svg">
|
|
7
|
+
<img alt="XGBoost" src="https://img.shields.io/badge/model-XGBoost-5C8A3D.svg">
|
|
8
|
+
<img alt="SHAP" src="https://img.shields.io/badge/explainability-SHAP-7A3E9D.svg">
|
|
9
|
+
</p>
|
|
10
|
+
|
|
11
|
+
<p align="center"><strong>Explainable AI for epidemiology and public health.</strong></p>
|
|
12
|
+
|
|
13
|
+
`easeai` is a lightweight Python package for **tabular regression explainability** built around a practical research workflow:
|
|
14
|
+
|
|
15
|
+
- XGBoost hyperparameter tuning
|
|
16
|
+
- iterative feature elimination based on XGBoost importance
|
|
17
|
+
- cross-validated RMSE and R²
|
|
18
|
+
- SHAP feature ranking
|
|
19
|
+
- partial dependence plots
|
|
20
|
+
- row-level dominant driver extraction
|
|
21
|
+
|
|
22
|
+
It was extracted from a county-level environmental health workflow, but the package is intentionally general so others can use it on any tabular regression dataset.
|
|
23
|
+
|
|
24
|
+
## Why use EASEai?
|
|
25
|
+
|
|
26
|
+
Many research notebooks mix together preprocessing, model tuning, feature selection, evaluation, and explainability in one place. `easeai` turns that into a reusable package for researchers who want:
|
|
27
|
+
|
|
28
|
+
- a quick XGBoost + SHAP baseline
|
|
29
|
+
- interpretable feature ranking
|
|
30
|
+
- reproducible artifact export
|
|
31
|
+
- a cleaner starting point for GitHub or publication-oriented workflows
|
|
32
|
+
|
|
33
|
+
## Installation
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
pip install -e .
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
Or after publication:
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
pip install easeai
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## Minimal example
|
|
46
|
+
|
|
47
|
+
```python
|
|
48
|
+
import pandas as pd
|
|
49
|
+
import easeai as ea
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
df = pd.read_csv("Alzheimer_merged1.csv", encoding="ISO-8859-1")
|
|
53
|
+
|
|
54
|
+
workflow = ea.TabularXAIRegressor(
|
|
55
|
+
target="AD_PREV_MEAN",
|
|
56
|
+
drop_columns=["Counties", "FIPS"],
|
|
57
|
+
target_n_features=15,
|
|
58
|
+
)
|
|
59
|
+
|
|
60
|
+
workflow.fit(df)
|
|
61
|
+
results = workflow.summarize(id_column="FIPS", name_column="Counties")
|
|
62
|
+
|
|
63
|
+
print(results.selected_features)
|
|
64
|
+
print(results.metrics)
|
|
65
|
+
print(results.shap_importance.head())
|
|
66
|
+
|
|
67
|
+
results.top_drivers[["id", "name", "top_driver"]].to_csv("county_top_drivers.csv", index=False)
|
|
68
|
+
workflow.export_artifacts("artifacts")
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
## Package structure
|
|
72
|
+
|
|
73
|
+
```text
|
|
74
|
+
easeai/
|
|
75
|
+
data.py # preprocessing helpers
|
|
76
|
+
model.py # tuning, CV, recursive elimination
|
|
77
|
+
explain.py # SHAP summaries and top-driver extraction
|
|
78
|
+
plotting.py # SHAP and PDP export helpers
|
|
79
|
+
workflow.py # end-to-end workflow class
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
## Suggested GitHub topics
|
|
83
|
+
|
|
84
|
+
`xgboost`, `shap`, `explainable-ai`, `tabular-data`, `epidemiology`, `public-health`, `machine-learning`, `python`
|
|
85
|
+
|
|
86
|
+
## Roadmap
|
|
87
|
+
|
|
88
|
+
- add classification support
|
|
89
|
+
- add permutation importance
|
|
90
|
+
- add bootstrap confidence intervals
|
|
91
|
+
- add optional map-ready exports
|
|
92
|
+
- publish on PyPI
|
|
93
|
+
|
|
94
|
+
## Development
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
pytest
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
## License
|
|
101
|
+
|
|
102
|
+
MIT
|
|
103
|
+
|
|
104
|
+
## Author
|
|
105
|
+
|
|
106
|
+
Ria A. Martins
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Iterable, Optional, Tuple
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import pandas as pd
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def preprocess_frame(
    df: pd.DataFrame,
    target: str,
    drop_columns: Optional[Iterable[str]] = None,
    fill_strategy: str = "median",
    coerce_numeric: bool = True,
    clean_column_names: bool = True,
) -> Tuple[pd.DataFrame, pd.Series]:
    """Prepare a tabular dataframe for XGBoost regression.

    Parameters
    ----------
    df:
        Input dataframe.
    target:
        Name of target column; converted with ``pd.to_numeric(errors="raise")``.
    drop_columns:
        Extra identifier or metadata columns removed from the feature matrix
        (silently ignored when absent from *df*).
    fill_strategy:
        One of {"median", "mean", "zero", "drop"}; "drop" removes rows with
        any missing feature value (and the matching target rows).
    coerce_numeric:
        Convert feature columns to numeric, coercing non-numeric values to NaN.
    clean_column_names:
        Replace non-alphanumeric character runs with underscores and strip
        leading/trailing underscores.

    Returns
    -------
    X, y

    Raises
    ------
    KeyError
        If *target* is not a column of *df*.
    ValueError
        For an unrecognized *fill_strategy*.
    """
    if target not in df.columns:
        raise KeyError(f"Target column '{target}' not found in dataframe.")

    extra_drops = [c for c in (drop_columns or []) if c in df.columns]
    y = pd.to_numeric(df[target], errors="raise")
    X = df.drop(columns=[target] + extra_drops).copy()

    if coerce_numeric:
        X = X.apply(pd.to_numeric, errors="coerce")
    # Infinities behave like missing values for every fill strategy below.
    X = X.replace([np.inf, -np.inf], np.nan)

    if fill_strategy == "drop":
        complete = X.notna().all(axis=1)
        X = X.loc[complete].copy()
        y = y.loc[complete].copy()
    elif fill_strategy == "median":
        X = X.fillna(X.median(numeric_only=True))
    elif fill_strategy == "mean":
        X = X.fillna(X.mean(numeric_only=True))
    elif fill_strategy == "zero":
        X = X.fillna(0)
    else:
        raise ValueError("fill_strategy must be one of: median, mean, zero, drop")

    if clean_column_names:
        # XGBoost rejects some punctuation in feature names; normalize them.
        cleaned = X.columns.str.replace(r"[^A-Za-z0-9_]+", "_", regex=True)
        X.columns = cleaned.str.strip("_")

    return X, y
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import pandas as pd
|
|
7
|
+
import shap
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def compute_shap_values(model, X: pd.DataFrame):
    """Build a SHAP TreeExplainer for *model* and evaluate it on *X*.

    Returns the explainer together with the raw SHAP value array
    (one row per sample of X, one column per feature).
    """
    tree_explainer = shap.TreeExplainer(model)
    values = tree_explainer.shap_values(X)
    return tree_explainer, values
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def shap_importance_table(shap_values, X: pd.DataFrame) -> pd.DataFrame:
    """Rank features by mean absolute SHAP value, descending.

    Returns a dataframe with columns ``feature`` and ``mean_abs_shap``,
    reindexed from 0.
    """
    mean_abs = np.abs(shap_values).mean(axis=0)
    table = pd.DataFrame({"feature": X.columns, "mean_abs_shap": mean_abs})
    table = table.sort_values("mean_abs_shap", ascending=False)
    return table.reset_index(drop=True)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def county_top_drivers(
    shap_values,
    X: pd.DataFrame,
    id_col: Optional[pd.Series] = None,
    name_col: Optional[pd.Series] = None,
) -> pd.DataFrame:
    """Per-row SHAP values plus the feature with the largest |SHAP| per row.

    Optional id/name columns are attached positionally — row order is
    assumed to match X. The ``top_driver`` column holds the name of the
    dominant feature for each row.
    """
    out = pd.DataFrame(shap_values, columns=X.columns)
    for label, series in (("id", id_col), ("name", name_col)):
        if series is not None:
            out[label] = list(series)
    out["top_driver"] = out[X.columns].abs().idxmax(axis=1)
    return out
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Dict, Iterable, List, Optional, Tuple
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
from sklearn.model_selection import KFold, RandomizedSearchCV, cross_val_score
|
|
7
|
+
from xgboost import XGBRegressor
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# Default search space for RandomizedSearchCV over XGBRegressor
# hyperparameters. Each entry is a discrete candidate list; the search
# samples n_iter combinations from the cartesian product.
DEFAULT_PARAM_DIST: Dict[str, Iterable] = {
    "n_estimators": [300, 500, 700, 900],
    "max_depth": [3, 4, 5, 6, 7],
    "learning_rate": [0.01, 0.03, 0.05, 0.1, 0.2],
    "subsample": [0.5, 0.7, 0.85, 1.0],
    "colsample_bytree": [0.7, 0.8, 0.9, 1.0],
    "min_child_weight": [1, 3, 5, 7, 10],
    "gamma": [0, 1, 2, 3, 5],
    "reg_alpha": [0, 0.1, 0.5, 1.0],  # L1 regularization candidates
    "reg_lambda": [1, 3, 5, 10],  # L2 regularization candidates
}
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def make_cv(n_splits: int = 5, shuffle: bool = True, random_state: int = 42) -> KFold:
    """Build the package's default KFold splitter.

    Fix: scikit-learn raises ValueError when a ``random_state`` is supplied
    while ``shuffle=False``, so the seed is only forwarded when shuffling is
    enabled. Behavior with the default ``shuffle=True`` is unchanged.
    """
    return KFold(
        n_splits=n_splits,
        shuffle=shuffle,
        random_state=random_state if shuffle else None,
    )
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def tune_xgb_regressor(
    X: pd.DataFrame,
    y: pd.Series,
    param_distributions: Optional[Dict[str, Iterable]] = None,
    n_iter: int = 30,
    cv: Optional[KFold] = None,
    scoring: str = "neg_root_mean_squared_error",
    random_state: int = 42,
    n_jobs: int = -1,
    tree_method: str = "hist",
    verbose: int = 0,
) -> Tuple[XGBRegressor, RandomizedSearchCV]:
    """Tune an XGBRegressor using randomized search.

    Parameters
    ----------
    X, y:
        Feature matrix and numeric target.
    param_distributions:
        Candidate hyperparameter lists; defaults to DEFAULT_PARAM_DIST.
    n_iter:
        Number of parameter settings sampled by the search.
    cv:
        Cross-validation splitter; defaults to ``make_cv(random_state=...)``.
    scoring:
        sklearn scoring string used to rank candidates.
    random_state, n_jobs, tree_method, verbose:
        Forwarded to the base model and/or the search object.

    Returns
    -------
    (best_estimator, fitted_search)
        Because ``refit=True``, the returned best estimator has been refit
        on all of (X, y).
    """
    cv = cv or make_cv(random_state=random_state)
    base_model = XGBRegressor(
        objective="reg:squarederror",
        random_state=random_state,
        n_jobs=n_jobs,
        tree_method=tree_method,
    )

    search = RandomizedSearchCV(
        estimator=base_model,
        param_distributions=param_distributions or DEFAULT_PARAM_DIST,
        n_iter=n_iter,
        scoring=scoring,
        cv=cv,
        random_state=random_state,
        n_jobs=n_jobs,
        verbose=verbose,
        refit=True,  # refit the best candidate on the full data
    )
    search.fit(X, y)
    return search.best_estimator_, search
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def rfe_xgb(
    model: XGBRegressor,
    X: pd.DataFrame,
    y: pd.Series,
    target_n: int = 15,
    drop_frac: float = 0.10,
) -> List[str]:
    """Iteratively drop the least important features until target_n remain.

    Each round refits *model* on the surviving features and removes the
    lowest-importance ``drop_frac`` fraction (at least one feature per
    round). Note that *model* is refit in place as a side effect.

    Fix: the per-round drop count is now clamped so the loop can never
    overshoot and return fewer than *target_n* features (previously e.g.
    5 features with target_n=4 and drop_frac=0.5 returned only 3).

    Parameters
    ----------
    model:
        Estimator exposing ``fit()`` and ``feature_importances_``.
    X, y:
        Feature matrix and target.
    target_n:
        Number of features to keep (>= 1).
    drop_frac:
        Fraction of surviving features removed per round, in (0, 1).

    Returns
    -------
    The surviving feature names, in original column order.
    """
    if target_n < 1:
        raise ValueError("target_n must be >= 1")
    if not 0 < drop_frac < 1:
        raise ValueError("drop_frac must be between 0 and 1")
    if target_n > X.shape[1]:
        raise ValueError("target_n cannot exceed number of features")

    features = list(X.columns)
    while len(features) > target_n:
        model.fit(X[features], y)
        imp = pd.Series(model.feature_importances_, index=features).sort_values()
        drop_n = max(1, int(len(features) * drop_frac))
        # Never drop past the target count in the final round.
        drop_n = min(drop_n, len(features) - target_n)
        dropped = set(imp.index[:drop_n])
        features = [f for f in features if f not in dropped]
    return features
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def evaluate_regression_cv(
    model: XGBRegressor,
    X: pd.DataFrame,
    y: pd.Series,
    cv: Optional[KFold] = None,
    n_jobs: int = -1,
) -> Dict[str, float]:
    """Return mean and SD for RMSE and R^2 across CV folds.

    ``cross_val_score`` fits a clone of *model* per fold, so the passed
    instance itself is not modified here. Result keys: ``rmse_mean``,
    ``rmse_sd``, ``r2_mean``, ``r2_sd``.
    """
    cv = cv or make_cv()
    # sklearn returns RMSE negated ("greater is better" convention);
    # negate back so the reported values are positive errors.
    rmse_scores = -cross_val_score(
        model,
        X,
        y,
        scoring="neg_root_mean_squared_error",
        cv=cv,
        n_jobs=n_jobs,
    )
    r2_scores = cross_val_score(
        model,
        X,
        y,
        scoring="r2",
        cv=cv,
        n_jobs=n_jobs,
    )
    return {
        "rmse_mean": float(rmse_scores.mean()),
        "rmse_sd": float(rmse_scores.std()),
        "r2_mean": float(r2_scores.mean()),
        "r2_sd": float(r2_scores.std()),
    }
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Iterable, Optional
|
|
5
|
+
|
|
6
|
+
import matplotlib.pyplot as plt
|
|
7
|
+
import pandas as pd
|
|
8
|
+
import shap
|
|
9
|
+
from sklearn.inspection import PartialDependenceDisplay
|
|
10
|
+
from PIL import Image
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def save_shap_summary(X: pd.DataFrame, shap_values, outpath: str, plot_type: Optional[str] = None, dpi: int = 300):
    """Render a SHAP summary plot and save it to *outpath*.

    ``plot_type=None`` gives the default beeswarm; ``plot_type="bar"``
    gives the mean-|SHAP| bar chart. The figure is closed after saving
    so repeated calls do not accumulate open figures.
    """
    plt.figure()
    shap.summary_plot(shap_values, X, plot_type=plot_type, show=False)
    plt.tight_layout()
    plt.savefig(outpath, dpi=dpi, bbox_inches="tight")
    plt.close()
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def save_pdp_plots(model, X: pd.DataFrame, features: Iterable[str], outdir: str, grid_resolution: int = 60):
    """Save one partial-dependence PNG per feature into *outdir*.

    Fix: the output filename previously only replaced spaces, so a feature
    name containing a path separator (or other special characters) broke
    the save path; names are now fully sanitized to [A-Za-z0-9_-].

    Returns the list of file paths written, in input order.
    """
    out_root = Path(outdir)
    out_root.mkdir(parents=True, exist_ok=True)
    saved = []
    for feature in features:
        fig, ax = plt.subplots(figsize=(5.5, 4.0))
        PartialDependenceDisplay.from_estimator(
            model,
            X,
            features=[feature],
            kind="average",
            grid_resolution=grid_resolution,
            percentiles=(0.01, 0.99),  # trim extreme tails from the grid
            ax=ax,
        )
        ax.set_title(f"PDP: {feature}")
        # Keep only filesystem-safe characters in the filename stem.
        stem = "".join(
            ch if ch.isalnum() or ch in "_-" else "_" for ch in str(feature)[:40]
        )
        outpath = out_root / f"pdp_{stem}.png"
        plt.tight_layout()
        plt.savefig(outpath, dpi=300, bbox_inches="tight")
        plt.close(fig)
        saved.append(str(outpath))
    return saved
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def save_dependence_panel(X: pd.DataFrame, shap_values, top_vars: Iterable[str], outpath: str):
    """Save a grid of SHAP dependence plots, one panel per variable.

    Fix: the grid was hard-coded to 2x3 and raised IndexError for more
    than six variables. The grid now grows row-wise as needed; for up to
    six variables the layout (2x3, 12x9 inches) is identical to before.
    """
    top_vars = list(top_vars)
    ncols = 3
    nrows = max(2, -(-len(top_vars) // ncols))  # ceil division, min 2 rows
    fig, axes = plt.subplots(nrows, ncols, figsize=(12, 4.5 * nrows))
    axes = axes.ravel()
    for i, var in enumerate(top_vars):
        plt.sca(axes[i])
        shap.dependence_plot(
            ind=var,
            shap_values=shap_values,
            features=X,
            interaction_index="auto",
            show=False,
            ax=axes[i],
        )
        axes[i].set_title(str(var))
    plt.tight_layout()
    plt.savefig(outpath, dpi=300, bbox_inches="tight")
    plt.close()
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def combine_images_side_by_side(left_path: str, right_path: str, outpath: str):
    """Concatenate two images horizontally on white and save to *outpath*.

    Both inputs are scaled to the taller image's height, preserving
    aspect ratio. Fix: the source images are now opened via context
    managers so their file handles are released (previously leaked).
    """
    with Image.open(left_path) as left_src, Image.open(right_path) as right_src:
        target_h = max(left_src.height, right_src.height)
        left = left_src.resize(
            (int(left_src.width * target_h / left_src.height), target_h),
            Image.Resampling.LANCZOS,
        )
        right = right_src.resize(
            (int(right_src.width * target_h / right_src.height), target_h),
            Image.Resampling.LANCZOS,
        )
    combined = Image.new("RGB", (left.width + right.width, target_h), (255, 255, 255))
    combined.paste(left, (0, 0))
    combined.paste(right, (left.width, 0))
    combined.save(outpath)
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Iterable, List, Optional
|
|
6
|
+
|
|
7
|
+
import pandas as pd
|
|
8
|
+
from xgboost import XGBRegressor
|
|
9
|
+
|
|
10
|
+
from .data import preprocess_frame
|
|
11
|
+
from .explain import compute_shap_values, county_top_drivers, shap_importance_table
|
|
12
|
+
from .model import evaluate_regression_cv, make_cv, rfe_xgb, tune_xgb_regressor
|
|
13
|
+
from .plotting import save_pdp_plots, save_shap_summary
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass
class WorkflowResults:
    """Bundle of outputs produced by TabularXAIRegressor.summarize()."""

    # Feature names surviving recursive elimination, in original order.
    selected_features: List[str]
    # Cross-validated metrics: rmse_mean / rmse_sd / r2_mean / r2_sd.
    metrics: dict
    # Features ranked by mean |SHAP|, descending.
    shap_importance: pd.DataFrame
    # Per-row SHAP values plus the dominant feature per row
    # (and optional id/name columns).
    top_drivers: pd.DataFrame
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class TabularXAIRegressor:
    """End-to-end workflow for tabular regression with XGBoost + SHAP.

    ``fit()`` runs: preprocessing -> randomized hyperparameter search ->
    recursive feature elimination -> final model refit -> SHAP values.
    ``summarize()`` and ``export_artifacts()`` then report and serialize
    the results.
    """

    def __init__(
        self,
        target: str,
        drop_columns: Optional[Iterable[str]] = None,
        target_n_features: int = 15,
        random_state: int = 42,
        n_iter_search: int = 15,
    ):
        """Store configuration; no work happens until fit().

        Parameters
        ----------
        target:
            Name of the target column in the dataframe passed to fit().
        drop_columns:
            Identifier/metadata columns excluded from the feature matrix.
        target_n_features:
            Number of features kept by recursive elimination.
        random_state:
            Seed shared by the CV splitter, the search, and the final model.
        n_iter_search:
            Parameter settings sampled by RandomizedSearchCV.
        """
        self.target = target
        self.drop_columns = list(drop_columns or [])
        self.target_n_features = target_n_features
        self.random_state = random_state
        self.n_iter_search = n_iter_search
        # Fitted state, populated by fit(); the trailing underscore
        # follows the sklearn convention for learned attributes.
        self.search_ = None
        self.best_model_: Optional[XGBRegressor] = None
        self.final_model_: Optional[XGBRegressor] = None
        self.selected_features_: Optional[List[str]] = None
        self.explainer_ = None
        self.shap_values_ = None
        self.X_selected_: Optional[pd.DataFrame] = None

    def fit(self, df: pd.DataFrame) -> "TabularXAIRegressor":
        """Run the full tune/select/refit/explain pipeline on *df*."""
        X, y = preprocess_frame(
            df,
            target=self.target,
            drop_columns=self.drop_columns,
        )
        cv = make_cv(random_state=self.random_state)
        # Stage 1: randomized hyperparameter search on all features.
        self.best_model_, self.search_ = tune_xgb_regressor(
            X,
            y,
            cv=cv,
            random_state=self.random_state,
            n_iter=self.n_iter_search,
        )
        # Stage 2: recursive elimination down to target_n_features
        # (refits best_model_ in place as a side effect).
        self.selected_features_ = rfe_xgb(
            self.best_model_,
            X,
            y,
            target_n=self.target_n_features,
        )
        # Stage 3: refit a fresh model with the tuned parameters on the
        # selected features only.
        self.final_model_ = XGBRegressor(
            **self.search_.best_params_,
            objective="reg:squarederror",
            random_state=self.random_state,
            n_jobs=-1,
            tree_method="hist",
        )
        self.X_selected_ = X[self.selected_features_].copy()
        self.final_model_.fit(self.X_selected_, y)
        # Stage 4: SHAP values for the final model on the selected features.
        self.explainer_, self.shap_values_ = compute_shap_values(self.final_model_, self.X_selected_)
        self.y_ = y
        # Kept so summarize() can attach id/name columns positionally.
        self.original_df_ = df.copy()
        return self

    def summarize(self, id_column: Optional[str] = None, name_column: Optional[str] = None) -> WorkflowResults:
        """Return CV metrics, SHAP ranking, and per-row top drivers.

        id_column/name_column are attached to the top-driver table only
        when present in the originally fitted dataframe; unknown names
        are silently ignored.
        """
        if self.final_model_ is None or self.X_selected_ is None or self.selected_features_ is None:
            raise RuntimeError("Call fit() before summarize().")
        metrics = evaluate_regression_cv(self.final_model_, self.X_selected_, self.y_)
        shap_rank = shap_importance_table(self.shap_values_, self.X_selected_)
        top_drivers = county_top_drivers(
            self.shap_values_,
            self.X_selected_,
            id_col=self.original_df_[id_column] if id_column and id_column in self.original_df_.columns else None,
            name_col=self.original_df_[name_column] if name_column and name_column in self.original_df_.columns else None,
        )
        return WorkflowResults(
            selected_features=self.selected_features_,
            metrics=metrics,
            shap_importance=shap_rank,
            top_drivers=top_drivers,
        )

    def export_artifacts(self, outdir: str, top_k_pdp: int = 6) -> None:
        """Write SHAP summary plots and PDPs into *outdir* (created if missing).

        PDPs cover the first *top_k_pdp* selected features in selection
        order — NOTE: not in SHAP-importance order.
        """
        if self.X_selected_ is None:
            raise RuntimeError("Call fit() before export_artifacts().")
        outdir = Path(outdir)
        outdir.mkdir(parents=True, exist_ok=True)
        save_shap_summary(self.X_selected_, self.shap_values_, str(outdir / "shap_beeswarm.png"))
        save_shap_summary(self.X_selected_, self.shap_values_, str(outdir / "shap_bar.png"), plot_type="bar")
        top_vars = list(self.X_selected_.columns[:top_k_pdp])
        save_pdp_plots(self.final_model_, self.X_selected_, top_vars, str(outdir / "pdp_plots"))
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: easeai
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Reusable XGBoost + SHAP workflow for tabular regression explainability
|
|
5
|
+
Author: Ria A. Martins
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/martinsria7/easeai
|
|
8
|
+
Project-URL: Repository, https://github.com/martinsria7/easeai
|
|
9
|
+
Project-URL: Issues, https://github.com/martinsria7/easeai/issues
|
|
10
|
+
Keywords: xgboost,shap,tabular,xai,regression
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Intended Audience :: Science/Research
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
19
|
+
Requires-Python: >=3.9
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
License-File: LICENSE
|
|
22
|
+
Requires-Dist: numpy>=1.24
|
|
23
|
+
Requires-Dist: pandas>=2.0
|
|
24
|
+
Requires-Dist: scikit-learn>=1.3
|
|
25
|
+
Requires-Dist: xgboost>=2.0
|
|
26
|
+
Requires-Dist: shap>=0.45
|
|
27
|
+
Requires-Dist: matplotlib>=3.8
|
|
28
|
+
Requires-Dist: Pillow>=10.0
|
|
29
|
+
Provides-Extra: dev
|
|
30
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
31
|
+
Dynamic: license-file
|
|
32
|
+
|
|
33
|
+
# EASEai
|
|
34
|
+
|
|
35
|
+
<p align="center">
|
|
36
|
+
<a href="https://www.python.org/"><img alt="Python" src="https://img.shields.io/badge/python-3.9%2B-blue.svg"></a>
|
|
37
|
+
<a href="./LICENSE"><img alt="License: MIT" src="https://img.shields.io/badge/license-MIT-green.svg"></a>
|
|
38
|
+
<img alt="Status" src="https://img.shields.io/badge/status-alpha-orange.svg">
|
|
39
|
+
<img alt="XGBoost" src="https://img.shields.io/badge/model-XGBoost-5C8A3D.svg">
|
|
40
|
+
<img alt="SHAP" src="https://img.shields.io/badge/explainability-SHAP-7A3E9D.svg">
|
|
41
|
+
</p>
|
|
42
|
+
|
|
43
|
+
<p align="center"><strong>Explainable AI for epidemiology and public health.</strong></p>
|
|
44
|
+
|
|
45
|
+
`easeai` is a lightweight Python package for **tabular regression explainability** built around a practical research workflow:
|
|
46
|
+
|
|
47
|
+
- XGBoost hyperparameter tuning
|
|
48
|
+
- iterative feature elimination based on XGBoost importance
|
|
49
|
+
- cross-validated RMSE and R²
|
|
50
|
+
- SHAP feature ranking
|
|
51
|
+
- partial dependence plots
|
|
52
|
+
- row-level dominant driver extraction
|
|
53
|
+
|
|
54
|
+
It was extracted from a county-level environmental health workflow, but the package is intentionally general so others can use it on any tabular regression dataset.
|
|
55
|
+
|
|
56
|
+
## Why use EASEai?
|
|
57
|
+
|
|
58
|
+
Many research notebooks mix together preprocessing, model tuning, feature selection, evaluation, and explainability in one place. `easeai` turns that into a reusable package for researchers who want:
|
|
59
|
+
|
|
60
|
+
- a quick XGBoost + SHAP baseline
|
|
61
|
+
- interpretable feature ranking
|
|
62
|
+
- reproducible artifact export
|
|
63
|
+
- a cleaner starting point for GitHub or publication-oriented workflows
|
|
64
|
+
|
|
65
|
+
## Installation
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
pip install -e .
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
Or after publication:
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
pip install easeai
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
## Minimal example
|
|
78
|
+
|
|
79
|
+
```python
|
|
80
|
+
import pandas as pd
|
|
81
|
+
import easeai as ea
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
df = pd.read_csv("Alzheimer_merged1.csv", encoding="ISO-8859-1")
|
|
85
|
+
|
|
86
|
+
workflow = ea.TabularXAIRegressor(
|
|
87
|
+
target="AD_PREV_MEAN",
|
|
88
|
+
drop_columns=["Counties", "FIPS"],
|
|
89
|
+
target_n_features=15,
|
|
90
|
+
)
|
|
91
|
+
|
|
92
|
+
workflow.fit(df)
|
|
93
|
+
results = workflow.summarize(id_column="FIPS", name_column="Counties")
|
|
94
|
+
|
|
95
|
+
print(results.selected_features)
|
|
96
|
+
print(results.metrics)
|
|
97
|
+
print(results.shap_importance.head())
|
|
98
|
+
|
|
99
|
+
results.top_drivers[["id", "name", "top_driver"]].to_csv("county_top_drivers.csv", index=False)
|
|
100
|
+
workflow.export_artifacts("artifacts")
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
## Package structure
|
|
104
|
+
|
|
105
|
+
```text
|
|
106
|
+
easeai/
|
|
107
|
+
data.py # preprocessing helpers
|
|
108
|
+
model.py # tuning, CV, recursive elimination
|
|
109
|
+
explain.py # SHAP summaries and top-driver extraction
|
|
110
|
+
plotting.py # SHAP and PDP export helpers
|
|
111
|
+
workflow.py # end-to-end workflow class
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
## Suggested GitHub topics
|
|
115
|
+
|
|
116
|
+
`xgboost`, `shap`, `explainable-ai`, `tabular-data`, `epidemiology`, `public-health`, `machine-learning`, `python`
|
|
117
|
+
|
|
118
|
+
## Roadmap
|
|
119
|
+
|
|
120
|
+
- add classification support
|
|
121
|
+
- add permutation importance
|
|
122
|
+
- add bootstrap confidence intervals
|
|
123
|
+
- add optional map-ready exports
|
|
124
|
+
- publish on PyPI
|
|
125
|
+
|
|
126
|
+
## Development
|
|
127
|
+
|
|
128
|
+
```bash
|
|
129
|
+
pytest
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
## License
|
|
133
|
+
|
|
134
|
+
MIT
|
|
135
|
+
|
|
136
|
+
## Author
|
|
137
|
+
|
|
138
|
+
Ria A. Martins
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
easeai/__init__.py
|
|
5
|
+
easeai/data.py
|
|
6
|
+
easeai/explain.py
|
|
7
|
+
easeai/model.py
|
|
8
|
+
easeai/plotting.py
|
|
9
|
+
easeai/workflow.py
|
|
10
|
+
easeai.egg-info/PKG-INFO
|
|
11
|
+
easeai.egg-info/SOURCES.txt
|
|
12
|
+
easeai.egg-info/dependency_links.txt
|
|
13
|
+
easeai.egg-info/requires.txt
|
|
14
|
+
easeai.egg-info/top_level.txt
|
|
15
|
+
tests/test_basic.py
|
|
16
|
+
tests/test_import.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
easeai
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "easeai"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Reusable XGBoost + SHAP workflow for tabular regression explainability"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
license = {text = "MIT"}
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "Ria A. Martins"}
|
|
14
|
+
]
|
|
15
|
+
dependencies = [
|
|
16
|
+
"numpy>=1.24",
|
|
17
|
+
"pandas>=2.0",
|
|
18
|
+
"scikit-learn>=1.3",
|
|
19
|
+
"xgboost>=2.0",
|
|
20
|
+
"shap>=0.45",
|
|
21
|
+
"matplotlib>=3.8",
|
|
22
|
+
"Pillow>=10.0",
|
|
23
|
+
]
|
|
24
|
+
keywords = ["xgboost", "shap", "tabular", "xai", "regression"]
|
|
25
|
+
classifiers = [
|
|
26
|
+
"Programming Language :: Python :: 3",
|
|
27
|
+
"Programming Language :: Python :: 3.9",
|
|
28
|
+
"Programming Language :: Python :: 3.10",
|
|
29
|
+
"Programming Language :: Python :: 3.11",
|
|
30
|
+
"License :: OSI Approved :: MIT License",
|
|
31
|
+
"Operating System :: OS Independent",
|
|
32
|
+
"Intended Audience :: Science/Research",
|
|
33
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
34
|
+
]
|
|
35
|
+
[project.urls]
|
|
36
|
+
Homepage = "https://github.com/martinsria7/easeai"
|
|
37
|
+
Repository = "https://github.com/martinsria7/easeai"
|
|
38
|
+
Issues = "https://github.com/martinsria7/easeai/issues"
|
|
39
|
+
|
|
40
|
+
[project.optional-dependencies]
|
|
41
|
+
dev = [
|
|
42
|
+
"pytest>=8.0",
|
|
43
|
+
]
|
|
44
|
+
|
|
45
|
+
[tool.setuptools]
|
|
46
|
+
include-package-data = true
|
|
47
|
+
|
|
48
|
+
[tool.setuptools.packages.find]
|
|
49
|
+
include = ["easeai*"]
|
easeai-0.1.0/setup.cfg
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import numpy as np
import pandas as pd

import easeai as ea


def test_basic_fit():
    """Smoke-test the end-to-end workflow on a small synthetic dataset.

    Builds a 100-row frame with two informative features plus id/name
    columns, fits the regressor on the linear target, and checks that
    summarize() yields a non-empty feature selection and metrics.
    """
    n_rows = 100
    rng = np.random.default_rng(42)

    # Draw order matters for reproducibility: x1, x2, then the noise term.
    frame = pd.DataFrame(
        {
            "x1": rng.normal(size=n_rows),
            "x2": rng.normal(size=n_rows),
            "FIPS": range(n_rows),
            "Counties": [f"c{i}" for i in range(n_rows)],
        }
    )
    frame["y"] = 2 * frame["x1"] - 0.5 * frame["x2"] + rng.normal(scale=0.1, size=n_rows)

    regressor = ea.TabularXAIRegressor(
        target="y",
        drop_columns=["FIPS", "Counties"],
        target_n_features=2,
    )

    regressor.fit(frame)
    summary = regressor.summarize(id_column="FIPS", name_column="Counties")

    assert len(summary.selected_features) > 0
    assert summary.metrics is not None
|