chrom-qsar 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chrom_qsar-0.1.0/LICENSE +21 -0
- chrom_qsar-0.1.0/PKG-INFO +30 -0
- chrom_qsar-0.1.0/chrom_qsar/__init__.py +8 -0
- chrom_qsar-0.1.0/chrom_qsar/data.py +24 -0
- chrom_qsar-0.1.0/chrom_qsar/interpreter.py +57 -0
- chrom_qsar-0.1.0/chrom_qsar/models.py +37 -0
- chrom_qsar-0.1.0/chrom_qsar/pipeline.py +137 -0
- chrom_qsar-0.1.0/chrom_qsar/trainer.py +37 -0
- chrom_qsar-0.1.0/chrom_qsar/utils.py +21 -0
- chrom_qsar-0.1.0/chrom_qsar.egg-info/PKG-INFO +30 -0
- chrom_qsar-0.1.0/chrom_qsar.egg-info/SOURCES.txt +14 -0
- chrom_qsar-0.1.0/chrom_qsar.egg-info/dependency_links.txt +1 -0
- chrom_qsar-0.1.0/chrom_qsar.egg-info/requires.txt +11 -0
- chrom_qsar-0.1.0/chrom_qsar.egg-info/top_level.txt +1 -0
- chrom_qsar-0.1.0/pyproject.toml +38 -0
- chrom_qsar-0.1.0/setup.cfg +4 -0
chrom_qsar-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Your Name / Your Lab
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: chrom-qsar
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A streamlined machine learning pipeline for QSAR and regression tasks with SHAP interpretability.
|
|
5
|
+
Author-email: Your Name <your.email@university.edu>
|
|
6
|
+
License: MIT
|
|
7
|
+
Keywords: machine-learning,qsar,shap,regression,optuna
|
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
|
9
|
+
Classifier: Intended Audience :: Science/Research
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Requires-Python: >=3.8
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
License-File: LICENSE
|
|
19
|
+
Requires-Dist: pandas>=1.3.0
|
|
20
|
+
Requires-Dist: numpy>=1.21.0
|
|
21
|
+
Requires-Dist: scikit-learn>=1.0.0
|
|
22
|
+
Requires-Dist: xgboost>=1.5.0
|
|
23
|
+
Requires-Dist: lightgbm>=3.3.0
|
|
24
|
+
Requires-Dist: optuna>=3.0.0
|
|
25
|
+
Requires-Dist: shap>=0.41.0
|
|
26
|
+
Requires-Dist: matplotlib>=3.4.0
|
|
27
|
+
Requires-Dist: seaborn>=0.11.0
|
|
28
|
+
Requires-Dist: joblib>=1.1.0
|
|
29
|
+
Requires-Dist: openpyxl>=3.0.0
|
|
30
|
+
Dynamic: license-file
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import pandas as pd
|
|
3
|
+
|
|
4
|
+
def load_and_clean_data(filepath: str):
    """Load an Excel sheet and return numeric features and target without incomplete rows.

    The first column of the sheet is used as the target and every remaining
    column as a feature. All cells are coerced to numbers; a row is discarded
    when its target or any of its features cannot be parsed as a number.

    Args:
        filepath: Path to the Excel file. When it does not exist as given,
            the same path is retried relative to the parent directory ('..').

    Returns:
        A ``(X_clean, y_clean)`` tuple: the feature DataFrame and the target
        Series (named ``'target'``). Both keep the row index of the sheet.

    Raises:
        FileNotFoundError: If the file exists at neither location.
    """
    if not os.path.exists(filepath):
        parent_path = os.path.join('..', filepath)
        if not os.path.exists(parent_path):
            raise FileNotFoundError(f"找不到数据文件: {filepath}")
        filepath = parent_path

    df = pd.read_excel(filepath)
    y = pd.to_numeric(df.iloc[:, 0], errors='coerce')
    X = df.iloc[:, 1:].apply(pd.to_numeric, errors='coerce')

    # Filter with a row mask instead of concatenating y as a 'target' column:
    # a feature that is itself called 'target' would otherwise collide with
    # the synthetic column and be dropped together with it.
    complete_rows = y.notna() & X.notna().all(axis=1)
    X_clean = X.loc[complete_rows]
    y_clean = y.loc[complete_rows].rename('target')

    print(f"数据加载完成: 原始样本 {len(df)} → 有效样本 {len(X_clean)} | 特征维度: {X_clean.shape[1]}")
    return X_clean, y_clean
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import shap
|
|
3
|
+
import numpy as np
|
|
4
|
+
import matplotlib.pyplot as plt
|
|
5
|
+
from sklearn.pipeline import Pipeline
|
|
6
|
+
|
|
7
|
+
def run_comprehensive_shap_analysis(model, X_test, feature_names, model_name, out_dir):
    """Compute SHAP values for a fitted model and save plots and raw values.

    Three files are written into ``out_dir``:
    ``shap_summary_<model_name>.png`` (beeswarm/summary plot, best effort),
    ``shap_importance_<model_name>.png`` (bar chart of the 15 features with
    the largest mean absolute SHAP value) and ``shap_values_<model_name>.csv``
    (one column per feature).

    Args:
        model: Fitted estimator exposing ``predict``. sklearn ``Pipeline``
            objects are always explained through ``model.predict`` with a
            permutation explainer; other models try ``shap.Explainer`` first.
        X_test: Samples to explain; also used as the explainer background.
        feature_names: Names matching the columns of ``X_test``.
        model_name: Label used in plot titles and output file names.
        out_dir: Existing directory that receives the output files.
    """
    # Local import: the module header of this file does not import pandas.
    import pandas as pd

    print(f" ├─ 启动 SHAP 分析 ({model_name})...")

    def _permutation_explainer():
        # PermutationExplainer rejects max_evals below 2 * n_features + 1, so
        # request exactly that minimum. The previous min(..., 1000) cap made
        # the explainer raise as soon as there were 500 or more features.
        n_feat = X_test.shape[1]
        return shap.PermutationExplainer(model.predict, X_test, max_evals=2 * n_feat + 1)

    if isinstance(model, Pipeline):
        explainer = _permutation_explainer()
    else:
        try:
            explainer = shap.Explainer(model, X_test)
        except Exception as exc:
            # Model type not supported by the automatic explainer: fall back
            # to the model-agnostic one and say so instead of hiding it.
            print(f" ├─ shap.Explainer 不可用,改用 PermutationExplainer: {exc}")
            explainer = _permutation_explainer()

    shap_output = explainer(X_test)
    shap_arr = shap_output.values if hasattr(shap_output, 'values') else np.array(shap_output)
    if shap_arr.ndim == 1:
        shap_arr = shap_arr.reshape(1, -1)
    elif shap_arr.ndim == 3 and shap_arr.shape[-1] == 1:
        # NOTE(review): presumably produced by models whose predict() returns
        # an (n, 1) array; drop the singleton output axis so the ranking and
        # the CSV export below see a 2-D (samples, features) array.
        shap_arr = shap_arr[..., 0]

    mean_abs_shap = np.mean(np.abs(shap_arr), axis=0)
    top_indices = np.argsort(mean_abs_shap)[::-1][:15]
    top_feats = [feature_names[i] for i in top_indices]

    # Summary plot is best effort: a failure is reported but must not stop
    # the importance chart and CSV export that follow.
    try:
        plt.figure(figsize=(12, 8))
        if hasattr(shap.plots, 'beeswarm'):
            shap.plots.beeswarm(shap_output, max_display=15, show=False)
        else:
            shap.summary_plot(shap_output, X_test, feature_names=feature_names, max_display=15, show=False)
        plt.title(model_name, fontsize=16, fontweight='bold', pad=10)
        plt.tight_layout()
        plt.savefig(os.path.join(out_dir, f'shap_summary_{model_name}.png'), dpi=300, bbox_inches='tight')
    except Exception as e:
        print(f" └─ Summary Plot 失败: {e}")
    finally:
        # Close on failure too, otherwise the figure stays open.
        plt.close()

    positions = range(len(top_feats))
    plt.figure(figsize=(10, 8))
    try:
        # Reverse so the most important feature ends up at the top of the chart.
        plt.barh(positions, mean_abs_shap[top_indices][::-1], color='#4C72B0', edgecolor='k', alpha=0.8)
        plt.yticks(positions, top_feats[::-1], fontsize=12)
        plt.xlabel('Mean |SHAP Value|', fontsize=14, fontweight='bold')
        plt.title(model_name, fontsize=16, fontweight='bold', pad=10)
        plt.grid(axis='x', linestyle='--', alpha=0.4)
        plt.tight_layout()
        plt.savefig(os.path.join(out_dir, f'shap_importance_{model_name}.png'), dpi=300, bbox_inches='tight')
    finally:
        plt.close()

    values_path = os.path.join(out_dir, f'shap_values_{model_name}.csv')
    pd.DataFrame(shap_arr, columns=feature_names).to_csv(values_path, index=False)
    print(f" └─ SHAP 分析完成,结果已保存至 {out_dir}")
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
|
|
2
|
+
from sklearn.cross_decomposition import PLSRegression
|
|
3
|
+
from sklearn.svm import SVR
|
|
4
|
+
from sklearn.tree import DecisionTreeRegressor
|
|
5
|
+
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor
|
|
6
|
+
import xgboost as xgb
|
|
7
|
+
import lightgbm as lgb
|
|
8
|
+
|
|
9
|
+
def get_model_configs():
    """Return the registry of regressors and their Optuna search spaces.

    Returns:
        A dict keyed by display name. Each value is a dict with ``'model'``
        (the estimator class, not an instance) and ``'space'`` (a callable
        taking an Optuna trial and returning constructor keyword arguments,
        or ``None`` when the model has nothing to tune).
    """
    # Parameters are suggested in a fixed order inside every space function;
    # the order is kept stable so seeded studies sample the same sequence.

    def _log_alpha(trial):
        return trial.suggest_float('alpha', 1e-3, 1e3, log=True)

    def _tree_leaf_limits(trial):
        limits = {'min_samples_split': trial.suggest_int('min_samples_split', 2, 10)}
        limits['min_samples_leaf'] = trial.suggest_int('min_samples_leaf', 1, 8)
        return limits

    def _boosting_head(trial, rounds_key):
        # Shared by XGBoost/LightGBM ('n_estimators') and HistGradientBoosting ('max_iter').
        head = {rounds_key: trial.suggest_int(rounds_key, 200, 1000, step=50)}
        head['max_depth'] = trial.suggest_int('max_depth', 3, 12)
        head['learning_rate'] = trial.suggest_float('learning_rate', 1e-3, 0.3, log=True)
        return head

    def _sampling_and_penalties(trial):
        tail = {'subsample': trial.suggest_float('subsample', 0.6, 1.0)}
        tail['colsample_bytree'] = trial.suggest_float('colsample_bytree', 0.6, 1.0)
        tail['reg_alpha'] = trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True)
        tail['reg_lambda'] = trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True)
        return tail

    def pls_space(trial):
        return {'n_components': trial.suggest_int('n_components', 2, 50)}

    def penalty_space(trial):
        # Ridge and Lasso tune the same single parameter over the same range.
        return {'alpha': _log_alpha(trial)}

    def enet_space(trial):
        space = {'alpha': _log_alpha(trial)}
        space['l1_ratio'] = trial.suggest_float('l1_ratio', 0.0, 1.0)
        return space

    def svr_space(trial):
        space = {'C': trial.suggest_float('C', 0.1, 100.0, log=True)}
        space['epsilon'] = trial.suggest_float('epsilon', 0.01, 0.5, log=True)
        space['kernel'] = trial.suggest_categorical('kernel', ['rbf', 'linear'])
        return space

    def dt_space(trial):
        space = {'max_depth': trial.suggest_int('max_depth', 3, 20)}
        space.update(_tree_leaf_limits(trial))
        return space

    def forest_space(trial):
        # RandomForest and ExtraTrees share one search space.
        space = {'n_estimators': trial.suggest_int('n_estimators', 200, 800, step=50)}
        space['max_depth'] = trial.suggest_int('max_depth', 5, 30)
        space.update(_tree_leaf_limits(trial))
        space['max_features'] = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])
        return space

    def xgb_space(trial):
        space = _boosting_head(trial, 'n_estimators')
        space.update(_sampling_and_penalties(trial))
        return space

    def lgb_space(trial):
        space = _boosting_head(trial, 'n_estimators')
        space['num_leaves'] = trial.suggest_int('num_leaves', 20, 100)
        space.update(_sampling_and_penalties(trial))
        return space

    def ada_space(trial):
        space = {'n_estimators': trial.suggest_int('n_estimators', 50, 500, step=25)}
        space['learning_rate'] = trial.suggest_float('learning_rate', 0.01, 1.0, log=True)
        space['loss'] = trial.suggest_categorical('loss', ['linear', 'square', 'exponential'])
        return space

    def hgb_space(trial):
        space = _boosting_head(trial, 'max_iter')
        space['min_samples_leaf'] = trial.suggest_int('min_samples_leaf', 10, 50)
        space['l2_regularization'] = trial.suggest_float('l2_regularization', 1e-4, 10.0, log=True)
        return space

    registry = (
        ('PLSR', PLSRegression, pls_space),
        ('LinearRegression', LinearRegression, None),
        ('Ridge', Ridge, penalty_space),
        ('Lasso', Lasso, penalty_space),
        ('ElasticNet', ElasticNet, enet_space),
        ('SVR', SVR, svr_space),
        ('DecisionTree', DecisionTreeRegressor, dt_space),
        ('RandomForest', RandomForestRegressor, forest_space),
        ('XGBoost', xgb.XGBRegressor, xgb_space),
        ('LightGBM', lgb.LGBMRegressor, lgb_space),
        ('AdaBoost', AdaBoostRegressor, ada_space),
        ('ExtraTrees', ExtraTreesRegressor, forest_space),
        ('HistGradientBoosting', HistGradientBoostingRegressor, hgb_space),
    )
    return {label: {'model': estimator, 'space': space} for label, estimator, space in registry}
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import glob
|
|
3
|
+
import joblib
|
|
4
|
+
import pandas as pd
|
|
5
|
+
import numpy as np
|
|
6
|
+
import matplotlib.pyplot as plt
|
|
7
|
+
from sklearn.model_selection import train_test_split
|
|
8
|
+
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
|
|
9
|
+
from .utils import setup_environment, set_seed
|
|
10
|
+
from .data import load_and_clean_data
|
|
11
|
+
from .models import get_model_configs
|
|
12
|
+
from .trainer import optimize_model
|
|
13
|
+
from .interpreter import run_comprehensive_shap_analysis
|
|
14
|
+
|
|
15
|
+
class QSARModelTrainer:
    """Tune, fit, evaluate and persist every configured regressor on one dataset.

    For each entry returned by ``get_model_configs`` the trainer optionally
    runs an Optuna search, fits the model on an 80 % training split, scores it
    on the remaining 20 %, saves the model and the feature list with joblib
    and runs the SHAP analysis.
    """

    def __init__(self, data_path: str, out_dir: str = 'training_results', n_trials: int = 100):
        """Configure the run and create the output directory.

        Args:
            data_path: Excel file passed to ``load_and_clean_data``.
            out_dir: Directory receiving models, plots and CSV files.
            n_trials: Number of Optuna trials per tunable model.
        """
        setup_environment()
        set_seed(42)
        self.data_path = data_path
        self.out_dir = out_dir
        self.n_trials = n_trials
        os.makedirs(out_dir, exist_ok=True)

    def run(self):
        """Execute the full training loop.

        Returns:
            DataFrame with columns ``Model``, ``R2``, ``RMSE`` and ``MAE``,
            sorted by descending R2. The same table is written to
            ``model_comparison.csv`` in the output directory.
        """
        print("="*60 + "\n 开始自动化建模与优化流程\n" + "="*60)
        X, y = load_and_clean_data(self.data_path)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        feature_names = X.columns.tolist()
        configs = get_model_configs()
        results = []

        for name, cfg in configs.items():
            print(f"\n[ {name} ] 处理中...")
            if cfg['space'] is not None:
                model, best_params = optimize_model(
                    cfg['model'], cfg['space'], X_train, y_train,
                    n_trials=self.n_trials, is_svr=(name == 'SVR')
                )
                print(f" ├─ 最优参数: {best_params}")
            else:
                model = cfg['model']()

            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            r2 = r2_score(y_test, y_pred)
            rmse = np.sqrt(mean_squared_error(y_test, y_pred))
            mae = mean_absolute_error(y_test, y_pred)
            print(f" ├─ 测试集: R²={r2:.4f} | RMSE={rmse:.4f} | MAE={mae:.4f}")

            results.append({'Model': name, 'R2': r2, 'RMSE': rmse, 'MAE': mae})

            # Persist the fitted model together with its feature order.
            joblib.dump(model, os.path.join(self.out_dir, f'model_{name}.pkl'))
            joblib.dump(feature_names, os.path.join(self.out_dir, f'features_{name}.pkl'))

            # SHAP is diagnostic output only. A failure for one model must not
            # abort the loop, otherwise the metrics of every model tuned so
            # far would be lost before model_comparison.csv is written.
            try:
                run_comprehensive_shap_analysis(model, X_test, feature_names, name, self.out_dir)
            except Exception as exc:
                print(f" └─ SHAP 分析失败: {exc}")

        # Summarise all models, best R2 first.
        res_df = pd.DataFrame(results).sort_values('R2', ascending=False)
        res_df.to_csv(os.path.join(self.out_dir, 'model_comparison.csv'), index=False)
        print("\n训练流程结束,结果已保存至:", self.out_dir)
        return res_df
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class QSARModelAnalyzer:
    """Re-evaluate previously saved models and regenerate their plots.

    The analyzer scans ``model_dir`` for ``model_<name>.pkl`` files with a
    matching ``features_<name>.pkl``, recreates the 80/20 split with
    ``random_state=42``, and for every model writes a true-vs-predicted
    scatter plot and the SHAP outputs to ``out_dir``.
    """

    def __init__(self, data_path: str, model_dir: str = '.', out_dir: str = 'batch_analysis_results'):
        """Configure the analysis and create the output directory.

        Args:
            data_path: Excel file passed to ``load_and_clean_data``.
            model_dir: Directory containing the saved model/feature pickles.
            out_dir: Directory receiving plots and the metrics CSV.
        """
        setup_environment()
        set_seed(42)
        self.data_path = data_path
        self.model_dir = model_dir
        self.out_dir = out_dir
        os.makedirs(out_dir, exist_ok=True)

    @staticmethod
    def _model_name_from_path(model_path: str) -> str:
        """Return ``<name>`` from a ``.../model_<name>.pkl`` path."""
        stem = os.path.basename(model_path)
        # Strip only the leading prefix and trailing suffix. str.replace()
        # would also remove 'model_' / '.pkl' occurring inside the name and
        # then the matching features_<name>.pkl could not be found.
        if stem.startswith('model_'):
            stem = stem[len('model_'):]
        if stem.endswith('.pkl'):
            stem = stem[:-len('.pkl')]
        return stem

    def run(self):
        """Analyse every saved model found in ``model_dir``.

        Returns:
            DataFrame with columns ``Model``, ``R2``, ``RMSE`` and ``MAE``
            sorted by descending R2 (empty when every model was skipped),
            also written to ``model_summary_metrics.csv``. ``None`` when no
            ``model_*.pkl`` file exists at all.
        """
        print("="*60 + f"\n 开始批量模型分析 (目录: {self.model_dir})\n" + "="*60)
        X, y = load_and_clean_data(self.data_path)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        model_files = sorted(glob.glob(os.path.join(self.model_dir, 'model_*.pkl')))
        if not model_files:
            print("未找到任何 model_*.pkl 文件!")
            return

        results = []
        for mfile in model_files:
            name = self._model_name_from_path(mfile)
            ffile = os.path.join(self.model_dir, f'features_{name}.pkl')

            if not os.path.exists(ffile):
                print(f"跳过 [{name}]: 缺少特征文件")
                continue

            print(f"\n[ {name} ] 分析中...")
            # SECURITY: joblib.load unpickles arbitrary objects and can run
            # code while doing so; model_dir must only contain trusted files.
            model = joblib.load(mfile)
            feat_names = joblib.load(ffile)

            # Skip models whose features are absent from this dataset instead
            # of aborting the whole batch with a KeyError.
            missing = [col for col in feat_names if col not in X_test.columns]
            if missing:
                print(f"跳过 [{name}]: 数据中缺少特征列 {missing}")
                continue
            X_test_aligned = X_test[feat_names]

            # Held-out metrics.
            y_pred = model.predict(X_test_aligned)
            r2 = r2_score(y_test, y_pred)
            rmse = np.sqrt(mean_squared_error(y_test, y_pred))
            mae = mean_absolute_error(y_test, y_pred)
            results.append({'Model': name, 'R2': r2, 'RMSE': rmse, 'MAE': mae})

            # 1. True vs. predicted scatter plot.
            self._plot_true_vs_pred(model, X_train, X_test_aligned, y_train, y_test, feat_names, name)

            # 2. SHAP analysis; a failure here must not lose the metrics
            # already collected for this and the previous models.
            try:
                run_comprehensive_shap_analysis(model, X_test_aligned, feat_names, name, self.out_dir)
            except Exception as exc:
                print(f" └─ SHAP 分析失败: {exc}")

        # Explicit columns keep sort_values('R2') valid when every model was
        # skipped and `results` is empty.
        res_df = pd.DataFrame(results, columns=['Model', 'R2', 'RMSE', 'MAE']).sort_values('R2', ascending=False)
        res_df.to_csv(os.path.join(self.out_dir, 'model_summary_metrics.csv'), index=False)
        print("\n批量分析结束,结果已保存至:", self.out_dir)
        return res_df

    def _plot_true_vs_pred(self, model, X_train, X_test, y_train, y_test, feat_names, name):
        """Save a true-vs-predicted scatter plot as ``true_pred_<name>.png``.

        Args:
            model: Fitted estimator exposing ``predict``.
            X_train: Training features; restricted to ``feat_names`` here.
            X_test: Test features, already restricted to ``feat_names``.
            y_train: Training targets.
            y_test: Test targets.
            feat_names: Feature columns the model was trained on.
            name: Model label used in the title and the file name.
        """
        y_train_pred = model.predict(X_train[feat_names])
        y_test_pred = model.predict(X_test)
        r2_train = r2_score(y_train, y_train_pred)
        r2_test = r2_score(y_test, y_test_pred)

        plt.figure(figsize=(8, 8))
        plt.scatter(y_train, y_train_pred, c='#2E86AB', alpha=0.6, edgecolor='white', s=80, label=f'Train (R²={r2_train:.3f})')
        plt.scatter(y_test, y_test_pred, c='#E94F37', alpha=0.7, edgecolor='black', s=100, label=f'Test (R²={r2_test:.3f})')

        # Identity line spanning the observed target range plus a 0.5 margin.
        lims = [min(y_train.min(), y_test.min()) - 0.5, max(y_train.max(), y_test.max()) + 0.5]
        plt.plot(lims, lims, 'k--', lw=2.0, zorder=0)

        plt.xlabel('True Adsorption Capacity', fontsize=14, fontweight='bold')
        plt.ylabel('Predicted Adsorption Capacity', fontsize=14, fontweight='bold')
        plt.title(name, fontsize=16, pad=10, fontweight='bold')
        plt.legend(fontsize=12, frameon=True, loc='upper left')
        plt.grid(axis='both', linestyle=':', alpha=0.4)
        plt.tight_layout()
        plt.savefig(os.path.join(self.out_dir, f'true_pred_{name}.png'), dpi=300, bbox_inches='tight')
        plt.close()
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import optuna
|
|
2
|
+
from sklearn.model_selection import cross_val_score
|
|
3
|
+
from sklearn.pipeline import make_pipeline
|
|
4
|
+
from sklearn.preprocessing import StandardScaler
|
|
5
|
+
|
|
6
|
+
def get_safe_params(model_class, params: dict) -> dict:
    """Return a copy of ``params`` extended with the defaults the model accepts.

    ``random_state=42`` and ``n_jobs=-1`` are each probed by instantiating
    ``model_class`` with only that keyword; the keyword is added (overriding
    any value already in ``params``) unless the constructor raises
    ``TypeError``. The input dict is not modified.

    Args:
        model_class: Estimator class (or any callable) to probe.
        params: Keyword arguments chosen for the model.

    Returns:
        A new dict with the accepted defaults merged in.
    """
    augmented = dict(params)
    for keyword, value in (('random_state', 42), ('n_jobs', -1)):
        try:
            model_class(**{keyword: value})
        except TypeError:
            # Constructor does not take this keyword; leave it out.
            continue
        augmented[keyword] = value
    return augmented
|
|
13
|
+
|
|
14
|
+
def optimize_model(model_class, space_func, X_train, y_train, n_trials: int = 100, is_svr: bool = False, cv: int = 10):
    """Search hyper-parameters with Optuna and return an unfitted best model.

    Each trial is scored by the mean cross-validated negative RMSE, which the
    study maximises with a TPE sampler seeded with 42.

    Args:
        model_class: Estimator class to instantiate for every trial.
        space_func: Callable mapping an Optuna trial to constructor keyword
            arguments; a falsy value means "no tunable parameters".
        X_train: Training features.
        y_train: Training targets.
        n_trials: Number of Optuna trials.
        is_svr: When true, the estimator is wrapped in a pipeline that
            standardises the features first.
        cv: Value forwarded to ``cross_val_score(cv=...)``. Defaults to 10,
            the value that used to be hard-coded.

    Returns:
        ``(model, best_params)``: a new, unfitted estimator (or pipeline)
        built from the best trial, and the parameters Optuna sampled for it.
    """
    def build(params):
        # Single place that turns sampled parameters into an estimator, so
        # the trial models and the returned model are built the same way.
        estimator = model_class(**get_safe_params(model_class, params))
        if is_svr:
            return make_pipeline(StandardScaler(), estimator)
        return estimator

    def objective(trial):
        params = space_func(trial) if space_func else {}
        fold_scores = cross_val_score(
            build(params), X_train, y_train,
            cv=cv, scoring='neg_root_mean_squared_error', n_jobs=-1,
        )
        return fold_scores.mean()

    # NOTE(review): the objective never calls trial.report(), so the pruner
    # configured here has nothing to act on; kept to preserve the study setup.
    study = optuna.create_study(
        direction='maximize',
        sampler=optuna.samplers.TPESampler(seed=42),
        pruner=optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=3)
    )
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

    best_params = study.best_trial.params
    return build(best_params), best_params
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import warnings
|
|
2
|
+
import numpy as np
|
|
3
|
+
import random
|
|
4
|
+
import matplotlib.pyplot as plt
|
|
5
|
+
import seaborn as sns
|
|
6
|
+
|
|
7
|
+
def setup_environment():
    """Silence warnings and apply the package's global plotting defaults.

    Side effects: installs an ``'ignore'`` warnings filter, applies the
    seaborn ``'whitegrid'`` style and updates ``matplotlib.pyplot.rcParams``
    (font family list, minus-sign handling and font sizes).
    """
    warnings.filterwarnings('ignore')

    # Apply the seaborn style first. set_style() writes its own
    # 'font.sans-serif' list into rcParams, so calling it after the font
    # assignment (as before) discarded the SimHei setting.
    sns.set_style('whitegrid')

    plt.rcParams.update({
        'font.sans-serif': ['SimHei', 'Arial'],
        # Render '-' with the ASCII hyphen so it exists in the fonts above.
        'axes.unicode_minus': False,
        'font.size': 12,
        'axes.labelsize': 14,
        'xtick.labelsize': 12,
        'ytick.labelsize': 12,
        'legend.fontsize': 12,
        'axes.titlesize': 16,
    })
|
|
18
|
+
|
|
19
|
+
def set_seed(seed: int = 42):
    """Seed Python's ``random`` module and NumPy's global random generator.

    Args:
        seed: Value passed to both seeding functions.
    """
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: chrom-qsar
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A streamlined machine learning pipeline for QSAR and regression tasks with SHAP interpretability.
|
|
5
|
+
Author-email: Your Name <your.email@university.edu>
|
|
6
|
+
License: MIT
|
|
7
|
+
Keywords: machine-learning,qsar,shap,regression,optuna
|
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
|
9
|
+
Classifier: Intended Audience :: Science/Research
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Requires-Python: >=3.8
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
License-File: LICENSE
|
|
19
|
+
Requires-Dist: pandas>=1.3.0
|
|
20
|
+
Requires-Dist: numpy>=1.21.0
|
|
21
|
+
Requires-Dist: scikit-learn>=1.0.0
|
|
22
|
+
Requires-Dist: xgboost>=1.5.0
|
|
23
|
+
Requires-Dist: lightgbm>=3.3.0
|
|
24
|
+
Requires-Dist: optuna>=3.0.0
|
|
25
|
+
Requires-Dist: shap>=0.41.0
|
|
26
|
+
Requires-Dist: matplotlib>=3.4.0
|
|
27
|
+
Requires-Dist: seaborn>=0.11.0
|
|
28
|
+
Requires-Dist: joblib>=1.1.0
|
|
29
|
+
Requires-Dist: openpyxl>=3.0.0
|
|
30
|
+
Dynamic: license-file
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
pyproject.toml
|
|
3
|
+
chrom_qsar/__init__.py
|
|
4
|
+
chrom_qsar/data.py
|
|
5
|
+
chrom_qsar/interpreter.py
|
|
6
|
+
chrom_qsar/models.py
|
|
7
|
+
chrom_qsar/pipeline.py
|
|
8
|
+
chrom_qsar/trainer.py
|
|
9
|
+
chrom_qsar/utils.py
|
|
10
|
+
chrom_qsar.egg-info/PKG-INFO
|
|
11
|
+
chrom_qsar.egg-info/SOURCES.txt
|
|
12
|
+
chrom_qsar.egg-info/dependency_links.txt
|
|
13
|
+
chrom_qsar.egg-info/requires.txt
|
|
14
|
+
chrom_qsar.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
chrom_qsar
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "chrom-qsar"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "A streamlined machine learning pipeline for QSAR and regression tasks with SHAP interpretability."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.8"
|
|
11
|
+
license = {text = "MIT"}
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "Your Name", email = "your.email@university.edu"}
|
|
14
|
+
]
|
|
15
|
+
keywords = ["machine-learning", "qsar", "shap", "regression", "optuna"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 3 - Alpha",
|
|
18
|
+
"Intended Audience :: Science/Research",
|
|
19
|
+
"License :: OSI Approved :: MIT License",
|
|
20
|
+
"Programming Language :: Python :: 3",
|
|
21
|
+
"Programming Language :: Python :: 3.8",
|
|
22
|
+
"Programming Language :: Python :: 3.9",
|
|
23
|
+
"Programming Language :: Python :: 3.10",
|
|
24
|
+
"Programming Language :: Python :: 3.11",
|
|
25
|
+
]
|
|
26
|
+
dependencies = [
|
|
27
|
+
"pandas>=1.3.0",
|
|
28
|
+
"numpy>=1.21.0",
|
|
29
|
+
"scikit-learn>=1.0.0",
|
|
30
|
+
"xgboost>=1.5.0",
|
|
31
|
+
"lightgbm>=3.3.0",
|
|
32
|
+
"optuna>=3.0.0",
|
|
33
|
+
"shap>=0.41.0",
|
|
34
|
+
"matplotlib>=3.4.0",
|
|
35
|
+
"seaborn>=0.11.0",
|
|
36
|
+
"joblib>=1.1.0",
|
|
37
|
+
"openpyxl>=3.0.0"
|
|
38
|
+
]
|