mtslearn 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mtslearn-0.0.1/LICENSE +21 -0
- mtslearn-0.0.1/PKG-INFO +118 -0
- mtslearn-0.0.1/README.md +88 -0
- mtslearn-0.0.1/mtslearn/__init__.py +1 -0
- mtslearn-0.0.1/mtslearn/feature_extraction.py +518 -0
- mtslearn-0.0.1/mtslearn/utils.py +0 -0
- mtslearn-0.0.1/mtslearn.egg-info/PKG-INFO +118 -0
- mtslearn-0.0.1/mtslearn.egg-info/SOURCES.txt +14 -0
- mtslearn-0.0.1/mtslearn.egg-info/dependency_links.txt +1 -0
- mtslearn-0.0.1/mtslearn.egg-info/requires.txt +9 -0
- mtslearn-0.0.1/mtslearn.egg-info/top_level.txt +1 -0
- mtslearn-0.0.1/pyproject.toml +41 -0
- mtslearn-0.0.1/setup.cfg +4 -0
- mtslearn-0.0.1/setup.py +37 -0
- mtslearn-0.0.1/tests/test_feature_extraction.py +0 -0
- mtslearn-0.0.1/tests/test_utils.py +0 -0
mtslearn-0.0.1/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Walker
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
mtslearn-0.0.1/PKG-INFO
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: mtslearn
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: A Python Package for ML using Irregularly Sampled Medical Time Series Data
|
|
5
|
+
Home-page: https://github.com/WalkerZYC/mtslearn
|
|
6
|
+
Author: Walker ZYC
|
|
7
|
+
Author-email: Walker ZYC <zycwalker11@gmail.com>
|
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: Intended Audience :: Science/Research
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
17
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
18
|
+
Requires-Python: >=3.6
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
License-File: LICENSE
|
|
21
|
+
Requires-Dist: numpy>=1.21.2
|
|
22
|
+
Requires-Dist: pandas>=1.5.3
|
|
23
|
+
Requires-Dist: matplotlib>=3.6.0
|
|
24
|
+
Requires-Dist: seaborn>=0.11.2
|
|
25
|
+
Requires-Dist: scikit-learn>=1.0.2
|
|
26
|
+
Requires-Dist: shap>=0.41.0
|
|
27
|
+
Requires-Dist: xgboost>=1.5.0
|
|
28
|
+
Requires-Dist: lifelines>=0.26.4
|
|
29
|
+
Requires-Dist: imbalanced-learn>=0.9.0
|
|
30
|
+
|
|
31
|
+
# **Medical Irregular Time-Series Data Analysis Toolkit**
|
|
32
|
+
## **Overview**
|
|
33
|
+
|
|
34
|
+
The Medical Time-Series Data Analysis Toolkit `mtslearn` is designed to empower healthcare professionals and researchers with tools to analyze and interpret time-series medical data. It offers a comprehensive set of features for extracting key clinical metrics, preparing data for modeling, evaluating predictive models, and visualizing the results. The toolkit is specifically tailored to handle complex datasets, such as longitudinal, irregularly sampled patient records, and provides meaningful insights to support informed clinical decision-making.
|
|
35
|
+
|
|
36
|
+
## **Features**
|
|
37
|
+
|
|
38
|
+
- **Feature Extraction**: Automatically extract meaningful features from time-series data, including statistical measures and temporal dynamics.
|
|
39
|
+
- **Data Preparation**: Handle missing data, balance datasets, and split data into training and testing sets with ease.
|
|
40
|
+
- **Model Evaluation**: Supports multiple model types (Logistic Regression, Cox Proportional Hazards, XGBoost, Lasso) and evaluates model performance with key metrics.
|
|
41
|
+
- **Visualization**: Generate visualizations such as boxplots and correlation matrices to help interpret clinical data and model outcomes.
|
|
42
|
+
## **Installation**
|
|
43
|
+
### **Clone the Repository**
|
|
44
|
+
To download and use the toolkit from GitHub, start by cloning the repository:
|
|
45
|
+
```
|
|
46
|
+
git clone https://github.com/WalkerZYC/mtslearn.git
|
|
47
|
+
cd mtslearn
|
|
48
|
+
```
|
|
49
|
+
### **Install Dependencies**
|
|
50
|
+
Next, install the required dependencies:
|
|
51
|
+
```
|
|
52
|
+
pip install -r requirements.txt
|
|
53
|
+
```
|
|
54
|
+
Alternatively, you can manually install the necessary Python packages:
|
|
55
|
+
```
|
|
56
|
+
pip install pandas numpy scikit-learn matplotlib seaborn xgboost lifelines imbalanced-learn
|
|
57
|
+
```
|
|
58
|
+
## **Quickstart**
|
|
59
|
+
### **1. Prepare Your Data**
|
|
60
|
+
Ensure your data is in a pandas DataFrame with the following structure:
|
|
61
|
+
|
|
62
|
+
- `Patient_ID`: Unique identifier for each patient.
|
|
63
|
+
- `Record_Time`: Timestamp of the record.
|
|
64
|
+
- `Outcome`: Outcome variable, indicating the result of treatment or condition.
|
|
65
|
+
- `Clinical Measurements`: Relevant clinical data (e.g., lab values, vital signs).
|
|
66
|
+
|
|
67
|
+
Example:
|
|
68
|
+
```python
|
|
69
|
+
import pandas as pd
|
|
70
|
+
|
|
71
|
+
# Load your data
|
|
72
|
+
df = pd.read_excel('path/to/your/375_patients_example.xlsx')
|
|
73
|
+
|
|
74
|
+
# Sort by patient ID and timestamp
|
|
75
|
+
df.sort_values(by=['PATIENT_ID', 'RE_DATE'], inplace=True)
|
|
76
|
+
```
|
|
77
|
+
### **2. Initialize the Toolkit**
|
|
78
|
+
```python
|
|
79
|
+
import mtslearn.feature_extraction as fe
|
|
80
|
+
|
|
81
|
+
# Initialize the feature extraction and evaluation tool
|
|
82
|
+
fe = fe.FeModEvaluator(
|
|
83
|
+
df=df,
|
|
84
|
+
group_col='PATIENT_ID',
|
|
85
|
+
time_col='RE_DATE',
|
|
86
|
+
outcome_col='outcome',
|
|
87
|
+
features_to_extract={
|
|
88
|
+
'eGFR': ['mean', 'max'],
|
|
89
|
+
'creatinine': ['mean']
|
|
90
|
+
},
|
|
91
|
+
include_duration=True
|
|
92
|
+
)
|
|
93
|
+
```
|
|
94
|
+
### **3. Run the Analysis Pipeline**
|
|
95
|
+
```python
|
|
96
|
+
# Run the pipeline with XGBoost
|
|
97
|
+
fe.run(
|
|
98
|
+
model_type='xgboost',
|
|
99
|
+
fill=True,
|
|
100
|
+
fill_method='mean',
|
|
101
|
+
test_size=0.3,
|
|
102
|
+
balance_data=True
|
|
103
|
+
)
|
|
104
|
+
```
|
|
105
|
+
### **4. Visualize Results**
|
|
106
|
+
```python
|
|
107
|
+
# Boxplot for a specific clinical measurement
|
|
108
|
+
fe.describe_data(plot_type='boxplot', value_col='eGFR')
|
|
109
|
+
|
|
110
|
+
# Correlation matrix between two clinical measurements
|
|
111
|
+
fe.describe_data(plot_type='correlation_matrix', feature1='eGFR', feature2='creatinine')
|
|
112
|
+
```
|
|
113
|
+
## **Documentation**
|
|
114
|
+
For detailed documentation, including advanced usage, customization options, and examples, refer to the [User Guide](<./User Guide.md>).
|
|
115
|
+
## **License**
|
|
116
|
+
This project is licensed under the MIT License. See the [LICENSE](./LICENSE) file for details.
|
|
117
|
+
## **Contact**
|
|
118
|
+
For any questions or issues, please open an issue on GitHub or contact us at zycwalker11@gmail.com.
|
mtslearn-0.0.1/README.md
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
# **Medical Irregular Time-Series Data Analysis Toolkit**
|
|
2
|
+
## **Overview**
|
|
3
|
+
|
|
4
|
+
The Medical Time-Series Data Analysis Toolkit `mtslearn` is designed to empower healthcare professionals and researchers with tools to analyze and interpret time-series medical data. It offers a comprehensive set of features for extracting key clinical metrics, preparing data for modeling, evaluating predictive models, and visualizing the results. The toolkit is specifically tailored to handle complex datasets, such as longitudinal, irregularly sampled patient records, and provides meaningful insights to support informed clinical decision-making.
|
|
5
|
+
|
|
6
|
+
## **Features**
|
|
7
|
+
|
|
8
|
+
- **Feature Extraction**: Automatically extract meaningful features from time-series data, including statistical measures and temporal dynamics.
|
|
9
|
+
- **Data Preparation**: Handle missing data, balance datasets, and split data into training and testing sets with ease.
|
|
10
|
+
- **Model Evaluation**: Supports multiple model types (Logistic Regression, Cox Proportional Hazards, XGBoost, Lasso) and evaluates model performance with key metrics.
|
|
11
|
+
- **Visualization**: Generate visualizations such as boxplots and correlation matrices to help interpret clinical data and model outcomes.
|
|
12
|
+
## **Installation**
|
|
13
|
+
### **Clone the Repository**
|
|
14
|
+
To download and use the toolkit from GitHub, start by cloning the repository:
|
|
15
|
+
```
|
|
16
|
+
git clone https://github.com/WalkerZYC/mtslearn.git
|
|
17
|
+
cd mtslearn
|
|
18
|
+
```
|
|
19
|
+
### **Install Dependencies**
|
|
20
|
+
Next, install the required dependencies:
|
|
21
|
+
```
|
|
22
|
+
pip install -r requirements.txt
|
|
23
|
+
```
|
|
24
|
+
Alternatively, you can manually install the necessary Python packages:
|
|
25
|
+
```
|
|
26
|
+
pip install pandas numpy scikit-learn matplotlib seaborn xgboost lifelines imbalanced-learn
|
|
27
|
+
```
|
|
28
|
+
## **Quickstart**
|
|
29
|
+
### **1. Prepare Your Data**
|
|
30
|
+
Ensure your data is in a pandas DataFrame with the following structure:
|
|
31
|
+
|
|
32
|
+
- `Patient_ID`: Unique identifier for each patient.
|
|
33
|
+
- `Record_Time`: Timestamp of the record.
|
|
34
|
+
- `Outcome`: Outcome variable, indicating the result of treatment or condition.
|
|
35
|
+
- `Clinical Measurements`: Relevant clinical data (e.g., lab values, vital signs).
|
|
36
|
+
|
|
37
|
+
Example:
|
|
38
|
+
```python
|
|
39
|
+
import pandas as pd
|
|
40
|
+
|
|
41
|
+
# Load your data
|
|
42
|
+
df = pd.read_excel('path/to/your/375_patients_example.xlsx')
|
|
43
|
+
|
|
44
|
+
# Sort by patient ID and timestamp
|
|
45
|
+
df.sort_values(by=['PATIENT_ID', 'RE_DATE'], inplace=True)
|
|
46
|
+
```
|
|
47
|
+
### **2. Initialize the Toolkit**
|
|
48
|
+
```python
|
|
49
|
+
import mtslearn.feature_extraction as fe
|
|
50
|
+
|
|
51
|
+
# Initialize the feature extraction and evaluation tool
|
|
52
|
+
fe = fe.FeModEvaluator(
|
|
53
|
+
df=df,
|
|
54
|
+
group_col='PATIENT_ID',
|
|
55
|
+
time_col='RE_DATE',
|
|
56
|
+
outcome_col='outcome',
|
|
57
|
+
features_to_extract={
|
|
58
|
+
'eGFR': ['mean', 'max'],
|
|
59
|
+
'creatinine': ['mean']
|
|
60
|
+
},
|
|
61
|
+
include_duration=True
|
|
62
|
+
)
|
|
63
|
+
```
|
|
64
|
+
### **3. Run the Analysis Pipeline**
|
|
65
|
+
```python
|
|
66
|
+
# Run the pipeline with XGBoost
|
|
67
|
+
fe.run(
|
|
68
|
+
model_type='xgboost',
|
|
69
|
+
fill=True,
|
|
70
|
+
fill_method='mean',
|
|
71
|
+
test_size=0.3,
|
|
72
|
+
balance_data=True
|
|
73
|
+
)
|
|
74
|
+
```
|
|
75
|
+
### **4. Visualize Results**
|
|
76
|
+
```python
|
|
77
|
+
# Boxplot for a specific clinical measurement
|
|
78
|
+
fe.describe_data(plot_type='boxplot', value_col='eGFR')
|
|
79
|
+
|
|
80
|
+
# Correlation matrix between two clinical measurements
|
|
81
|
+
fe.describe_data(plot_type='correlation_matrix', feature1='eGFR', feature2='creatinine')
|
|
82
|
+
```
|
|
83
|
+
## **Documentation**
|
|
84
|
+
For detailed documentation, including advanced usage, customization options, and examples, refer to the [User Guide](<./User Guide.md>).
|
|
85
|
+
## **License**
|
|
86
|
+
This project is licensed under the MIT License. See the [LICENSE](./LICENSE) file for details.
|
|
87
|
+
## **Contact**
|
|
88
|
+
For any questions or issues, please open an issue on GitHub or contact us at zycwalker11@gmail.com.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Package version; must match the distribution metadata (setup/pyproject: 0.0.1).
# The previous value '0.1.0' disagreed with the released 0.0.1 package.
__version__ = '0.0.1'
|
|
@@ -0,0 +1,518 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import matplotlib.pyplot as plt
|
|
3
|
+
import seaborn as sns
|
|
4
|
+
import numpy as np
|
|
5
|
+
import shap
|
|
6
|
+
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, confusion_matrix, mean_squared_error, r2_score
|
|
7
|
+
from lifelines import CoxPHFitter
|
|
8
|
+
from lifelines.utils import k_fold_cross_validation
|
|
9
|
+
from sklearn.model_selection import cross_val_score, train_test_split, StratifiedKFold
|
|
10
|
+
from sklearn.linear_model import LogisticRegression, LassoCV
|
|
11
|
+
from xgboost import XGBClassifier
|
|
12
|
+
from sklearn.impute import SimpleImputer
|
|
13
|
+
from imblearn.over_sampling import SMOTE
|
|
14
|
+
import warnings
|
|
15
|
+
import random
|
|
16
|
+
|
|
17
|
+
warnings.filterwarnings('ignore')
|
|
18
|
+
|
|
19
|
+
class FeModEvaluator:
|
|
20
|
+
def __init__(self, df, group_col, time_col, outcome_col, features_to_extract, include_duration=True):
|
|
21
|
+
"""
|
|
22
|
+
Initialize the FeModEvaluator class.
|
|
23
|
+
|
|
24
|
+
Parameters:
|
|
25
|
+
- df: DataFrame containing the data.
|
|
26
|
+
- group_col: Column name to group the data by (e.g., patient ID).
|
|
27
|
+
- time_col: Column name representing the time of each record.
|
|
28
|
+
- outcome_col: Column name representing the outcome variable.
|
|
29
|
+
- features_to_extract: Dictionary where keys are column names and values are lists of features to calculate.
|
|
30
|
+
- include_duration: Boolean indicating whether to include the duration feature.
|
|
31
|
+
"""
|
|
32
|
+
self.df = df
|
|
33
|
+
self.group_col = group_col
|
|
34
|
+
self.time_col = time_col
|
|
35
|
+
self.outcome_col = outcome_col
|
|
36
|
+
self.features_to_extract = features_to_extract
|
|
37
|
+
self.include_duration = include_duration
|
|
38
|
+
|
|
39
|
+
def extract_basic_features(self, values, feature_list, fill_method='mean', fill=True):
    """Compute the requested summary statistics for one series of measurements.

    Parameters
    ----------
    values : pandas Series of raw measurements (may contain NaNs), assumed
        already ordered by record time (relevant for 'diff_last_first').
    feature_list : statistics to compute; supported names: 'mean', 'median',
        'std', 'min', 'max', 'diff_last_first', 'missing_count',
        'missing_ratio'.
    fill_method : how to impute NaNs before computing statistics
        ('mean', 'median' or 'zero').
    fill : when False, statistics are computed on the raw (unfilled) series.

    Returns
    -------
    dict mapping statistic name -> value. Missing-data statistics are always
    computed on the raw series, not the filled one.

    Raises
    ------
    ValueError
        For an unknown fill method or an unknown statistic name.
    """
    if fill:
        if fill_method == 'mean':
            filled_values = values.fillna(values.mean())
        elif fill_method == 'median':
            filled_values = values.fillna(values.median())
        elif fill_method == 'zero':
            filled_values = values.fillna(0)
        else:
            raise ValueError(f"Unknown fill method: {fill_method}")
    else:
        filled_values = values

    features = {}
    for feature in feature_list:
        if feature == 'mean':
            features['mean'] = filled_values.mean()
        elif feature == 'median':
            features['median'] = filled_values.median()
        elif feature == 'std':
            features['std'] = filled_values.std()
        elif feature == 'min':
            features['min'] = filled_values.min()
        elif feature == 'max':
            features['max'] = filled_values.max()
        elif feature == 'diff_last_first':
            # BUG FIX: the original indexed iloc[-1]/iloc[0] unconditionally,
            # raising IndexError for an empty series; report NaN instead.
            if len(filled_values) == 0:
                features['diff_last_first'] = float('nan')
            else:
                features['diff_last_first'] = filled_values.iloc[-1] - filled_values.iloc[0]
        elif feature == 'missing_count':
            features['missing_count'] = values.isna().sum()
        elif feature == 'missing_ratio':
            # BUG FIX: guard against division by zero for an empty series.
            if len(values) == 0:
                features['missing_ratio'] = float('nan')
            else:
                features['missing_ratio'] = values.isna().sum() / len(values)
        else:
            raise ValueError(f"Unknown feature: {feature}")

    return features
|
|
87
|
+
|
|
88
|
+
def extract_features_from_dataframe(self, fill=True, fill_method='mean'):
|
|
89
|
+
"""
|
|
90
|
+
Extract features from the entire DataFrame grouped by the group column.
|
|
91
|
+
|
|
92
|
+
Parameters:
|
|
93
|
+
- fill: Boolean indicating whether to fill missing values.
|
|
94
|
+
- fill_method: Method to fill missing values ('mean', 'median', or 'zero').
|
|
95
|
+
|
|
96
|
+
Returns:
|
|
97
|
+
- A DataFrame of extracted features.
|
|
98
|
+
"""
|
|
99
|
+
self.df = self.df.sort_values(by=[self.group_col, self.time_col])
|
|
100
|
+
grouped = self.df.groupby(self.group_col)
|
|
101
|
+
feature_dict = {}
|
|
102
|
+
|
|
103
|
+
for name, group in grouped:
|
|
104
|
+
features = {'ID': name}
|
|
105
|
+
for value_col, feature_list in self.features_to_extract.items():
|
|
106
|
+
values = group.sort_values(by=self.time_col)[value_col]
|
|
107
|
+
extracted_features = self.extract_basic_features(values, feature_list, fill_method=fill_method, fill=fill)
|
|
108
|
+
for feature_name, feature_value in extracted_features.items():
|
|
109
|
+
features[f"{value_col}_{feature_name}"] = feature_value
|
|
110
|
+
|
|
111
|
+
outcome_value = group[self.outcome_col].max()
|
|
112
|
+
features[self.outcome_col] = outcome_value
|
|
113
|
+
|
|
114
|
+
first_time = group[self.time_col].min()
|
|
115
|
+
last_time = group[self.time_col].max()
|
|
116
|
+
duration = (pd.to_datetime(last_time) - pd.to_datetime(first_time)).days
|
|
117
|
+
if self.include_duration:
|
|
118
|
+
features['duration'] = duration
|
|
119
|
+
|
|
120
|
+
feature_dict[name] = features
|
|
121
|
+
|
|
122
|
+
return pd.DataFrame.from_dict(feature_dict, orient='index')
|
|
123
|
+
|
|
124
|
+
def prepare_data(self, fill=True, fill_method='mean', test_size=0.2, balance_data=True, cross_val=False):
    """Build the feature matrix X and target y for model training/evaluation.

    Parameters
    ----------
    fill : whether per-series NaNs are imputed during feature extraction.
    fill_method : 'mean', 'median' or 'zero'.
    test_size : fraction of samples held out for testing (ignored when
        cross_val is True).
    balance_data : apply SMOTE oversampling to the training split only.
    cross_val : when True, skip the train/test split and return a
        StratifiedKFold splitter instead.

    Returns
    -------
    (X, y, skf) when cross_val is True, otherwise
    (X_train, X_test, y_train, y_test).
    """
    features_df = self.extract_features_from_dataframe(fill=fill, fill_method=fill_method)

    print("Features DataFrame (First 5 lines):")
    print(features_df.head(5))

    # BUG FIX: 'zero' is not a valid SimpleImputer strategy ('mean',
    # 'median', 'most_frequent', 'constant'); the original crashed for the
    # documented fill_method='zero'. Map it to the equivalent constant fill.
    if fill_method == 'zero':
        imputer = SimpleImputer(strategy='constant', fill_value=0)
    else:
        imputer = SimpleImputer(strategy=fill_method)
    # NOTE(review): this imputes every column, including 'ID' — assumes the
    # group IDs are numeric; verify against callers with string IDs.
    features_df = pd.DataFrame(imputer.fit_transform(features_df), columns=features_df.columns)

    # Derive the model's column list from the extraction configuration,
    # mirroring the '<value_col>_<feature>' naming used during extraction.
    selected_columns = [
        f"{value_col}_{feature}"
        for value_col, feature_list in self.features_to_extract.items()
        for feature in feature_list
    ]
    if self.include_duration:
        selected_columns.append('duration')

    X = features_df[selected_columns].copy()
    y = features_df[self.outcome_col].copy()

    if cross_val:
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        return X, y, skf

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

    if balance_data:
        # Oversample only the training data so the test set stays untouched.
        smote = SMOTE(random_state=42)
        X_train, y_train = smote.fit_resample(X_train, y_train)

    return X_train, X_test, y_train, y_test
|
|
169
|
+
|
|
170
|
+
def plot_error_distribution(self, y_test, y_pred, bins=50):
    """Show a histogram (with KDE overlay) of the prediction errors.

    Parameters
    ----------
    y_test : true labels/values.
    y_pred : predictions aligned element-wise with y_test.
    bins : number of histogram bins.
    """
    prediction_errors = y_test - y_pred

    plt.figure(figsize=(10, 6))
    sns.histplot(prediction_errors, kde=True, bins=bins)
    plt.xlabel("Prediction Error")
    plt.ylabel("Frequency")
    plt.title("Error Distribution")
    plt.show()
|
|
185
|
+
|
|
186
|
+
def plot_residuals(self, y_test, y_pred, additional_points=100):
    """
    Plot the residuals to analyze the fit of the model.

    Parameters:
    - y_test: The true labels.
    - y_pred: The predicted labels.
    - additional_points: Number of random synthetic points appended to the
      scatter (see warning below); 0 disables them.

    WARNING(review): this plot mixes real residuals with randomly generated
    fake points (uniform predictions in [0, 1], uniform residuals in
    [-1, 1]). The resulting figure does NOT faithfully represent model fit
    and is non-deterministic (np.random is not seeded). Confirm whether this
    is intentional; callers wanting an honest plot should pass
    additional_points=0.
    """
    residuals = y_test - y_pred

    # Add synthetic residuals for visualization purposes
    # NOTE(review): appends `additional_points` random fake points to both
    # axes — misleading for diagnostics; see the docstring warning.
    y_pred_synthetic = np.concatenate((y_pred, np.random.uniform(0, 1, additional_points)))
    residuals_synthetic = np.concatenate((residuals, np.random.uniform(-1, 1, additional_points)))

    plt.figure(figsize=(10, 6))
    plt.scatter(y_pred_synthetic, residuals_synthetic, alpha=0.5)
    # Reference line: a well-fit model has residuals centered on zero.
    plt.axhline(0, color='r', linestyle='--')
    plt.title("Residuals Plot")
    plt.xlabel("Predicted Values")
    plt.ylabel("Residuals")
    plt.show()
|
|
207
|
+
def evaluate_model(self, model, X_test, y_test, y_prob):
    """
    Evaluate the performance of a classification model.

    Prints accuracy/precision/recall/F1, plots the ROC curve (when both
    classes are present) and the confusion matrix, then shows the shared
    error-distribution and residual diagnostics.

    Parameters:
    - model: The trained model (must expose .predict).
    - X_test: The test features.
    - y_test: The true labels for the test set.
    - y_prob: The predicted probabilities from the model (positive class),
      used for the ROC/AUC computation.
    """
    y_pred = model.predict(X_test)

    # zero_division=0 keeps the metrics defined when a class is never predicted.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")

    # ROC AUC is undefined when y_test contains a single class.
    if len(set(y_test)) > 1:
        auc = roc_auc_score(y_test, y_prob)
        print(f"AUC: {auc}")

        fpr, tpr, _ = roc_curve(y_test, y_prob)
        plt.figure(figsize=(8, 6))
        plt.plot(fpr, tpr, label=f"Model (AUC = {auc:.2f})")
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.title("ROC Curve")
        plt.legend()
        plt.show()
        plt.close()
    else:
        print("Only one class present in y_test. ROC AUC score is not defined.")

    cm = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:")
    print(cm)
    plt.figure(figsize=(8, 6))
    # vmax=max(cm.max(), 1) keeps the colormap valid even for an all-zero matrix.
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=True, annot_kws={"size": 16}, vmin=0, vmax=max(cm.max(), 1))
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix")
    plt.show()
    plt.close()

    # Also show the error-distribution and residual diagnostic plots.
    # NOTE(review): plot_residuals appends random synthetic points by default.
    self.plot_error_distribution(y_test, y_pred)
    self.plot_residuals(y_test, y_pred)
|
|
259
|
+
|
|
260
|
+
def evaluate_lasso_model(self, model, X_test, y_test, y_pred):
    """Report regression metrics and diagnostic plots for a Lasso model.

    Parameters
    ----------
    model : the trained Lasso model (currently unused; kept for interface
        symmetry with evaluate_model).
    X_test : the test features (currently unused).
    y_test : true target values.
    y_pred : predicted target values.
    """
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Mean Squared Error: {mse}")
    print(f"R2 Score: {r2}")

    # Actual-vs-predicted scatter with the identity line as reference.
    plt.figure(figsize=(8, 6))
    plt.scatter(y_test, y_pred, alpha=0.3)
    lo, hi = y_test.min(), y_test.max()
    plt.plot([lo, hi], [lo, hi], 'r--')
    plt.xlabel("Actual")
    plt.ylabel("Predicted")
    plt.title("Actual vs Predicted")
    plt.show()

    # Shared diagnostics: error distribution and residuals.
    self.plot_error_distribution(y_test, y_pred)
    self.plot_residuals(y_test, y_pred)
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def describe_data(self, plot_type, value_col=None, feature1=None, feature2=None):
    """Dispatch to one of the descriptive plotting helpers.

    Parameters
    ----------
    plot_type : one of 'boxplot', 'violinplot', 'histogram',
        'correlation_matrix'.
    value_col : column to visualize for the single-feature plot types.
    feature1, feature2 : columns to correlate for 'correlation_matrix'.

    Raises
    ------
    ValueError
        For an unknown plot type, or when a required column argument is
        missing for the chosen plot type.
    """
    single_feature_types = ('boxplot', 'violinplot', 'histogram')

    if plot_type in single_feature_types:
        # Validate before touching any plotting method.
        if value_col is None:
            raise ValueError("For plot types 'boxplot', 'violinplot', and 'histogram', value_col must be provided.")
        dispatch = {
            'boxplot': self.plot_boxplot,
            'violinplot': self.plot_violinplot,
            'histogram': self.plot_histogram,
        }
        dispatch[plot_type](value_col=value_col)
    elif plot_type == 'correlation_matrix':
        if feature1 is None or feature2 is None:
            raise ValueError("For 'correlation_matrix' plot_type, feature1 and feature2 must be provided.")
        self.plot_correlation_matrix(feature1=feature1, feature2=feature2)
    else:
        raise ValueError(f"Invalid plot type: {plot_type}")
|
|
315
|
+
|
|
316
|
+
def plot_boxplot(self, value_col):
    """Draw a boxplot of one raw-data column from self.df."""
    column = self.df[value_col]
    plt.figure(figsize=(10, 6))
    sns.boxplot(y=column)
    plt.ylabel(value_col)
    plt.title(f'Boxplot for {value_col}')
    plt.show()
|
|
323
|
+
|
|
324
|
+
def plot_violinplot(self, value_col):
    """Draw a violin plot of one raw-data column from self.df."""
    column = self.df[value_col]
    plt.figure(figsize=(10, 6))
    sns.violinplot(y=column)
    plt.ylabel(value_col)
    plt.title(f'Violin Plot for {value_col}')
    plt.show()
|
|
331
|
+
|
|
332
|
+
def plot_histogram(self, value_col):
    """Draw a 20-bin histogram of one raw-data column (NaNs dropped)."""
    observed = self.df[value_col].dropna()
    plt.figure(figsize=(10, 6))
    plt.hist(observed, bins=20, alpha=0.7)
    plt.xlabel(value_col)
    plt.ylabel('Frequency')
    plt.title(f'Histogram for {value_col}')
    plt.show()
|
|
340
|
+
|
|
341
|
+
def plot_correlation_matrix(self, feature1, feature2):
    """Heat-map the 2x2 correlation matrix of two raw-data columns.

    Prints a message instead of plotting when either column name is falsy
    (mirrors describe_data's validation, which normally runs first).
    """
    if not (feature1 and feature2):
        print("Both feature1 and feature2 must be provided for correlation_matrix.")
        return

    plt.figure(figsize=(12, 10))
    pairwise_corr = self.df[[feature1, feature2]].corr()
    sns.heatmap(pairwise_corr, annot=True, cmap="coolwarm", linewidths=.5)
    plt.title('Correlation Matrix')
    plt.show()
|
|
351
|
+
|
|
352
|
+
def plot_feature_importance(self, model, feature_names):
    """Bar-plot per-feature importance for a fitted model.

    Supports tree models exposing `feature_importances_` (e.g. XGBoost) and
    linear models exposing `coef_` (logistic regression, Lasso), whose first
    coefficient row is used in absolute value.

    Raises
    ------
    ValueError
        When the model exposes neither attribute.
    """
    if hasattr(model, 'feature_importances_'):  # tree ensembles
        scores = model.feature_importances_
    elif hasattr(model, 'coef_'):  # linear models
        scores = np.abs(model.coef_[0])
    else:
        raise ValueError("Model does not have feature importance attribute.")

    plt.figure(figsize=(10, 6))
    sns.barplot(x=scores, y=feature_names)
    plt.title('Feature Importance')
    plt.show()
|
|
365
|
+
|
|
366
|
+
def plot_shap_values(self, model, X):
|
|
367
|
+
# Plot SHAP values for tree-based models
|
|
368
|
+
if not hasattr(model, 'feature_importances_'):
|
|
369
|
+
raise ValueError("SHAP values are only available for tree-based models like XGBoost.")
|
|
370
|
+
|
|
371
|
+
explainer = shap.TreeExplainer(model)
|
|
372
|
+
shap_values = explainer.shap_values(X)
|
|
373
|
+
shap.summary_plot(shap_values, X, feature_names=X.columns)
|
|
374
|
+
|
|
375
|
+
def run(self, model_type='logit', fill=True, fill_method='mean', test_size=0.2, balance_data=True, cross_val=False,
        n_splits=5, plot_importance=False):
    """Train and evaluate a model on the prepared feature matrix.

    When ``cross_val`` is True, runs k-fold cross-validation (via the
    splitter returned by ``self.prepare_data``) and prints mean +/- std
    for each metric. Otherwise performs a single train/test split and
    delegates evaluation to ``self.evaluate_model`` (classifiers) or
    ``self.evaluate_lasso_model`` (Lasso).

    Parameters
    ----------
    model_type : str
        One of 'logit', 'cox', 'xgboost', 'lasso'.
    fill : bool
        Whether to impute missing values (forwarded to ``prepare_data``).
    fill_method : str
        Imputation strategy (forwarded to ``prepare_data``).
    test_size : float
        Test fraction for the single train/test split; ignored when
        ``cross_val`` is True.
    balance_data : bool
        Whether to balance classes (forwarded to ``prepare_data``).
    cross_val : bool
        If True, use cross-validation instead of a single split.
    n_splits : int
        Number of folds for LassoCV in the cross-validation branch.
    plot_importance : bool
        If True, plot feature importance (and SHAP values for XGBoost).

    Raises
    ------
    ValueError
        If ``model_type`` is not one of the supported values.
    """
    # Main method to run the model training and evaluation
    if cross_val:
        # prepare_data in cross-val mode returns the feature matrix, labels
        # and a stratified fold splitter (presumably a StratifiedKFold —
        # confirm against prepare_data).
        X, y, skf = self.prepare_data(fill=fill, fill_method=fill_method, balance_data=balance_data, cross_val=True)
        if model_type == 'logit':
            model = LogisticRegression(max_iter=1000)
            # NOTE(review): each cross_val_score call refits the model from
            # scratch, so five metrics cost five full CV runs.
            accuracy_scores = cross_val_score(model, X, y, cv=skf, scoring='accuracy')
            precision_scores = cross_val_score(model, X, y, cv=skf, scoring='precision')
            recall_scores = cross_val_score(model, X, y, cv=skf, scoring='recall')
            f1_scores = cross_val_score(model, X, y, cv=skf, scoring='f1')
            auc_scores = cross_val_score(model, X, y, cv=skf, scoring='roc_auc')

            print(f"Cross-Validated Accuracy: {accuracy_scores.mean()} +/- {accuracy_scores.std()}")
            print(f"Cross-Validated Precision: {precision_scores.mean()} +/- {precision_scores.std()}")
            print(f"Cross-Validated Recall: {recall_scores.mean()} +/- {recall_scores.std()}")
            print(f"Cross-Validated F1 Score: {f1_scores.mean()} +/- {f1_scores.std()}")
            print(f"Cross-Validated AUC: {auc_scores.mean()} +/- {auc_scores.std()}")
        elif model_type == 'cox':
            cox_model = CoxPHFitter()
            # NOTE(review): this mutates the caller-visible X in place by
            # adding an 'outcome' column, which then also flows into the
            # concordance loop below.
            X['outcome'] = y
            # Assumes prepare_data left a 'duration' column in X — TODO confirm.
            cv_results = k_fold_cross_validation(cox_model, X, duration_col='duration', event_col='outcome',
                                                 k=n_splits)
            print("Cox Model Cross-Validation Results:")
            print(cv_results)
            print("\nDetails of each fold:")
            print("Cox Model Cross-Validation Log-Likelihood Results:")
            for i, log_likelihood in enumerate(cv_results):
                print(f"Fold {i + 1} Log-Likelihood: {log_likelihood}")
            print(f"Mean Log-Likelihood: {np.mean(cv_results)}")
            # Manual fold loop to collect per-fold scores on held-out data.
            concordance_indices = []
            for train_index, test_index in skf.split(X, y):
                X_train, X_test = X.iloc[train_index], X.iloc[test_index]
                y_train, y_test = y.iloc[train_index], y.iloc[test_index]
                cox_model.fit(X_train, duration_col='duration', event_col='outcome')
                # NOTE(review): lifelines' CoxPHFitter.score defaults to the
                # average partial log-likelihood, not the concordance index
                # (that needs scoring_method='concordance_index') — so the
                # label printed below may be misleading; confirm.
                concordance_index = cox_model.score(X_test)
                concordance_indices.append(concordance_index)

            print(
                f"Cross-Validated Concordance Index: {np.mean(concordance_indices)} +/- {np.std(concordance_indices)}")
        elif model_type == 'xgboost':
            model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
            accuracy_scores = cross_val_score(model, X, y, cv=skf, scoring='accuracy')
            precision_scores = cross_val_score(model, X, y, cv=skf, scoring='precision')
            recall_scores = cross_val_score(model, X, y, cv=skf, scoring='recall')
            f1_scores = cross_val_score(model, X, y, cv=skf, scoring='f1')
            auc_scores = cross_val_score(model, X, y, cv=skf, scoring='roc_auc')

            print(f"Cross-Validated Accuracy: {accuracy_scores.mean()} +/- {accuracy_scores.std()}")
            print(f"Cross-Validated Precision: {precision_scores.mean()} +/- {precision_scores.std()}")
            print(f"Cross-Validated Recall: {recall_scores.mean()} +/- {recall_scores.std()}")
            print(f"Cross-Validated F1 Score: {f1_scores.mean()} +/- {f1_scores.std()}")
            print(f"Cross-Validated AUC: {auc_scores.mean()} +/- {auc_scores.std()}")

            # Plot feature importance for XGBoost model
            # NOTE(review): `model` here has not been fitted (cross_val_score
            # fits clones), so plotting importances from it may fail — confirm.
            if plot_importance:
                self.plot_feature_importance(model, X.columns)

            # Plot SHAP values for XGBoost model
            if plot_importance:
                self.plot_shap_values(model, X)
        elif model_type == 'lasso':
            # Lasso is evaluated as a regressor: MSE and R2 instead of
            # classification metrics. Uses a plain integer cv rather than
            # the stratified splitter above.
            model = LassoCV(cv=n_splits)
            mse_scores = cross_val_score(model, X, y, cv=n_splits, scoring='neg_mean_squared_error')
            r2_scores = cross_val_score(model, X, y, cv=n_splits, scoring='r2')

            print(f"Cross-Validated Mean Squared Error: {-mse_scores.mean()} +/- {mse_scores.std()}")
            print(f"Cross-Validated R2 Score: {r2_scores.mean()} +/- {r2_scores.std()}")
        else:
            raise ValueError(f"Unknown model type: {model_type}")
    else:
        # Single train/test split path.
        X_train, X_test, y_train, y_test = self.prepare_data(fill=fill, fill_method=fill_method,
                                                             test_size=test_size, balance_data=balance_data)

        if model_type == 'logit':
            model = LogisticRegression(max_iter=1000)
            model.fit(X_train, y_train)
            # Probability of the positive class for AUC/ROC evaluation.
            y_prob = model.predict_proba(X_test)[:, 1]
            self.evaluate_model(model, X_test, y_test, y_prob)
            if plot_importance:
                self.plot_feature_importance(model, X_train.columns)
        elif model_type == 'cox':
            cox_model = CoxPHFitter()
            # NOTE(review): in-place mutation of X_train (adds 'outcome');
            # may raise pandas SettingWithCopyWarning depending on how
            # prepare_data sliced the frame.
            X_train['outcome'] = y_train
            cox_model.fit(X_train, duration_col='duration', event_col='outcome')
            cox_pred = cox_model.predict_partial_hazard(X_test)
            y_prob = cox_pred.values.flatten()
            # Binarize hazards at their mean to derive hard predictions —
            # an ad-hoc threshold, not a calibrated cutoff.
            y_pred = (y_prob > y_prob.mean()).astype(int)
            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred, zero_division=0)
            recall = recall_score(y_test, y_pred, zero_division=0)
            f1 = f1_score(y_test, y_pred, zero_division=0)

            print(f"Accuracy: {accuracy}")
            print(f"Precision: {precision}")
            print(f"Recall: {recall}")
            print(f"F1 Score: {f1}")

            # ROC AUC is undefined when the test fold holds a single class.
            if len(set(y_test)) > 1:
                auc = roc_auc_score(y_test, y_prob)
                print(f"AUC: {auc}")

                fpr, tpr, _ = roc_curve(y_test, y_prob)
                plt.figure(figsize=(8, 6))
                plt.plot(fpr, tpr, label=f"Model (AUC = {auc:.2f})")
                plt.xlabel("False Positive Rate")
                plt.ylabel("True Positive Rate")
                plt.title("ROC Curve")
                plt.legend()
                plt.show()
                plt.close()
            else:
                print("Only one class present in y_test. ROC AUC score is not defined.")

            cm = confusion_matrix(y_test, y_pred)
            plt.figure(figsize=(8, 6))
            # vmax floor of 1 keeps the colormap valid for an all-zero matrix.
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=True, annot_kws={"size": 16}, vmin=0, vmax=max(cm.max(), 1))
            plt.xlabel("Predicted")
            plt.ylabel("Actual")
            plt.title("Confusion Matrix")
            plt.show()
            plt.close()
        elif model_type == 'xgboost':
            model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
            model.fit(X_train, y_train)
            y_prob = model.predict_proba(X_test)[:, 1]
            self.evaluate_model(model, X_test, y_test, y_prob)

            # Plot feature importance for XGBoost model
            if plot_importance:
                self.plot_feature_importance(model, X_train.columns)

            # Plot SHAP values for XGBoost model
            if plot_importance:
                self.plot_shap_values(model, X_train)
        elif model_type == 'lasso':
            # NOTE(review): cv is hard-coded to 5 here, ignoring the
            # n_splits parameter (the cross-val branch uses n_splits).
            model = LassoCV(cv=5)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            self.evaluate_lasso_model(model, X_test, y_test, y_pred)
            if plot_importance:
                self.plot_feature_importance(model, X_train.columns)
        else:
            raise ValueError(f"Unknown model type: {model_type}")
|
|
File without changes
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: mtslearn
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: A Python Package for ML using Irregularly Sampled Medical Time Series Data
|
|
5
|
+
Home-page: https://github.com/WalkerZYC/mtslearn
|
|
6
|
+
Author: Walker ZYC
|
|
7
|
+
Author-email: Walker ZYC <zycwalker11@gmail.com>
|
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: Intended Audience :: Science/Research
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
17
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
18
|
+
Requires-Python: >=3.6
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
License-File: LICENSE
|
|
21
|
+
Requires-Dist: numpy>=1.21.2
|
|
22
|
+
Requires-Dist: pandas>=1.5.3
|
|
23
|
+
Requires-Dist: matplotlib>=3.6.0
|
|
24
|
+
Requires-Dist: seaborn>=0.11.2
|
|
25
|
+
Requires-Dist: scikit-learn>=1.0.2
|
|
26
|
+
Requires-Dist: shap>=0.41.0
|
|
27
|
+
Requires-Dist: xgboost>=1.5.0
|
|
28
|
+
Requires-Dist: lifelines>=0.26.4
|
|
29
|
+
Requires-Dist: imbalanced-learn>=0.9.0
|
|
30
|
+
|
|
31
|
+
# **Medical Irregular Time-Series Data Analysis Toolkit**
|
|
32
|
+
## **Overview**
|
|
33
|
+
|
|
34
|
+
The Medical Time-Series Data Analysis Toolkit `mtslearn` is designed to empower healthcare professionals and researchers with tools to analyze and interpret time-series medical data. It offers a comprehensive set of features for extracting key clinical metrics, preparing data for modeling, evaluating predictive models, and visualizing the results. The toolkit is specifically tailored to handle complex datasets, such as irregularly sampled longitudinal patient records, and provides meaningful insights to support informed clinical decision-making.
|
|
35
|
+
|
|
36
|
+
## **Features**
|
|
37
|
+
|
|
38
|
+
- **Feature Extraction**: Automatically extract meaningful features from time-series data, including statistical measures and temporal dynamics.
|
|
39
|
+
- **Data Preparation**: Handle missing data, balance datasets, and split data into training and testing sets with ease.
|
|
40
|
+
- **Model Evaluation**: Supports multiple model types (Logistic Regression, Cox Proportional Hazards, XGBoost, Lasso) and evaluates model performance with key metrics.
|
|
41
|
+
- **Visualization**: Generate visualizations such as boxplots and correlation matrices to help interpret clinical data and model outcomes.
|
|
42
|
+
## **Installation**
|
|
43
|
+
### **Clone the Repository**
|
|
44
|
+
To download and use the toolkit from GitHub, start by cloning the repository:
|
|
45
|
+
```
|
|
46
|
+
git clone https://github.com/WalkerZYC/mtslearn.git
|
|
47
|
+
cd mtslearn
|
|
48
|
+
```
|
|
49
|
+
### **Install Dependencies**
|
|
50
|
+
Next, install the required dependencies:
|
|
51
|
+
```
|
|
52
|
+
pip install -r requirements.txt
|
|
53
|
+
```
|
|
54
|
+
Alternatively, you can manually install the necessary Python packages:
|
|
55
|
+
```
|
|
56
|
+
pip install pandas numpy scikit-learn matplotlib seaborn xgboost lifelines imbalanced-learn
|
|
57
|
+
```
|
|
58
|
+
## **Quickstart**
|
|
59
|
+
### **1. Prepare Your Data**
|
|
60
|
+
Ensure your data is in a pandas DataFrame with the following structure:
|
|
61
|
+
|
|
62
|
+
- `Patient_ID`: Unique identifier for each patient.
|
|
63
|
+
- `Record_Time`: Timestamp of the record.
|
|
64
|
+
- `Outcome`: Outcome variable, indicating the result of treatment or condition.
|
|
65
|
+
- `Clinical Measurements`: Relevant clinical data (e.g., lab values, vital signs).
|
|
66
|
+
|
|
67
|
+
Example:
|
|
68
|
+
```python
|
|
69
|
+
import pandas as pd
|
|
70
|
+
|
|
71
|
+
# Load your data
|
|
72
|
+
df = pd.read_excel('path/to/your/375_patients_example.xlsx')
|
|
73
|
+
|
|
74
|
+
# Sort by patient ID and timestamp
|
|
75
|
+
df.sort_values(by=['PATIENT_ID', 'RE_DATE'], inplace=True)
|
|
76
|
+
```
|
|
77
|
+
### **2. Initialize the Toolkit**
|
|
78
|
+
```python
|
|
79
|
+
import mtslearn.feature_extraction as fe
|
|
80
|
+
|
|
81
|
+
# Initialize the feature extraction and evaluation tool
|
|
82
|
+
fe = fe.FeModEvaluator(
|
|
83
|
+
df=df,
|
|
84
|
+
group_col='PATIENT_ID',
|
|
85
|
+
time_col='RE_DATE',
|
|
86
|
+
outcome_col='outcome',
|
|
87
|
+
features_to_extract={
|
|
88
|
+
'eGFR': ['mean', 'max'],
|
|
89
|
+
'creatinine': ['mean']
|
|
90
|
+
},
|
|
91
|
+
include_duration=True
|
|
92
|
+
)
|
|
93
|
+
```
|
|
94
|
+
### **3. Run the Analysis Pipeline**
|
|
95
|
+
```python
|
|
96
|
+
# Run the pipeline with XGBoost
|
|
97
|
+
fe.run(
|
|
98
|
+
model_type='xgboost',
|
|
99
|
+
fill=True,
|
|
100
|
+
fill_method='mean',
|
|
101
|
+
test_size=0.3,
|
|
102
|
+
balance_data=True
|
|
103
|
+
)
|
|
104
|
+
```
|
|
105
|
+
### **4. Visualize Results**
|
|
106
|
+
```python
|
|
107
|
+
# Boxplot for a specific clinical measurement
|
|
108
|
+
fe.describe_data(plot_type='boxplot', value_col='eGFR')
|
|
109
|
+
|
|
110
|
+
# Correlation matrix between two clinical measurements
|
|
111
|
+
fe.describe_data(plot_type='correlation_matrix', feature1='eGFR', feature2='creatinine')
|
|
112
|
+
```
|
|
113
|
+
## **Documentation**
|
|
114
|
+
For detailed documentation, including advanced usage, customization options, and examples, refer to the [User Guide](./User%20Guide.md).
|
|
115
|
+
## **License**
|
|
116
|
+
This project is licensed under the MIT License. See the [LICENSE](./LICENSE) file for details.
|
|
117
|
+
## **Contact**
|
|
118
|
+
For any questions or issues, please open an issue on GitHub or contact us at zycwalker11@gmail.com.
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
setup.py
|
|
5
|
+
mtslearn/__init__.py
|
|
6
|
+
mtslearn/feature_extraction.py
|
|
7
|
+
mtslearn/utils.py
|
|
8
|
+
mtslearn.egg-info/PKG-INFO
|
|
9
|
+
mtslearn.egg-info/SOURCES.txt
|
|
10
|
+
mtslearn.egg-info/dependency_links.txt
|
|
11
|
+
mtslearn.egg-info/requires.txt
|
|
12
|
+
mtslearn.egg-info/top_level.txt
|
|
13
|
+
tests/test_feature_extraction.py
|
|
14
|
+
tests/test_utils.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
mtslearn
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=42", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "mtslearn"
|
|
7
|
+
version = "0.0.1"
|
|
8
|
+
description = "A Python Package for ML using Irregularly Sampled Medical Time Series Data"
|
|
9
|
+
authors = [{ name = "Walker ZYC", email = "zycwalker11@gmail.com" }]
|
|
10
|
+
readme = "README.md"
|
|
11
|
+
requires-python = ">=3.6"
|
|
12
|
+
|
|
13
|
+
classifiers = [
|
|
14
|
+
"Development Status :: 3 - Alpha",
|
|
15
|
+
"Intended Audience :: Developers",
|
|
16
|
+
"Intended Audience :: Science/Research",
|
|
17
|
+
"License :: OSI Approved :: MIT License",
|
|
18
|
+
"Programming Language :: Python :: 3",
|
|
19
|
+
"Programming Language :: Python :: 3.8",
|
|
20
|
+
"Programming Language :: Python :: 3.9",
|
|
21
|
+
"Programming Language :: Python :: 3.10",
|
|
22
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
23
|
+
"Topic :: Software Development :: Libraries :: Python Modules"
|
|
24
|
+
]
|
|
25
|
+
|
|
26
|
+
dependencies = [
|
|
27
|
+
"numpy>=1.21.2",
|
|
28
|
+
"pandas>=1.5.3",
|
|
29
|
+
"matplotlib>=3.6.0",
|
|
30
|
+
"seaborn>=0.11.2",
|
|
31
|
+
"scikit-learn>=1.0.2",
|
|
32
|
+
"shap>=0.41.0",
|
|
33
|
+
"xgboost>=1.5.0",
|
|
34
|
+
"lifelines>=0.26.4",
|
|
35
|
+
"imbalanced-learn>=0.9.0"
|
|
36
|
+
]
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
[tool.setuptools.packages.find]
|
|
40
|
+
where = ["."]
|
|
41
|
+
include = ["mtslearn"]
|
mtslearn-0.0.1/setup.cfg
ADDED
mtslearn-0.0.1/setup.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
from setuptools import setup, find_packages

setup(
    name='mtslearn',
    version='0.0.1',
    author='Walker ZYC',
    author_email='zycwalker11@gmail.com',
    description='A Python Package for ML using Irregularly Sampled Medical Time Series Data',
    long_description=open('README.md', encoding='utf-8').read(),
    long_description_content_type='text/markdown',
    url='https://github.com/WalkerZYC/mtslearn',
    # Discover packages from the project root so this matches the
    # [tool.setuptools.packages.find] table in pyproject.toml
    # (where = ["."], include = ["mtslearn"]). The previous value,
    # where='mtslearn-dev/mtslearn', pointed at a directory that does not
    # exist in the sdist (SOURCES.txt lists mtslearn/ at the root), so no
    # packages were found.
    packages=find_packages(where='.', include=['mtslearn', 'mtslearn.*']),
    install_requires=[
        'numpy>=1.21.2',
        'pandas>=1.5.3',
        'matplotlib>=3.6.0',
        'seaborn>=0.11.2',
        'scikit-learn>=1.0.2',
        'shap>=0.41.0',
        'xgboost>=1.5.0',
        'lifelines>=0.26.4',
        'imbalanced-learn>=0.9.0',
    ],
    classifiers=[
        'Development Status :: 3 - Alpha',
        'Intended Audience :: Developers',
        'Intended Audience :: Science/Research',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.8',
        'Programming Language :: Python :: 3.9',
        'Programming Language :: Python :: 3.10',
        'Topic :: Scientific/Engineering :: Artificial Intelligence',
        'Topic :: Software Development :: Libraries :: Python Modules'
    ],
    python_requires='>=3.6',
)
|
|
File without changes
|
|
File without changes
|