adamops 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- adamops/__init__.py +40 -0
- adamops/cli.py +163 -0
- adamops/data/__init__.py +24 -0
- adamops/data/feature_engineering.py +284 -0
- adamops/data/loaders.py +922 -0
- adamops/data/preprocessors.py +227 -0
- adamops/data/splitters.py +218 -0
- adamops/data/validators.py +148 -0
- adamops/deployment/__init__.py +21 -0
- adamops/deployment/api.py +237 -0
- adamops/deployment/cloud.py +191 -0
- adamops/deployment/containerize.py +262 -0
- adamops/deployment/exporters.py +148 -0
- adamops/evaluation/__init__.py +24 -0
- adamops/evaluation/comparison.py +133 -0
- adamops/evaluation/explainability.py +143 -0
- adamops/evaluation/metrics.py +233 -0
- adamops/evaluation/reports.py +165 -0
- adamops/evaluation/visualization.py +238 -0
- adamops/models/__init__.py +21 -0
- adamops/models/automl.py +277 -0
- adamops/models/ensembles.py +228 -0
- adamops/models/modelops.py +308 -0
- adamops/models/registry.py +250 -0
- adamops/monitoring/__init__.py +21 -0
- adamops/monitoring/alerts.py +200 -0
- adamops/monitoring/dashboard.py +117 -0
- adamops/monitoring/drift.py +212 -0
- adamops/monitoring/performance.py +195 -0
- adamops/pipelines/__init__.py +15 -0
- adamops/pipelines/orchestrators.py +183 -0
- adamops/pipelines/workflows.py +212 -0
- adamops/utils/__init__.py +18 -0
- adamops/utils/config.py +457 -0
- adamops/utils/helpers.py +663 -0
- adamops/utils/logging.py +412 -0
- adamops-0.1.0.dist-info/METADATA +310 -0
- adamops-0.1.0.dist-info/RECORD +42 -0
- adamops-0.1.0.dist-info/WHEEL +5 -0
- adamops-0.1.0.dist-info/entry_points.txt +2 -0
- adamops-0.1.0.dist-info/licenses/LICENSE +21 -0
- adamops-0.1.0.dist-info/top_level.txt +1 -0

adamops/data/preprocessors.py
@@ -0,0 +1,227 @@
+"""
+AdamOps Data Preprocessors Module
+
+Provides data cleaning capabilities: missing values, outliers, duplicates, type conversion.
+"""
+
+from typing import Dict, List, Optional, Union
+import numpy as np
+import pandas as pd
+from sklearn.impute import SimpleImputer, KNNImputer
+from sklearn.experimental import enable_iterative_imputer
+from sklearn.impute import IterativeImputer
+from sklearn.ensemble import IsolationForest
+from adamops.utils.logging import get_logger
+
+logger = get_logger(__name__)
+
+
+# Missing Value Handling
+def handle_missing(
+    df: pd.DataFrame, strategy: str = "mean", columns: Optional[List[str]] = None,
+    fill_value: Optional[any] = None, n_neighbors: int = 5
+) -> pd.DataFrame:
+    """
+    Handle missing values.
+
+    Args:
+        df: DataFrame to process.
+        strategy: 'drop', 'mean', 'median', 'mode', 'constant', 'ffill', 'bfill', 'knn', 'iterative'
+        columns: Columns to process (None for all).
+        fill_value: Value for 'constant' strategy.
+        n_neighbors: Neighbors for KNN.
+
+    Returns:
+        Processed DataFrame.
+    """
+    df = df.copy()
+    cols = columns or df.columns.tolist()
+    logger.info(f"Handling missing values with strategy: {strategy}")
+
+    if strategy == "drop":
+        return df.dropna(subset=cols)
+    elif strategy == "ffill":
+        df[cols] = df[cols].ffill()
+    elif strategy == "bfill":
+        df[cols] = df[cols].bfill()
+    elif strategy == "constant":
+        df[cols] = df[cols].fillna(fill_value)
+    elif strategy in ["mean", "median", "most_frequent"]:
+        strat = "most_frequent" if strategy == "mode" else strategy
+        num_cols = [c for c in cols if pd.api.types.is_numeric_dtype(df[c])]
+        if num_cols:
+            imputer = SimpleImputer(strategy=strat)
+            df[num_cols] = imputer.fit_transform(df[num_cols])
+    elif strategy == "mode":
+        for col in cols:
+            if df[col].isna().any():
+                mode_val = df[col].mode().iloc[0] if not df[col].mode().empty else None
+                df[col] = df[col].fillna(mode_val)
+    elif strategy == "knn":
+        num_cols = [c for c in cols if pd.api.types.is_numeric_dtype(df[c])]
+        if num_cols:
+            imputer = KNNImputer(n_neighbors=n_neighbors)
+            df[num_cols] = imputer.fit_transform(df[num_cols])
+    elif strategy == "iterative":
+        num_cols = [c for c in cols if pd.api.types.is_numeric_dtype(df[c])]
+        if num_cols:
+            imputer = IterativeImputer(random_state=42)
+            df[num_cols] = imputer.fit_transform(df[num_cols])
+
+    return df
+
+
+# Outlier Handling
+def handle_outliers(
+    df: pd.DataFrame, method: str = "iqr", columns: Optional[List[str]] = None,
+    threshold: float = 1.5, action: str = "clip", contamination: float = 0.1
+) -> pd.DataFrame:
+    """
+    Handle outliers.
+
+    Args:
+        df: DataFrame to process.
+        method: 'iqr', 'zscore', 'isolation_forest'
+        columns: Columns to process (None for numeric).
+        threshold: IQR multiplier or Z-score threshold.
+        action: 'clip', 'drop', 'nan'
+        contamination: For isolation forest.
+
+    Returns:
+        Processed DataFrame.
+    """
+    df = df.copy()
+    num_cols = columns or df.select_dtypes(include=[np.number]).columns.tolist()
+    logger.info(f"Handling outliers with method: {method}, action: {action}")
+
+    if method == "iqr":
+        for col in num_cols:
+            Q1, Q3 = df[col].quantile(0.25), df[col].quantile(0.75)
+            IQR = Q3 - Q1
+            lower, upper = Q1 - threshold * IQR, Q3 + threshold * IQR
+            mask = (df[col] < lower) | (df[col] > upper)
+            if action == "clip":
+                df[col] = df[col].clip(lower, upper)
+            elif action == "drop":
+                df = df[~mask]
+            elif action == "nan":
+                df.loc[mask, col] = np.nan
+
+    elif method == "zscore":
+        for col in num_cols:
+            z = np.abs((df[col] - df[col].mean()) / df[col].std())
+            mask = z > threshold
+            if action == "clip":
+                mean, std = df[col].mean(), df[col].std()
+                lower, upper = mean - threshold * std, mean + threshold * std
+                df[col] = df[col].clip(lower, upper)
+            elif action == "drop":
+                df = df[~mask]
+            elif action == "nan":
+                df.loc[mask, col] = np.nan
+
+    elif method == "isolation_forest":
+        iso = IsolationForest(contamination=contamination, random_state=42)
+        preds = iso.fit_predict(df[num_cols])
+        mask = preds == -1
+        if action == "drop":
+            df = df[~mask]
+        elif action == "nan":
+            df.loc[mask, num_cols] = np.nan
+
+    return df
+
+
+# Duplicate Handling
+def handle_duplicates(
+    df: pd.DataFrame, subset: Optional[List[str]] = None, keep: str = "first"
+) -> pd.DataFrame:
+    """Remove duplicate rows."""
+    before = len(df)
+    df = df.drop_duplicates(subset=subset, keep=keep)
+    logger.info(f"Removed {before - len(df)} duplicates")
+    return df
+
+
+# Type Conversion
+def convert_types(
+    df: pd.DataFrame, type_mapping: Optional[Dict[str, str]] = None,
+    auto_convert: bool = True, datetime_columns: Optional[List[str]] = None
+) -> pd.DataFrame:
+    """
+    Convert column types.
+
+    Args:
+        df: DataFrame to process.
+        type_mapping: {column: target_type}
+        auto_convert: Auto-detect and convert types.
+        datetime_columns: Columns to parse as datetime.
+    """
+    df = df.copy()
+
+    if type_mapping:
+        for col, dtype in type_mapping.items():
+            if col in df.columns:
+                try:
+                    df[col] = df[col].astype(dtype)
+                except (ValueError, TypeError) as e:
+                    logger.warning(f"Could not convert {col} to {dtype}: {e}")
+
+    if datetime_columns:
+        for col in datetime_columns:
+            if col in df.columns:
+                df[col] = pd.to_datetime(df[col], errors='coerce')
+
+    if auto_convert:
+        for col in df.columns:
+            if df[col].dtype == 'object':
+                try:
+                    df[col] = pd.to_numeric(df[col], errors='ignore')
+                except:
+                    pass
+
+    return df
+
+
+# Text Cleaning
+def clean_text(
+    df: pd.DataFrame, columns: Optional[List[str]] = None,
+    lowercase: bool = True, strip: bool = True, remove_special: bool = False
+) -> pd.DataFrame:
+    """Clean text columns."""
+    df = df.copy()
+    str_cols = columns or df.select_dtypes(include=['object']).columns.tolist()
+
+    for col in str_cols:
+        if strip:
+            df[col] = df[col].str.strip()
+        if lowercase:
+            df[col] = df[col].str.lower()
+        if remove_special:
+            df[col] = df[col].str.replace(r'[^\w\s]', '', regex=True)
+
+    return df
+
+
+# Full Pipeline
+def preprocess(
+    df: pd.DataFrame, missing_strategy: str = "mean", outlier_method: Optional[str] = None,
+    remove_duplicates: bool = True, convert_types_auto: bool = True
+) -> pd.DataFrame:
+    """Full preprocessing pipeline."""
+    logger.info("Starting preprocessing pipeline")
+
+    if remove_duplicates:
+        df = handle_duplicates(df)
+
+    if missing_strategy:
+        df = handle_missing(df, strategy=missing_strategy)
+
+    if outlier_method:
+        df = handle_outliers(df, method=outlier_method)
+
+    if convert_types_auto:
+        df = convert_types(df, auto_convert=True)
+
+    logger.info(f"Preprocessing complete. Shape: {df.shape}")
+    return df
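
A minimal usage sketch for the preprocessing helpers above; the toy DataFrame, its column names, and the chained calls are illustrative and not part of the wheel:

    # Illustrative only: exercises handle_missing / handle_outliers / clean_text /
    # preprocess exactly as defined in adamops/data/preprocessors.py above.
    import numpy as np
    import pandas as pd
    from adamops.data.preprocessors import (
        handle_missing, handle_outliers, clean_text, preprocess
    )

    raw = pd.DataFrame({
        "age": [25, np.nan, 40, 120, 33],                      # one missing, one extreme value
        "income": [50_000, 52_000, np.nan, 51_000, 1_000_000],
        "city": [" NYC", "nyc ", "LA", "LA", "LA"],
    })

    step1 = handle_missing(raw, strategy="median")             # impute numeric NaNs
    step2 = handle_outliers(step1, method="iqr", action="clip")
    step3 = clean_text(step2, columns=["city"])                # strip + lowercase
    cleaned = preprocess(raw, missing_strategy="mean", outlier_method="zscore")
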

adamops/data/splitters.py
@@ -0,0 +1,218 @@
+"""
+AdamOps Data Splitters Module
+
+Provides data splitting: train/test, train/val/test, time-series, K-Fold, stratified.
+"""
+
+from typing import Iterator, List, Optional, Tuple, Union
+import numpy as np
+import pandas as pd
+from sklearn.model_selection import (
+    train_test_split, KFold, StratifiedKFold, TimeSeriesSplit, GroupKFold
+)
+from adamops.utils.logging import get_logger
+
+logger = get_logger(__name__)
+
+
+def split_train_test(
+    X: Union[pd.DataFrame, np.ndarray], y: Optional[Union[pd.Series, np.ndarray]] = None,
+    test_size: float = 0.2, random_state: int = 42, stratify: bool = False, shuffle: bool = True
+) -> Tuple:
+    """
+    Split data into train and test sets.
+
+    Args:
+        X: Features.
+        y: Target (optional).
+        test_size: Test set proportion.
+        random_state: Random seed.
+        stratify: Stratify by target.
+        shuffle: Shuffle before splitting.
+
+    Returns:
+        (X_train, X_test) or (X_train, X_test, y_train, y_test)
+    """
+    stratify_col = y if stratify and y is not None else None
+
+    if y is not None:
+        X_train, X_test, y_train, y_test = train_test_split(
+            X, y, test_size=test_size, random_state=random_state,
+            stratify=stratify_col, shuffle=shuffle
+        )
+        logger.info(f"Split: train={len(X_train)}, test={len(X_test)}")
+        return X_train, X_test, y_train, y_test
+    else:
+        X_train, X_test = train_test_split(
+            X, test_size=test_size, random_state=random_state, shuffle=shuffle
+        )
+        logger.info(f"Split: train={len(X_train)}, test={len(X_test)}")
+        return X_train, X_test
+
+
+def split_train_val_test(
+    X: Union[pd.DataFrame, np.ndarray], y: Optional[Union[pd.Series, np.ndarray]] = None,
+    train_size: float = 0.7, val_size: float = 0.15, test_size: float = 0.15,
+    random_state: int = 42, stratify: bool = False
+) -> Tuple:
+    """
+    Split data into train, validation, and test sets.
+
+    Returns:
+        (X_train, X_val, X_test) or (X_train, X_val, X_test, y_train, y_val, y_test)
+    """
+    # Normalize sizes
+    total = train_size + val_size + test_size
+    train_size, val_size, test_size = train_size/total, val_size/total, test_size/total
+
+    stratify_col = y if stratify and y is not None else None
+
+    if y is not None:
+        # First split: train+val vs test
+        X_temp, X_test, y_temp, y_test = train_test_split(
+            X, y, test_size=test_size, random_state=random_state,
+            stratify=stratify_col
+        )
+        # Second split: train vs val
+        val_ratio = val_size / (train_size + val_size)
+        stratify_temp = y_temp if stratify else None
+        X_train, X_val, y_train, y_val = train_test_split(
+            X_temp, y_temp, test_size=val_ratio, random_state=random_state,
+            stratify=stratify_temp
+        )
+        logger.info(f"Split: train={len(X_train)}, val={len(X_val)}, test={len(X_test)}")
+        return X_train, X_val, X_test, y_train, y_val, y_test
+    else:
+        X_temp, X_test = train_test_split(X, test_size=test_size, random_state=random_state)
+        val_ratio = val_size / (train_size + val_size)
+        X_train, X_val = train_test_split(X_temp, test_size=val_ratio, random_state=random_state)
+        return X_train, X_val, X_test
+
+
+def split_timeseries(
+    X: Union[pd.DataFrame, np.ndarray], y: Optional[Union[pd.Series, np.ndarray]] = None,
+    n_splits: int = 5, test_size: Optional[int] = None, gap: int = 0
+) -> Iterator[Tuple]:
+    """
+    Time series split for temporal data.
+
+    Args:
+        X: Features.
+        y: Target.
+        n_splits: Number of splits.
+        test_size: Test set size per split.
+        gap: Gap between train and test.
+
+    Yields:
+        (train_idx, test_idx) tuples.
+    """
+    tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size, gap=gap)
+    logger.info(f"Time series split with {n_splits} folds")
+
+    for train_idx, test_idx in tscv.split(X):
+        yield train_idx, test_idx
+
+
+def split_kfold(
+    X: Union[pd.DataFrame, np.ndarray], y: Optional[Union[pd.Series, np.ndarray]] = None,
+    n_splits: int = 5, shuffle: bool = True, random_state: int = 42
+) -> Iterator[Tuple]:
+    """
+    K-Fold cross-validation split.
+
+    Yields:
+        (train_idx, test_idx) tuples.
+    """
+    kf = KFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)
+    logger.info(f"K-Fold split with {n_splits} folds")
+
+    for train_idx, test_idx in kf.split(X):
+        yield train_idx, test_idx
+
+
+def split_stratified_kfold(
+    X: Union[pd.DataFrame, np.ndarray], y: Union[pd.Series, np.ndarray],
+    n_splits: int = 5, shuffle: bool = True, random_state: int = 42
+) -> Iterator[Tuple]:
+    """
+    Stratified K-Fold cross-validation split.
+
+    Preserves class distribution in each fold.
+
+    Yields:
+        (train_idx, test_idx) tuples.
+    """
+    skf = StratifiedKFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)
+    logger.info(f"Stratified K-Fold split with {n_splits} folds")
+
+    for train_idx, test_idx in skf.split(X, y):
+        yield train_idx, test_idx
+
+
+def split_group_kfold(
+    X: Union[pd.DataFrame, np.ndarray], y: Union[pd.Series, np.ndarray],
+    groups: Union[pd.Series, np.ndarray], n_splits: int = 5
+) -> Iterator[Tuple]:
+    """
+    Group K-Fold split. Ensures groups are not split across train/test.
+
+    Yields:
+        (train_idx, test_idx) tuples.
+    """
+    gkf = GroupKFold(n_splits=n_splits)
+    logger.info(f"Group K-Fold split with {n_splits} folds")
+
+    for train_idx, test_idx in gkf.split(X, y, groups):
+        yield train_idx, test_idx
+
+
+def get_fold_data(
+    X: Union[pd.DataFrame, np.ndarray], y: Optional[Union[pd.Series, np.ndarray]],
+    train_idx: np.ndarray, test_idx: np.ndarray
+) -> Tuple:
+    """Get train/test data for a fold."""
+    if isinstance(X, pd.DataFrame):
+        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
+    else:
+        X_train, X_test = X[train_idx], X[test_idx]
+
+    if y is not None:
+        if isinstance(y, pd.Series):
+            y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
+        else:
+            y_train, y_test = y[train_idx], y[test_idx]
+        return X_train, X_test, y_train, y_test
+
+    return X_train, X_test
+
+
+def create_cv_splits(
+    X: Union[pd.DataFrame, np.ndarray], y: Optional[Union[pd.Series, np.ndarray]] = None,
+    method: str = "kfold", n_splits: int = 5, **kwargs
+) -> List[Tuple]:
+    """
+    Create cross-validation splits.
+
+    Args:
+        X: Features.
+        y: Target.
+        method: 'kfold', 'stratified', 'timeseries', 'group'
+        n_splits: Number of folds.
+
+    Returns:
+        List of (train_idx, test_idx) tuples.
+    """
+    if method == "kfold":
+        return list(split_kfold(X, y, n_splits, **kwargs))
+    elif method == "stratified":
+        if y is None:
+            raise ValueError("y is required for stratified split")
+        return list(split_stratified_kfold(X, y, n_splits, **kwargs))
+    elif method == "timeseries":
+        return list(split_timeseries(X, y, n_splits, **kwargs))
+    elif method == "group":
+        if "groups" not in kwargs:
+            raise ValueError("groups is required for group split")
+        return list(split_group_kfold(X, y, kwargs["groups"], n_splits))
+    else:
+        raise ValueError(f"Unknown split method: {method}")
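
A minimal usage sketch for the splitting helpers above, assuming a small synthetic feature matrix and binary target (both illustrative, not part of the wheel):

    # Illustrative only: uses split_train_test / create_cv_splits / get_fold_data
    # as defined in adamops/data/splitters.py above.
    import numpy as np
    import pandas as pd
    from adamops.data.splitters import split_train_test, create_cv_splits, get_fold_data

    X = pd.DataFrame({"f1": np.arange(100), "f2": np.linspace(0.0, 1.0, 100)})
    y = pd.Series([0, 1] * 50)                     # balanced binary target

    # 80/20 hold-out, stratified on the target
    X_train, X_test, y_train, y_test = split_train_test(X, y, test_size=0.2, stratify=True)

    # Five stratified CV folds as (train_idx, test_idx) pairs; materialize the first fold
    folds = create_cv_splits(X, y, method="stratified", n_splits=5)
    train_idx, test_idx = folds[0]
    X_tr, X_te, y_tr, y_te = get_fold_data(X, y, train_idx, test_idx)
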

adamops/data/validators.py
@@ -0,0 +1,148 @@
+"""
+AdamOps Data Validators Module
+
+Provides data validation: type validation, missing value checks,
+duplicate detection, shape validation, and statistical checks.
+"""
+
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional, Tuple
+from datetime import datetime
+import numpy as np
+import pandas as pd
+from adamops.utils.logging import get_logger
+
+logger = get_logger(__name__)
+
+
+@dataclass
+class ValidationIssue:
+    """Represents a validation issue."""
+    severity: str  # 'error', 'warning', 'info'
+    category: str
+    column: Optional[str]
+    message: str
+    details: Optional[Dict] = None
+
+
+@dataclass
+class ColumnStats:
+    """Statistics for a column."""
+    name: str
+    dtype: str
+    count: int
+    missing_count: int
+    missing_pct: float
+    unique_count: int
+    unique_pct: float
+    mean: Optional[float] = None
+    std: Optional[float] = None
+    min: Optional[float] = None
+    max: Optional[float] = None
+
+
+@dataclass
+class ValidationReport:
+    """Complete validation report."""
+    timestamp: str
+    shape: Tuple[int, int]
+    memory_usage: float
+    issues: List[ValidationIssue] = field(default_factory=list)
+    column_stats: Dict[str, ColumnStats] = field(default_factory=dict)
+    duplicate_rows: int = 0
+    passed: bool = True
+
+    def summary(self) -> str:
+        """Generate text summary."""
+        lines = [
+            "=" * 50, "VALIDATION REPORT", "=" * 50,
+            f"Shape: {self.shape[0]} rows x {self.shape[1]} columns",
+            f"Memory: {self.memory_usage:.2f} MB",
+            f"Duplicates: {self.duplicate_rows}",
+            f"Status: {'PASSED' if self.passed else 'FAILED'}",
+            f"Issues: {len(self.issues)}", "=" * 50
+        ]
+        for issue in self.issues:
+            col = f"[{issue.column}] " if issue.column else ""
+            lines.append(f"[{issue.severity.upper()}] {col}{issue.message}")
+        return "\n".join(lines)
+
+
+class DataValidator:
+    """Data validator for DataFrames."""
+
+    def __init__(self, missing_threshold: float = 0.5, unique_threshold: float = 0.95):
+        self.missing_threshold = missing_threshold
+        self.unique_threshold = unique_threshold
+
+    def validate(self, df: pd.DataFrame, schema: Optional[Dict] = None,
+                 required_columns: Optional[List[str]] = None) -> ValidationReport:
+        """Validate a DataFrame."""
+        report = ValidationReport(
+            timestamp=datetime.now().isoformat(),
+            shape=df.shape,
+            memory_usage=df.memory_usage(deep=True).sum() / 1024**2,
+        )
+
+        # Check required columns
+        if required_columns:
+            missing = set(required_columns) - set(df.columns)
+            for col in missing:
+                report.issues.append(ValidationIssue("error", "schema", col, f"Missing: {col}"))
+
+        # Check duplicates
+        dups = df.duplicated().sum()
+        report.duplicate_rows = dups
+        if dups > 0:
+            report.issues.append(ValidationIssue("warning", "duplicate", None, f"{dups} duplicates"))
+
+        # Column stats
+        for col in df.columns:
+            series = df[col]
+            missing = series.isna().sum()
+            stats = ColumnStats(
+                name=col, dtype=str(series.dtype), count=len(series),
+                missing_count=missing, missing_pct=100*missing/len(series),
+                unique_count=series.nunique(), unique_pct=100*series.nunique()/len(series),
+            )
+            if pd.api.types.is_numeric_dtype(series):
+                stats.mean, stats.std = series.mean(), series.std()
+                stats.min, stats.max = series.min(), series.max()
+            report.column_stats[col] = stats
+
+            if stats.missing_pct > self.missing_threshold * 100:
+                report.issues.append(ValidationIssue("warning", "missing", col,
+                    f"High missing: {stats.missing_pct:.1f}%"))
+
+        report.passed = not any(i.severity == "error" for i in report.issues)
+        return report
+
+
+def validate(df: pd.DataFrame, **kwargs) -> ValidationReport:
+    """Validate a DataFrame."""
+    return DataValidator().validate(df, **kwargs)
+
+def check_missing(df: pd.DataFrame) -> Dict[str, Dict]:
+    """Check missing values."""
+    return {col: {"count": int(df[col].isna().sum()), "pct": 100*df[col].isna().mean()}
+            for col in df.columns if df[col].isna().any()}
+
+def check_duplicates(df: pd.DataFrame, subset: Optional[List[str]] = None) -> pd.DataFrame:
+    """Get duplicate rows."""
+    return df[df.duplicated(subset=subset, keep=False)]
+
+def check_types(df: pd.DataFrame) -> Dict[str, str]:
+    """Get column types."""
+    return {col: str(dtype) for col, dtype in df.dtypes.items()}
+
+def describe_data(df: pd.DataFrame) -> pd.DataFrame:
+    """Generate data description."""
+    stats = []
+    for col in df.columns:
+        s = df[col]
+        row = {"column": col, "dtype": str(s.dtype), "missing": s.isna().sum(),
+               "unique": s.nunique()}
+        if pd.api.types.is_numeric_dtype(s):
+            row.update({"mean": s.mean(), "std": s.std(), "min": s.min(), "max": s.max()})
+        stats.append(row)
+    return pd.DataFrame(stats)
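
A minimal usage sketch for the validation helpers above; the toy frame and the expected outputs noted in the comments are illustrative, not part of the wheel:

    # Illustrative only: runs validate / check_missing as defined in
    # adamops/data/validators.py above.
    import numpy as np
    import pandas as pd
    from adamops.data.validators import validate, check_missing

    df = pd.DataFrame({
        "id": [1, 2, 2, 4],
        "score": [0.5, np.nan, np.nan, 0.9],
    })

    report = validate(df, required_columns=["id", "score", "label"])
    print(report.summary())     # flags the absent 'label' column and the duplicate row
    print(report.passed)        # False: a required column is missing (an 'error' issue)
    print(check_missing(df))    # {'score': {'count': 2, 'pct': 50.0}}
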

adamops/deployment/__init__.py
@@ -0,0 +1,21 @@
+"""
+AdamOps Deployment Module
+
+Provides model deployment capabilities:
+- exporters: Export models to ONNX, PMML, TFLite, CoreML
+- api: Create FastAPI/Flask/Streamlit APIs
+- containerize: Docker and Kubernetes deployment
+- cloud: AWS, GCP, Azure deployment
+"""
+
+from adamops.deployment import exporters
+from adamops.deployment import api
+from adamops.deployment import containerize
+from adamops.deployment import cloud
+
+__all__ = [
+    "exporters",
+    "api",
+    "containerize",
+    "cloud",
+]