createsonline 0.1.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- createsonline/__init__.py +46 -0
- createsonline/admin/__init__.py +7 -0
- createsonline/admin/content.py +526 -0
- createsonline/admin/crud.py +805 -0
- createsonline/admin/field_builder.py +559 -0
- createsonline/admin/integration.py +482 -0
- createsonline/admin/interface.py +2562 -0
- createsonline/admin/model_creator.py +513 -0
- createsonline/admin/model_manager.py +388 -0
- createsonline/admin/modern_dashboard.py +498 -0
- createsonline/admin/permissions.py +264 -0
- createsonline/admin/user_forms.py +594 -0
- createsonline/ai/__init__.py +202 -0
- createsonline/ai/fields.py +1226 -0
- createsonline/ai/orm.py +325 -0
- createsonline/ai/services.py +1244 -0
- createsonline/app.py +506 -0
- createsonline/auth/__init__.py +8 -0
- createsonline/auth/management.py +228 -0
- createsonline/auth/models.py +552 -0
- createsonline/cli/__init__.py +5 -0
- createsonline/cli/commands/__init__.py +122 -0
- createsonline/cli/commands/database.py +416 -0
- createsonline/cli/commands/info.py +173 -0
- createsonline/cli/commands/initdb.py +218 -0
- createsonline/cli/commands/project.py +545 -0
- createsonline/cli/commands/serve.py +173 -0
- createsonline/cli/commands/shell.py +93 -0
- createsonline/cli/commands/users.py +148 -0
- createsonline/cli/main.py +2041 -0
- createsonline/cli/manage.py +274 -0
- createsonline/config/__init__.py +9 -0
- createsonline/config/app.py +2577 -0
- createsonline/config/database.py +179 -0
- createsonline/config/docs.py +384 -0
- createsonline/config/errors.py +160 -0
- createsonline/config/orm.py +43 -0
- createsonline/config/request.py +93 -0
- createsonline/config/settings.py +176 -0
- createsonline/data/__init__.py +23 -0
- createsonline/data/dataframe.py +925 -0
- createsonline/data/io.py +453 -0
- createsonline/data/series.py +557 -0
- createsonline/database/__init__.py +60 -0
- createsonline/database/abstraction.py +440 -0
- createsonline/database/assistant.py +585 -0
- createsonline/database/fields.py +442 -0
- createsonline/database/migrations.py +132 -0
- createsonline/database/models.py +604 -0
- createsonline/database.py +438 -0
- createsonline/http/__init__.py +28 -0
- createsonline/http/client.py +535 -0
- createsonline/ml/__init__.py +55 -0
- createsonline/ml/classification.py +552 -0
- createsonline/ml/clustering.py +680 -0
- createsonline/ml/metrics.py +542 -0
- createsonline/ml/neural.py +560 -0
- createsonline/ml/preprocessing.py +784 -0
- createsonline/ml/regression.py +501 -0
- createsonline/performance/__init__.py +19 -0
- createsonline/performance/cache.py +444 -0
- createsonline/performance/compression.py +335 -0
- createsonline/performance/core.py +419 -0
- createsonline/project_init.py +789 -0
- createsonline/routing.py +528 -0
- createsonline/security/__init__.py +34 -0
- createsonline/security/core.py +811 -0
- createsonline/security/encryption.py +349 -0
- createsonline/server.py +295 -0
- createsonline/static/css/admin.css +263 -0
- createsonline/static/css/common.css +358 -0
- createsonline/static/css/dashboard.css +89 -0
- createsonline/static/favicon.ico +0 -0
- createsonline/static/icons/icon-128x128.png +0 -0
- createsonline/static/icons/icon-128x128.webp +0 -0
- createsonline/static/icons/icon-16x16.png +0 -0
- createsonline/static/icons/icon-16x16.webp +0 -0
- createsonline/static/icons/icon-180x180.png +0 -0
- createsonline/static/icons/icon-180x180.webp +0 -0
- createsonline/static/icons/icon-192x192.png +0 -0
- createsonline/static/icons/icon-192x192.webp +0 -0
- createsonline/static/icons/icon-256x256.png +0 -0
- createsonline/static/icons/icon-256x256.webp +0 -0
- createsonline/static/icons/icon-32x32.png +0 -0
- createsonline/static/icons/icon-32x32.webp +0 -0
- createsonline/static/icons/icon-384x384.png +0 -0
- createsonline/static/icons/icon-384x384.webp +0 -0
- createsonline/static/icons/icon-48x48.png +0 -0
- createsonline/static/icons/icon-48x48.webp +0 -0
- createsonline/static/icons/icon-512x512.png +0 -0
- createsonline/static/icons/icon-512x512.webp +0 -0
- createsonline/static/icons/icon-64x64.png +0 -0
- createsonline/static/icons/icon-64x64.webp +0 -0
- createsonline/static/image/android-chrome-192x192.png +0 -0
- createsonline/static/image/android-chrome-512x512.png +0 -0
- createsonline/static/image/apple-touch-icon.png +0 -0
- createsonline/static/image/favicon-16x16.png +0 -0
- createsonline/static/image/favicon-32x32.png +0 -0
- createsonline/static/image/favicon.ico +0 -0
- createsonline/static/image/favicon.svg +17 -0
- createsonline/static/image/icon-128x128.png +0 -0
- createsonline/static/image/icon-128x128.webp +0 -0
- createsonline/static/image/icon-16x16.png +0 -0
- createsonline/static/image/icon-16x16.webp +0 -0
- createsonline/static/image/icon-180x180.png +0 -0
- createsonline/static/image/icon-180x180.webp +0 -0
- createsonline/static/image/icon-192x192.png +0 -0
- createsonline/static/image/icon-192x192.webp +0 -0
- createsonline/static/image/icon-256x256.png +0 -0
- createsonline/static/image/icon-256x256.webp +0 -0
- createsonline/static/image/icon-32x32.png +0 -0
- createsonline/static/image/icon-32x32.webp +0 -0
- createsonline/static/image/icon-384x384.png +0 -0
- createsonline/static/image/icon-384x384.webp +0 -0
- createsonline/static/image/icon-48x48.png +0 -0
- createsonline/static/image/icon-48x48.webp +0 -0
- createsonline/static/image/icon-512x512.png +0 -0
- createsonline/static/image/icon-512x512.webp +0 -0
- createsonline/static/image/icon-64x64.png +0 -0
- createsonline/static/image/icon-64x64.webp +0 -0
- createsonline/static/image/logo-header-h100.png +0 -0
- createsonline/static/image/logo-header-h100.webp +0 -0
- createsonline/static/image/logo-header-h200@2x.png +0 -0
- createsonline/static/image/logo-header-h200@2x.webp +0 -0
- createsonline/static/image/logo.png +0 -0
- createsonline/static/js/admin.js +274 -0
- createsonline/static/site.webmanifest +35 -0
- createsonline/static/templates/admin/base.html +87 -0
- createsonline/static/templates/admin/dashboard.html +217 -0
- createsonline/static/templates/admin/model_form.html +270 -0
- createsonline/static/templates/admin/model_list.html +202 -0
- createsonline/static/test_script.js +15 -0
- createsonline/static/test_styles.css +59 -0
- createsonline/static_files.py +365 -0
- createsonline/templates/404.html +100 -0
- createsonline/templates/admin_login.html +169 -0
- createsonline/templates/base.html +102 -0
- createsonline/templates/index.html +151 -0
- createsonline/templates.py +205 -0
- createsonline/testing.py +322 -0
- createsonline/utils.py +448 -0
- createsonline/validation/__init__.py +49 -0
- createsonline/validation/fields.py +598 -0
- createsonline/validation/models.py +504 -0
- createsonline/validation/validators.py +561 -0
- createsonline/views.py +184 -0
- createsonline-0.1.26.dist-info/METADATA +46 -0
- createsonline-0.1.26.dist-info/RECORD +152 -0
- createsonline-0.1.26.dist-info/WHEEL +5 -0
- createsonline-0.1.26.dist-info/entry_points.txt +2 -0
- createsonline-0.1.26.dist-info/licenses/LICENSE +21 -0
- createsonline-0.1.26.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,784 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CREATESONLINE ML Preprocessing
|
|
3
|
+
|
|
4
|
+
Pure Python preprocessing utilities.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
from typing import Union, List, Optional, Tuple, Dict
|
|
9
|
+
import random
|
|
10
|
+
import math
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class StandardScaler:
    """Standardize features by centering to zero mean and scaling to unit variance.

    Pure Python implementation with numpy.
    """

    def __init__(self):
        """Create an unfitted scaler; statistics are learned in fit()."""
        self.mean_ = None
        self.scale_ = None
        self.var_ = None
        self.fitted = False

    def fit(self, X: Union[np.ndarray, list]) -> 'StandardScaler':
        """Learn the per-feature mean and standard deviation of X.

        Args:
            X: Training data (n_samples, n_features)

        Returns:
            Self for method chaining
        """
        data = np.asarray(X)
        if data.ndim == 1:
            data = data.reshape(-1, 1)

        self.mean_ = data.mean(axis=0)
        self.var_ = data.var(axis=0)
        self.scale_ = np.sqrt(self.var_)

        # A zero-variance feature would divide by zero on transform;
        # scale it by 1.0 instead (centering alone).
        self.scale_[self.scale_ == 0] = 1.0

        self.fitted = True
        return self

    def transform(self, X: Union[np.ndarray, list]) -> np.ndarray:
        """Center and scale X using the fitted statistics.

        Args:
            X: Data to transform (n_samples, n_features)

        Returns:
            Standardized data

        Raises:
            RuntimeError: If called before fit().
        """
        if not self.fitted:
            raise RuntimeError("Scaler must be fitted before transforming")

        data = np.asarray(X)
        if data.ndim == 1:
            data = data.reshape(-1, 1)

        return (data - self.mean_) / self.scale_

    def fit_transform(self, X: Union[np.ndarray, list]) -> np.ndarray:
        """Fit on X, then return the standardized X.

        Args:
            X: Data to fit and transform

        Returns:
            Standardized data
        """
        return self.fit(X).transform(X)

    def inverse_transform(self, X: Union[np.ndarray, list]) -> np.ndarray:
        """Map standardized data back to the original scale.

        Args:
            X: Transformed data

        Returns:
            Data on the original scale

        Raises:
            RuntimeError: If called before fit().
        """
        if not self.fitted:
            raise RuntimeError("Scaler must be fitted before inverse transforming")

        data = np.asarray(X)
        if data.ndim == 1:
            data = data.reshape(-1, 1)

        return data * self.scale_ + self.mean_
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
class MinMaxScaler:
    """Rescale features linearly into a target range (default [0, 1]).

    Pure Python implementation with numpy.
    """

    def __init__(self, feature_range: Tuple[float, float] = (0, 1)):
        """
        Initialize MinMaxScaler

        Args:
            feature_range: Desired (min, max) range of the transformed data
        """
        self.feature_range = feature_range
        self.min_ = None
        self.scale_ = None
        self.data_min_ = None
        self.data_max_ = None
        self.data_range_ = None
        self.fitted = False

    def fit(self, X: Union[np.ndarray, list]) -> 'MinMaxScaler':
        """Learn per-feature minimum and maximum of X.

        Args:
            X: Training data (n_samples, n_features)

        Returns:
            Self for method chaining
        """
        data = np.asarray(X)
        if data.ndim == 1:
            data = data.reshape(-1, 1)

        self.data_min_ = data.min(axis=0)
        self.data_max_ = data.max(axis=0)
        self.data_range_ = self.data_max_ - self.data_min_

        # A constant feature has zero range; use 1.0 to avoid division by zero.
        self.data_range_[self.data_range_ == 0] = 1.0

        lo, hi = self.feature_range
        self.scale_ = (hi - lo) / self.data_range_
        self.min_ = lo - self.data_min_ * self.scale_

        self.fitted = True
        return self

    def transform(self, X: Union[np.ndarray, list]) -> np.ndarray:
        """Rescale X into the configured feature range.

        Args:
            X: Data to transform (n_samples, n_features)

        Returns:
            Rescaled data

        Raises:
            RuntimeError: If called before fit().
        """
        if not self.fitted:
            raise RuntimeError("Scaler must be fitted before transforming")

        data = np.asarray(X)
        if data.ndim == 1:
            data = data.reshape(-1, 1)

        return data * self.scale_ + self.min_

    def fit_transform(self, X: Union[np.ndarray, list]) -> np.ndarray:
        """Fit on X, then return the rescaled X.

        Args:
            X: Data to fit and transform

        Returns:
            Rescaled data
        """
        return self.fit(X).transform(X)

    def inverse_transform(self, X: Union[np.ndarray, list]) -> np.ndarray:
        """Map rescaled data back to the original scale.

        Args:
            X: Transformed data

        Returns:
            Data on the original scale

        Raises:
            RuntimeError: If called before fit().
        """
        if not self.fitted:
            raise RuntimeError("Scaler must be fitted before inverse transforming")

        data = np.asarray(X)
        if data.ndim == 1:
            data = data.reshape(-1, 1)

        return (data - self.min_) / self.scale_
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
class LabelEncoder:
    """Encode categorical labels as consecutive integers (0 .. n_classes-1).

    Pure Python implementation.
    """

    def __init__(self):
        """Create an unfitted encoder; the class list is learned in fit()."""
        self.classes_ = None
        self.class_to_index_ = None
        self.fitted = False

    def fit(self, y: Union[np.ndarray, list]) -> 'LabelEncoder':
        """Learn the sorted set of unique labels in y.

        Args:
            y: Target values

        Returns:
            Self for method chaining
        """
        values = np.asarray(y)

        self.classes_ = np.unique(values)
        self.class_to_index_ = {label: pos for pos, label in enumerate(self.classes_)}

        self.fitted = True
        return self

    def transform(self, y: Union[np.ndarray, list]) -> np.ndarray:
        """Map labels to their integer codes.

        Args:
            y: Target values

        Returns:
            Integer-encoded labels

        Raises:
            RuntimeError: If called before fit().
            ValueError: If y contains a label not seen during fit().
        """
        if not self.fitted:
            raise RuntimeError("LabelEncoder must be fitted before transforming")

        values = np.asarray(y)
        lookup = self.class_to_index_

        out = np.empty(len(values), dtype=int)
        for pos, label in enumerate(values):
            if label not in lookup:
                raise ValueError(f"Unseen label: {label}")
            out[pos] = lookup[label]

        return out

    def fit_transform(self, y: Union[np.ndarray, list]) -> np.ndarray:
        """Fit on y, then return the encoded labels.

        Args:
            y: Target values

        Returns:
            Integer-encoded labels
        """
        return self.fit(y).transform(y)

    def inverse_transform(self, y: Union[np.ndarray, list]) -> np.ndarray:
        """Map integer codes back to the original labels.

        Args:
            y: Encoded labels

        Returns:
            Original labels (object dtype array)

        Raises:
            RuntimeError: If called before fit().
            ValueError: If a code is outside [0, n_classes).
        """
        if not self.fitted:
            raise RuntimeError("LabelEncoder must be fitted before inverse transforming")

        codes = np.asarray(y)
        n_classes = len(self.classes_)

        out = np.empty(len(codes), dtype=object)
        for pos, encoded_label in enumerate(codes):
            if not (0 <= encoded_label < n_classes):
                raise ValueError(f"Invalid encoded label: {encoded_label}")
            out[pos] = self.classes_[encoded_label]

        return out
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
class OneHotEncoder:
    """Encode categorical features as a one-hot (0/1) numeric array.

    Pure Python implementation. Categories not seen during fit() are left
    as all-zero rows for that feature.
    """

    def __init__(self, drop_first: bool = False):
        """
        Initialize OneHotEncoder

        Args:
            drop_first: Drop each feature's first category to avoid
                multicollinearity (dummy encoding)
        """
        self.drop_first = drop_first
        self.categories_ = None
        self.n_features_out_ = None
        self.fitted = False

    def fit(self, X: Union[np.ndarray, list]) -> 'OneHotEncoder':
        """Learn the sorted category list of each feature.

        Args:
            X: Categorical data (n_samples, n_features)

        Returns:
            Self for method chaining
        """
        data = np.asarray(X)
        if data.ndim == 1:
            data = data.reshape(-1, 1)

        self.categories_ = [
            np.unique(data[:, col]) for col in range(data.shape[1])
        ]

        skip = 1 if self.drop_first else 0
        self.n_features_out_ = sum(len(cats) - skip for cats in self.categories_)

        self.fitted = True
        return self

    def transform(self, X: Union[np.ndarray, list]) -> np.ndarray:
        """One-hot encode X using the fitted category lists.

        Args:
            X: Categorical data (n_samples, n_features)

        Returns:
            One-hot encoded array of shape (n_samples, n_features_out_)

        Raises:
            RuntimeError: If called before fit().
        """
        if not self.fitted:
            raise RuntimeError("OneHotEncoder must be fitted before transforming")

        data = np.asarray(X)
        if data.ndim == 1:
            data = data.reshape(-1, 1)

        out = np.zeros((data.shape[0], self.n_features_out_))

        column = 0
        skip = 1 if self.drop_first else 0
        for col, cats in enumerate(self.categories_):
            # One output column per (kept) category of this feature.
            for cat in cats[skip:]:
                out[data[:, col] == cat, column] = 1
                column += 1

        return out

    def fit_transform(self, X: Union[np.ndarray, list]) -> np.ndarray:
        """Fit on X, then return the one-hot encoding of X.

        Args:
            X: Categorical data

        Returns:
            One-hot encoded data
        """
        return self.fit(X).transform(X)
|
|
388
|
+
|
|
389
|
+
|
|
390
|
+
class PolynomialFeatures:
    """
    Generate polynomial and interaction features.

    Output column order: optional bias column, then for each degree
    d = 1..degree (d = 2..degree when interaction_only) every monomial of
    that degree, with index combinations in lexicographic order.

    Fix over the previous version: transform() now emits the complete set of
    monomials (including mixed terms such as x0 * x1**2), so its output width
    always equals ``n_output_features_`` as computed by fit(). Previously only
    pure powers plus degree-2 cross terms were produced, which disagreed with
    the fitted count whenever degree > 2.
    """

    def __init__(self, degree: int = 2, include_bias: bool = True, interaction_only: bool = False):
        """
        Initialize PolynomialFeatures

        Args:
            degree: Maximum degree of polynomial features
            include_bias: Whether to include bias column (all ones)
            interaction_only: Whether to produce interaction features only
                (products of distinct features of degree >= 2; no powers and
                no plain degree-1 columns, matching the original counting)
        """
        self.degree = degree
        self.include_bias = include_bias
        self.interaction_only = interaction_only
        self.n_input_features_ = None
        self.n_output_features_ = None
        self.fitted = False

    def _index_combinations(self, n_features: int):
        """Yield one feature-index tuple per non-bias output column, in output order."""
        import itertools
        if self.interaction_only:
            # Distinct features only, degree >= 2 (degree-1 columns excluded by design).
            combiner = itertools.combinations
            min_degree = 2
        else:
            # Repetition allowed => powers and mixed terms; degree-1 columns included.
            combiner = itertools.combinations_with_replacement
            min_degree = 1
        for d in range(min_degree, self.degree + 1):
            yield from combiner(range(n_features), d)

    def fit(self, X: Union[np.ndarray, list]) -> 'PolynomialFeatures':
        """
        Compute number of output features

        Args:
            X: Input data (n_samples, n_features)

        Returns:
            Self for method chaining
        """
        X = np.asarray(X)
        if X.ndim == 1:
            X = X.reshape(-1, 1)

        self.n_input_features_ = X.shape[1]

        # Count by enumerating exactly the combinations transform() will emit,
        # so fit and transform can never disagree on the output width.
        n_terms = sum(1 for _ in self._index_combinations(self.n_input_features_))
        self.n_output_features_ = n_terms + (1 if self.include_bias else 0)

        self.fitted = True
        return self

    def transform(self, X: Union[np.ndarray, list]) -> np.ndarray:
        """
        Transform data to polynomial features

        Args:
            X: Input data (n_samples, n_features)

        Returns:
            Polynomial features, shape (n_samples, n_output_features_)

        Raises:
            RuntimeError: If called before fit().
        """
        if not self.fitted:
            raise RuntimeError("PolynomialFeatures must be fitted before transforming")

        X = np.asarray(X)
        if X.ndim == 1:
            X = X.reshape(-1, 1)

        n_samples, n_features = X.shape

        columns = []
        if self.include_bias:
            columns.append(np.ones(n_samples))

        for combo in self._index_combinations(n_features):
            # Monomial = product of the selected feature columns
            # (a repeated index contributes a power of that feature).
            col = np.ones(n_samples)
            for idx in combo:
                col = col * X[:, idx]
            columns.append(col)

        if not columns:
            # e.g. interaction_only with one feature and include_bias=False.
            return np.empty((n_samples, 0))

        return np.column_stack(columns)

    def fit_transform(self, X: Union[np.ndarray, list]) -> np.ndarray:
        """
        Fit to data, then transform it

        Args:
            X: Input data

        Returns:
            Polynomial features
        """
        return self.fit(X).transform(X)
|
|
508
|
+
|
|
509
|
+
|
|
510
|
+
# Data splitting functions
|
|
511
|
+
|
|
512
|
+
def train_test_split(
    *arrays,
    test_size: Union[float, int] = 0.25,
    train_size: Optional[Union[float, int]] = None,
    random_state: Optional[int] = None,
    shuffle: bool = True,
    stratify: Optional[Union[np.ndarray, list]] = None
) -> List[np.ndarray]:
    """
    Split arrays into random train and test subsets

    Args:
        *arrays: Sequence of indexables with same length
        test_size: Proportion (float) or absolute number (int) of test samples
        train_size: Proportion (float) or absolute number (int) of train samples;
            defaults to all samples not assigned to the test set
        random_state: Random seed for reproducibility (seeds both ``random``
            and ``np.random`` global state)
        shuffle: Whether to shuffle data before splitting
        stratify: Array of class labels for stratified splitting; each class
            is split separately so class proportions are roughly preserved

    Returns:
        List of train-test splits of inputs, ordered as
        [a_train, a_test, b_train, b_test, ...] for inputs a, b, ...
    """
    if len(arrays) == 0:
        raise ValueError("At least one array required as input")

    # Convert to numpy arrays (needed below for fancy indexing by index list)
    arrays = [np.array(arr) if not isinstance(arr, np.ndarray) else arr for arr in arrays]

    # Check that all arrays have the same length
    n_samples = len(arrays[0])
    for arr in arrays[1:]:
        if len(arr) != n_samples:
            raise ValueError("All arrays must have the same length")

    # NOTE(review): seeding mutates global RNG state as a side effect;
    # callers relying on their own seeding should be aware.
    if random_state is not None:
        random.seed(random_state)
        np.random.seed(random_state)

    # Calculate split sizes: a float test_size is a fraction of n_samples,
    # truncated to an integer count
    if isinstance(test_size, float):
        test_size = int(n_samples * test_size)

    if train_size is not None:
        if isinstance(train_size, float):
            train_size = int(n_samples * train_size)
        if train_size + test_size > n_samples:
            raise ValueError("train_size + test_size exceeds total samples")
    else:
        train_size = n_samples - test_size

    # Create indices
    indices = list(range(n_samples))

    if stratify is not None:
        # Stratified split: split each class's indices separately so the
        # train/test class proportions approximate the overall proportions
        stratify = np.array(stratify) if not isinstance(stratify, np.ndarray) else stratify
        unique_classes = np.unique(stratify)

        train_indices = []
        test_indices = []

        for cls in unique_classes:
            cls_indices = [i for i in indices if stratify[i] == cls]
            if shuffle:
                random.shuffle(cls_indices)

            # Per-class test count is the class size scaled by the overall
            # test fraction. NOTE(review): integer truncation per class can
            # make the total test count slightly smaller than test_size.
            cls_test_size = int(len(cls_indices) * (test_size / n_samples))
            cls_train_size = len(cls_indices) - cls_test_size

            test_indices.extend(cls_indices[:cls_test_size])
            train_indices.extend(cls_indices[cls_test_size:cls_test_size + cls_train_size])

        # Re-shuffle so samples are not grouped by class in the output
        if shuffle:
            random.shuffle(train_indices)
            random.shuffle(test_indices)

    else:
        # Regular split: first test_size indices are the test set,
        # the next train_size indices are the train set
        if shuffle:
            random.shuffle(indices)

        test_indices = indices[:test_size]
        train_indices = indices[test_size:test_size + train_size]

    # Split arrays: emit (train, test) pairs in input order
    result = []
    for arr in arrays:
        train_arr = arr[train_indices]
        test_arr = arr[test_indices]
        result.extend([train_arr, test_arr])

    return result
|
|
604
|
+
|
|
605
|
+
|
|
606
|
+
def cross_validate(
    estimator,
    X: Union[np.ndarray, list],
    y: Union[np.ndarray, list],
    cv: int = 5,
    scoring: str = 'accuracy',
    random_state: Optional[int] = None
) -> Dict[str, np.ndarray]:
    """
    Evaluate metric(s) by k-fold cross-validation.

    Args:
        estimator: ML estimator object exposing ``fit(X, y)`` and ``predict(X)``
        X: Features
        y: Target
        cv: Number of folds
        scoring: Scoring metric ('accuracy', 'precision', 'recall', 'f1', 'mse', 'r2')
        random_state: Random seed for reproducibility (seeds the global
            ``random`` module state)

    Returns:
        Dictionary with key 'test_score' mapping to an array of cv fold scores

    Raises:
        ValueError: If X and y lengths differ or ``scoring`` is unknown.
    """
    import copy

    X = np.asarray(X)
    y = np.asarray(y)

    if X.ndim == 1:
        X = X.reshape(-1, 1)

    n_samples = len(X)
    if n_samples != len(y):
        raise ValueError("X and y must have the same length")

    if random_state is not None:
        random.seed(random_state)

    # Shuffle sample indices, then carve them into cv contiguous folds;
    # the last fold absorbs the remainder when cv does not divide n_samples.
    indices = list(range(n_samples))
    random.shuffle(indices)

    fold_size = n_samples // cv
    folds = []

    for i in range(cv):
        start = i * fold_size
        end = start + fold_size if i < cv - 1 else n_samples
        test_indices = indices[start:end]
        # Set membership is O(1); the previous list scan made this O(n^2).
        test_set = set(test_indices)
        train_indices = [idx for idx in indices if idx not in test_set]
        folds.append((train_indices, test_indices))

    # Evaluate each fold
    scores = []

    for train_indices, test_indices in folds:
        X_train, X_test = X[train_indices], X[test_indices]
        y_train, y_test = y[train_indices], y[test_indices]

        # Clone via deepcopy. The previous clone, type(estimator)(**estimator.__dict__),
        # passed every attribute (including fitted state like `fitted` or learned
        # statistics) as constructor kwargs, raising TypeError for any estimator
        # whose __init__ does not accept them. refit below overwrites prior state.
        fold_estimator = copy.deepcopy(estimator)

        # Fit and predict
        fold_estimator.fit(X_train, y_train)
        y_pred = fold_estimator.predict(X_test)

        # Calculate score (metrics imported lazily to avoid import cycles)
        if scoring == 'accuracy':
            from .metrics import accuracy_score
            score = accuracy_score(y_test, y_pred)
        elif scoring == 'precision':
            from .metrics import precision_score
            score = precision_score(y_test, y_pred, average='weighted')
        elif scoring == 'recall':
            from .metrics import recall_score
            score = recall_score(y_test, y_pred, average='weighted')
        elif scoring == 'f1':
            from .metrics import f1_score
            score = f1_score(y_test, y_pred, average='weighted')
        elif scoring == 'mse':
            from .metrics import mean_squared_error
            score = mean_squared_error(y_test, y_pred)
        elif scoring == 'r2':
            from .metrics import r2_score
            score = r2_score(y_test, y_pred)
        else:
            raise ValueError(f"Unknown scoring: {scoring}")

        scores.append(score)

    return {'test_score': np.array(scores)}
|
|
694
|
+
|
|
695
|
+
|
|
696
|
+
# Feature selection utilities
|
|
697
|
+
|
|
698
|
+
def _f_classif_score(feature: np.ndarray, y: np.ndarray) -> float:
    """Simplified one-way ANOVA F-statistic of one feature against class labels y."""
    classes = np.unique(y)
    # F is undefined with fewer than two classes or zero residual degrees of
    # freedom; previously these cases divided by zero (NaN / warning).
    if len(classes) < 2 or len(y) <= len(classes):
        return 0.0

    overall_mean = np.mean(feature)
    between_class_var = 0.0
    within_class_var = 0.0

    for cls in classes:
        class_data = feature[y == cls]
        class_mean = np.mean(class_data)
        between_class_var += len(class_data) * (class_mean - overall_mean) ** 2
        within_class_var += np.sum((class_data - class_mean) ** 2)

    between_class_var /= (len(classes) - 1)
    within_class_var /= (len(y) - len(classes))

    return float(between_class_var / within_class_var) if within_class_var > 0 else 0.0


def _mutual_info_score(feature: np.ndarray, y: np.ndarray) -> float:
    """Simplified mutual information between a (binned) feature and labels y."""
    # Discretize continuous features (simple binning); more than 10 distinct
    # values is treated as continuous.
    if len(np.unique(feature)) > 10:
        bins = np.linspace(np.min(feature), np.max(feature), 5)
        feature_binned = np.digitize(feature, bins)
    else:
        feature_binned = feature

    mi = 0.0
    for f_val in np.unique(feature_binned):
        for y_val in np.unique(y):
            p_xy = np.mean((feature_binned == f_val) & (y == y_val))
            p_x = np.mean(feature_binned == f_val)
            p_y = np.mean(y == y_val)

            if p_xy > 0 and p_x > 0 and p_y > 0:
                mi += p_xy * np.log(p_xy / (p_x * p_y))

    return float(mi)


def select_k_best_features(
    X: Union[np.ndarray, list],
    y: Union[np.ndarray, list],
    k: int = 10,
    score_func: str = 'f_classif'
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Select the k best features based on simple statistical tests.

    Args:
        X: Features (n_samples, n_features)
        y: Target
        k: Number of features to select (clamped to [0, n_features])
        score_func: Scoring function ('f_classif', 'mutual_info')

    Returns:
        Tuple of (selected_features, feature_indices); indices are ordered
        from lowest- to highest-scoring among the selected k.

    Raises:
        ValueError: If score_func is not recognized.
    """
    X = np.asarray(X)
    y = np.asarray(y)

    if X.ndim == 1:
        X = X.reshape(-1, 1)

    n_features = X.shape[1]
    # Clamp k: previously k <= 0 produced argsort(scores)[-0:], which selects
    # ALL features instead of none.
    k = max(0, min(k, n_features))

    if score_func == 'f_classif':
        scores = [_f_classif_score(X[:, j], y) for j in range(n_features)]
    elif score_func == 'mutual_info':
        scores = [_mutual_info_score(X[:, j], y) for j in range(n_features)]
    else:
        raise ValueError(f"Unknown score_func: {score_func}")

    # Select k best features (indices of the k largest scores)
    if k == 0:
        feature_indices = np.array([], dtype=int)
    else:
        feature_indices = np.argsort(scores)[-k:]

    selected_features = X[:, feature_indices]

    return selected_features, feature_indices
|