dragon-ml-toolbox 13.3.2__py3-none-any.whl → 13.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dragon-ml-toolbox might be problematic. Click here for more details.
- {dragon_ml_toolbox-13.3.2.dist-info → dragon_ml_toolbox-13.4.0.dist-info}/METADATA +1 -1
- {dragon_ml_toolbox-13.3.2.dist-info → dragon_ml_toolbox-13.4.0.dist-info}/RECORD +7 -7
- ml_tools/ML_datasetmaster.py +61 -20
- {dragon_ml_toolbox-13.3.2.dist-info → dragon_ml_toolbox-13.4.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-13.3.2.dist-info → dragon_ml_toolbox-13.4.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-13.3.2.dist-info → dragon_ml_toolbox-13.4.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-13.3.2.dist-info → dragon_ml_toolbox-13.4.0.dist-info}/top_level.txt +0 -0
|
@@ -1,11 +1,11 @@
|
|
|
1
|
-
dragon_ml_toolbox-13.
|
|
2
|
-
dragon_ml_toolbox-13.
|
|
1
|
+
dragon_ml_toolbox-13.4.0.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
|
|
2
|
+
dragon_ml_toolbox-13.4.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=iy2r_R7wjzsCbz_Q_jMsp_jfZ6oP8XW9QhwzRBH0mGY,1904
|
|
3
3
|
ml_tools/ETL_cleaning.py,sha256=2VBRllV8F-ZiPylPp8Az2gwn5ztgazN0BH5OKnRUhV0,20402
|
|
4
4
|
ml_tools/ETL_engineering.py,sha256=KfYqgsxupAx6e_TxwO1LZXeu5mFkIhVXJrNjP3CzIZc,54927
|
|
5
5
|
ml_tools/GUI_tools.py,sha256=Va6ig-dHULPVRwQYYtH3fvY5XPIoqRcJpRW8oXC55Hw,45413
|
|
6
6
|
ml_tools/MICE_imputation.py,sha256=X273Qlgoqqg7KTmoKd75YDyAPB0UIbTzGP3xsCmRh3E,11717
|
|
7
7
|
ml_tools/ML_callbacks.py,sha256=elD2Yr030sv_6gX_m9GVd6HTyrbmt34nFS8lrgS4HtM,15808
|
|
8
|
-
ml_tools/ML_datasetmaster.py,sha256=
|
|
8
|
+
ml_tools/ML_datasetmaster.py,sha256=6caWbq6eu1RE9V51gmceD71PtMctJRjFuLvkkK5ChiY,36271
|
|
9
9
|
ml_tools/ML_evaluation.py,sha256=3u5dOhS77gn3kAshKr2GwSa5xZBF0YM77ZkFevqNPvA,18528
|
|
10
10
|
ml_tools/ML_evaluation_multi.py,sha256=L6Ub_uObXsI7ToVCF6DtmAFekHRcga5wWMOnRYRR-BY,16121
|
|
11
11
|
ml_tools/ML_inference.py,sha256=yq2gdN6s_OUYC5ZLQrIJC5BA5H33q8UKODXwb-_0M2c,23549
|
|
@@ -35,7 +35,7 @@ ml_tools/optimization_tools.py,sha256=TYFQ2nSnp7xxs-VyoZISWgnGJghFbsWasHjruegyJR
|
|
|
35
35
|
ml_tools/path_manager.py,sha256=CyDU16pOKmC82jPubqJPT6EBt-u-3rGVbxyPIZCvDDY,18432
|
|
36
36
|
ml_tools/serde.py,sha256=c8uDYjYry_VrLvoG4ixqDj5pij88lVn6Tu4NHcPkwDU,6943
|
|
37
37
|
ml_tools/utilities.py,sha256=OcAyV1tEcYAfOWlGjRgopsjDLxU3DcI5EynzvWV4q3A,15754
|
|
38
|
-
dragon_ml_toolbox-13.
|
|
39
|
-
dragon_ml_toolbox-13.
|
|
40
|
-
dragon_ml_toolbox-13.
|
|
41
|
-
dragon_ml_toolbox-13.
|
|
38
|
+
dragon_ml_toolbox-13.4.0.dist-info/METADATA,sha256=Ixk5If3BJhjyJy9_mirNJ2QckMELXFQiJa9_8RWfreI,6166
|
|
39
|
+
dragon_ml_toolbox-13.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
40
|
+
dragon_ml_toolbox-13.4.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
|
|
41
|
+
dragon_ml_toolbox-13.4.0.dist-info/RECORD,,
|
ml_tools/ML_datasetmaster.py
CHANGED
|
@@ -126,8 +126,8 @@ class _BaseDatasetMaker(ABC):
|
|
|
126
126
|
else:
|
|
127
127
|
_LOGGER.info("No continuous features listed in schema. Scaler will not be fitted.")
|
|
128
128
|
|
|
129
|
-
X_train_values = X_train.
|
|
130
|
-
X_test_values = X_test.
|
|
129
|
+
X_train_values = X_train.to_numpy()
|
|
130
|
+
X_test_values = X_test.to_numpy()
|
|
131
131
|
|
|
132
132
|
# continuous_feature_indices is derived
|
|
133
133
|
if self.scaler is None and continuous_feature_indices:
|
|
@@ -253,26 +253,42 @@ class DatasetMaker(_BaseDatasetMaker):
|
|
|
253
253
|
pandas_df: pandas.DataFrame,
|
|
254
254
|
schema: FeatureSchema,
|
|
255
255
|
kind: Literal["regression", "classification"],
|
|
256
|
+
scaler: Union[Literal["fit"], Literal["none"], PytorchScaler],
|
|
256
257
|
test_size: float = 0.2,
|
|
257
|
-
random_state: int = 42
|
|
258
|
-
scaler: Optional[PytorchScaler] = None):
|
|
258
|
+
random_state: int = 42):
|
|
259
259
|
"""
|
|
260
260
|
Args:
|
|
261
261
|
pandas_df (pandas.DataFrame):
|
|
262
262
|
The pre-processed input DataFrame containing all columns (features and single target).
|
|
263
263
|
schema (FeatureSchema):
|
|
264
264
|
The definitive schema object from data_exploration.
|
|
265
|
-
kind (
|
|
265
|
+
kind ("regression" | "classification"):
|
|
266
266
|
The type of ML task. This determines the data type of the labels.
|
|
267
|
+
scaler ("fit" | "none" | PytorchScaler):
|
|
268
|
+
Strategy for data scaling:
|
|
269
|
+
- "fit": Fit a new PytorchScaler on continuous features.
|
|
270
|
+
- "none": Do not scale data (e.g., for TabularTransformer).
|
|
271
|
+
- PytorchScaler instance: Use a pre-fitted scaler to transform data.
|
|
267
272
|
test_size (float):
|
|
268
273
|
The proportion of the dataset to allocate to the test split.
|
|
269
274
|
random_state (int):
|
|
270
275
|
The seed for the random number generator for reproducibility.
|
|
271
|
-
|
|
272
|
-
A pre-fitted PytorchScaler instance, if None a new scaler will be created.
|
|
276
|
+
|
|
273
277
|
"""
|
|
274
278
|
super().__init__()
|
|
275
|
-
|
|
279
|
+
|
|
280
|
+
_apply_scaling: bool = False
|
|
281
|
+
if scaler == "fit":
|
|
282
|
+
self.scaler = None # To be created
|
|
283
|
+
_apply_scaling = True
|
|
284
|
+
elif scaler == "none":
|
|
285
|
+
self.scaler = None
|
|
286
|
+
elif isinstance(scaler, PytorchScaler):
|
|
287
|
+
self.scaler = scaler # Use the provided one
|
|
288
|
+
_apply_scaling = True
|
|
289
|
+
else:
|
|
290
|
+
_LOGGER.error(f"Invalid 'scaler' argument. Must be 'fit', 'none', or a PytorchScaler instance.")
|
|
291
|
+
raise ValueError()
|
|
276
292
|
|
|
277
293
|
# --- 1. Identify features (from schema) ---
|
|
278
294
|
self._feature_names = list(schema.feature_names)
|
|
@@ -310,9 +326,14 @@ class DatasetMaker(_BaseDatasetMaker):
|
|
|
310
326
|
label_dtype = torch.float32 if kind == "regression" else torch.int64
|
|
311
327
|
|
|
312
328
|
# --- 4. Scale (using the schema) ---
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
329
|
+
if _apply_scaling:
|
|
330
|
+
X_train_final, X_test_final = self._prepare_scaler(
|
|
331
|
+
X_train, y_train, X_test, label_dtype, schema
|
|
332
|
+
)
|
|
333
|
+
else:
|
|
334
|
+
_LOGGER.info("Features have not been scaled as specified.")
|
|
335
|
+
X_train_final = X_train.to_numpy()
|
|
336
|
+
X_test_final = X_test.to_numpy()
|
|
316
337
|
|
|
317
338
|
# --- 5. Create Datasets ---
|
|
318
339
|
self._train_ds = _PytorchDataset(X_train_final, y_train, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
|
|
@@ -336,9 +357,9 @@ class DatasetMakerMulti(_BaseDatasetMaker):
|
|
|
336
357
|
pandas_df: pandas.DataFrame,
|
|
337
358
|
target_columns: List[str],
|
|
338
359
|
schema: FeatureSchema,
|
|
360
|
+
scaler: Union[Literal["fit"], Literal["none"], PytorchScaler],
|
|
339
361
|
test_size: float = 0.2,
|
|
340
|
-
random_state: int = 42
|
|
341
|
-
scaler: Optional[PytorchScaler] = None):
|
|
362
|
+
random_state: int = 42):
|
|
342
363
|
"""
|
|
343
364
|
Args:
|
|
344
365
|
pandas_df (pandas.DataFrame):
|
|
@@ -348,20 +369,35 @@ class DatasetMakerMulti(_BaseDatasetMaker):
|
|
|
348
369
|
List of target column names.
|
|
349
370
|
schema (FeatureSchema):
|
|
350
371
|
The definitive schema object from data_exploration.
|
|
372
|
+
scaler ("fit" | "none" | PytorchScaler):
|
|
373
|
+
Strategy for data scaling:
|
|
374
|
+
- "fit": Fit a new PytorchScaler on continuous features.
|
|
375
|
+
- "none": Do not scale data (e.g., for TabularTransformer).
|
|
376
|
+
- PytorchScaler instance: Use a pre-fitted scaler to transform data.
|
|
351
377
|
test_size (float):
|
|
352
378
|
The proportion of the dataset to allocate to the test split.
|
|
353
379
|
random_state (int):
|
|
354
380
|
The seed for the random number generator for reproducibility.
|
|
355
|
-
scaler (PytorchScaler | None):
|
|
356
|
-
A pre-fitted PytorchScaler instance.
|
|
357
381
|
|
|
358
382
|
## Note:
|
|
359
383
|
For multi-binary classification, the most common PyTorch loss function is nn.BCEWithLogitsLoss.
|
|
360
384
|
This loss function requires the labels to be torch.float32 which is the same type required for regression (multi-regression) tasks.
|
|
361
385
|
"""
|
|
362
386
|
super().__init__()
|
|
363
|
-
|
|
364
|
-
|
|
387
|
+
|
|
388
|
+
_apply_scaling: bool = False
|
|
389
|
+
if scaler == "fit":
|
|
390
|
+
self.scaler = None
|
|
391
|
+
_apply_scaling = True
|
|
392
|
+
elif scaler == "none":
|
|
393
|
+
self.scaler = None
|
|
394
|
+
elif isinstance(scaler, PytorchScaler):
|
|
395
|
+
self.scaler = scaler # Use the provided one
|
|
396
|
+
_apply_scaling = True
|
|
397
|
+
else:
|
|
398
|
+
_LOGGER.error(f"Invalid 'scaler' argument. Must be 'fit', 'none', or a PytorchScaler instance.")
|
|
399
|
+
raise ValueError()
|
|
400
|
+
|
|
365
401
|
# --- 1. Get features and targets from schema/args ---
|
|
366
402
|
self._feature_names = list(schema.feature_names)
|
|
367
403
|
self._target_names = target_columns
|
|
@@ -403,9 +439,14 @@ class DatasetMakerMulti(_BaseDatasetMaker):
|
|
|
403
439
|
label_dtype = torch.float32
|
|
404
440
|
|
|
405
441
|
# --- 4. Scale (using the schema) ---
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
442
|
+
if _apply_scaling:
|
|
443
|
+
X_train_final, X_test_final = self._prepare_scaler(
|
|
444
|
+
X_train, y_train, X_test, label_dtype, schema
|
|
445
|
+
)
|
|
446
|
+
else:
|
|
447
|
+
_LOGGER.info("Features have not been scaled as specified.")
|
|
448
|
+
X_train_final = X_train.to_numpy()
|
|
449
|
+
X_test_final = X_test.to_numpy()
|
|
409
450
|
|
|
410
451
|
# --- 5. Create Datasets ---
|
|
411
452
|
# _PytorchDataset now correctly handles y_train (a DataFrame)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|