dragon-ml-toolbox 13.2.0__tar.gz → 14.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dragon_ml_toolbox-13.2.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-14.1.0}/PKG-INFO +2 -1
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0/dragon_ml_toolbox.egg-info}/PKG-INFO +2 -1
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/dragon_ml_toolbox.egg-info/SOURCES.txt +7 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/dragon_ml_toolbox.egg-info/requires.txt +1 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/MICE_imputation.py +207 -5
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/ML_datasetmaster.py +63 -205
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/ML_evaluation.py +23 -15
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/ML_evaluation_multi.py +5 -6
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/ML_inference.py +0 -1
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/ML_models.py +23 -7
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/ML_trainer.py +463 -20
- dragon_ml_toolbox-14.1.0/ml_tools/ML_utilities.py +528 -0
- dragon_ml_toolbox-14.1.0/ml_tools/ML_vision_datasetmaster.py +1315 -0
- dragon_ml_toolbox-14.1.0/ml_tools/ML_vision_evaluation.py +260 -0
- dragon_ml_toolbox-14.1.0/ml_tools/ML_vision_inference.py +428 -0
- dragon_ml_toolbox-14.1.0/ml_tools/ML_vision_models.py +627 -0
- dragon_ml_toolbox-14.1.0/ml_tools/ML_vision_transformers.py +58 -0
- dragon_ml_toolbox-14.1.0/ml_tools/_ML_pytorch_tabular.py +543 -0
- dragon_ml_toolbox-14.1.0/ml_tools/_ML_vision_recipe.py +88 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/_schema.py +26 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/custom_logger.py +37 -14
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/data_exploration.py +502 -93
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/keys.py +38 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/math_utilities.py +1 -1
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/serde.py +23 -3
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/utilities.py +192 -3
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/pyproject.toml +12 -2
- dragon_ml_toolbox-13.2.0/ml_tools/ML_utilities.py +0 -230
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/LICENSE +0 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/README.md +0 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/ETL_cleaning.py +0 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/ETL_engineering.py +0 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/GUI_tools.py +0 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/ML_callbacks.py +0 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/ML_optimization.py +0 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/ML_scaler.py +0 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/PSO_optimization.py +0 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/RNN_forecast.py +0 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/SQL.py +0 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/VIF_factor.py +0 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/__init__.py +0 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/_logger.py +0 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/_script_info.py +0 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/constants.py +0 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/ensemble_evaluation.py +0 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/ensemble_inference.py +0 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/ensemble_learning.py +0 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/handle_excel.py +0 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/optimization_tools.py +0 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/path_manager.py +0 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/setup.cfg +0 -0
{dragon_ml_toolbox-13.2.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-14.1.0}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 13.2.0
+Version: 14.1.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: "Karl L. Loza Vidaurre" <luigiloza@gmail.com>
 License-Expression: MIT
@@ -34,6 +34,7 @@ Requires-Dist: Pillow; extra == "ml"
 Requires-Dist: evotorch; extra == "ml"
 Requires-Dist: pyarrow; extra == "ml"
 Requires-Dist: colorlog; extra == "ml"
+Requires-Dist: torchmetrics; extra == "ml"
 Provides-Extra: mice
 Requires-Dist: numpy<2.0; extra == "mice"
 Requires-Dist: pandas; extra == "mice"
{dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0/dragon_ml_toolbox.egg-info}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 13.2.0
+Version: 14.1.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: "Karl L. Loza Vidaurre" <luigiloza@gmail.com>
 License-Expression: MIT
@@ -34,6 +34,7 @@ Requires-Dist: Pillow; extra == "ml"
 Requires-Dist: evotorch; extra == "ml"
 Requires-Dist: pyarrow; extra == "ml"
 Requires-Dist: colorlog; extra == "ml"
+Requires-Dist: torchmetrics; extra == "ml"
 Provides-Extra: mice
 Requires-Dist: numpy<2.0; extra == "mice"
 Requires-Dist: pandas; extra == "mice"
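In practical terms, both PKG-INFO files record the version bump to 14.1.0 and add `torchmetrics` to the `ml` extra, so an environment installed with something like `pip install "dragon-ml-toolbox[ml]"` (assuming a standard PyPI install) will now pull in torchmetrics alongside the existing ML dependencies.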
{dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/dragon_ml_toolbox.egg-info/SOURCES.txt
RENAMED
@@ -21,10 +21,17 @@ ml_tools/ML_optimization.py
 ml_tools/ML_scaler.py
 ml_tools/ML_trainer.py
 ml_tools/ML_utilities.py
+ml_tools/ML_vision_datasetmaster.py
+ml_tools/ML_vision_evaluation.py
+ml_tools/ML_vision_inference.py
+ml_tools/ML_vision_models.py
+ml_tools/ML_vision_transformers.py
 ml_tools/PSO_optimization.py
 ml_tools/RNN_forecast.py
 ml_tools/SQL.py
 ml_tools/VIF_factor.py
+ml_tools/_ML_pytorch_tabular.py
+ml_tools/_ML_vision_recipe.py
 ml_tools/__init__.py
 ml_tools/_logger.py
 ml_tools/_schema.py
{dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/MICE_imputation.py
RENAMED
@@ -7,19 +7,20 @@ from plotnine import ggplot, labs, theme, element_blank # type: ignore
 from typing import Optional, Union

 from .utilities import load_dataframe, merge_dataframes, save_dataframe_filename
-from .math_utilities import threshold_binary_values
+from .math_utilities import threshold_binary_values, discretize_categorical_values
 from .path_manager import sanitize_filename, make_fullpath, list_csv_paths
 from ._logger import _LOGGER
 from ._script_info import _script_info
+from ._schema import FeatureSchema


 __all__ = [
+    "MiceImputer",
     "apply_mice",
     "save_imputed_datasets",
-    "get_na_column_names",
     "get_convergence_diagnostic",
     "get_imputed_distributions",
-    "run_mice_pipeline"
+    "run_mice_pipeline",
 ]


@@ -79,7 +80,7 @@ def save_imputed_datasets(save_dir: Union[str, Path], imputed_datasets: list, df


 #Get names of features that had missing values before imputation
-def get_na_column_names(df: pd.DataFrame):
+def _get_na_column_names(df: pd.DataFrame):
     return [col for col in df.columns if df[col].isna().any()]


@@ -264,7 +265,7 @@ def run_mice_pipeline(df_path_or_dir: Union[str,Path], target_columns: list[str]

     save_imputed_datasets(save_dir=save_datasets_path, imputed_datasets=imputed_datasets, df_targets=df_targets, imputed_dataset_names=imputed_dataset_names)

-    imputed_column_names = get_na_column_names(df=df)
+    imputed_column_names = _get_na_column_names(df=df)

     get_convergence_diagnostic(kernel=kernel, imputed_dataset_names=imputed_dataset_names, column_names=imputed_column_names, root_dir=save_metrics_path)

@@ -278,5 +279,206 @@ def _skip_targets(df: pd.DataFrame, target_cols: list[str]):
     return df_feats, df_targets


+# modern implementation
+class MiceImputer:
+    """
+    A modern MICE imputation pipeline that uses a FeatureSchema
+    to correctly discretize categorical features after imputation.
+    """
+    def __init__(self,
+                 schema: FeatureSchema,
+                 iterations: int=20,
+                 resulting_datasets: int = 1,
+                 random_state: int = 101):
+
+        self.schema = schema
+        self.random_state = random_state
+        self.iterations = iterations
+        self.resulting_datasets = resulting_datasets
+
+        # --- Store schema info ---
+
+        # 1. Categorical info
+        if not self.schema.categorical_index_map:
+            _LOGGER.warning("FeatureSchema has no 'categorical_index_map'. No discretization will be applied.")
+            self.cat_info = {}
+        else:
+            self.cat_info = self.schema.categorical_index_map
+
+        # 2. Ordered feature names (critical for index mapping)
+        self.ordered_features = list(self.schema.feature_names)
+
+        # 3. Names of categorical features
+        self.categorical_features = list(self.schema.categorical_feature_names)
+
+        _LOGGER.info(f"MiceImputer initialized. Found {len(self.cat_info)} categorical features to discretize.")
+
+    def _post_process(self, imputed_df: pd.DataFrame) -> pd.DataFrame:
+        """
+        Applies schema-based discretization to a completed dataframe.
+
+        This method works around the behavior of `discretize_categorical_values`
+        (which returns a full int32 array) by:
+        1. Calling it on the full, ordered feature array.
+        2. Extracting *only* the valid discretized categorical columns.
+        3. Updating the original float dataframe with these integer values.
+        """
+        # If no categorical features are defined, return the df as-is.
+        if not self.cat_info:
+            return imputed_df
+
+        try:
+            # 1. Ensure DataFrame columns match the schema order
+            # This is critical for the index-based categorical_info
+            df_ordered: pd.DataFrame = imputed_df[self.ordered_features] # type: ignore
+
+            # 2. Convert to NumPy array
+            array_ordered = df_ordered.to_numpy()
+
+            # 3. Apply discretization utility (which returns a full int32 array)
+            # This array has *correct* categorical values but *truncated* continuous values.
+            discretized_array_int32 = discretize_categorical_values(
+                array_ordered,
+                self.cat_info,
+                start_at_zero=True # Assuming 0-based indexing
+            )
+
+            # 4. Create a new DF from the int32 array, keeping the categorical columns.
+            df_discretized_cats = pd.DataFrame(
+                discretized_array_int32,
+                columns=self.ordered_features,
+                index=df_ordered.index # <-- Critical: align index
+            )[self.categorical_features] # <-- Select only cat features
+
+            # 5. "Rejoin": Start with a fresh copy of the *original* imputed DF (which has correct continuous floats).
+            final_df = df_ordered.copy()
+
+            # 6. Use .update() to "paste" the integer categorical values
+            # over the old float categorical values. Continuous floats are unaffected.
+            final_df.update(df_discretized_cats)
+
+            return final_df
+
+        except Exception as e:
+            _LOGGER.error(f"Failed during post-processing discretization:\n\tInput DF shape: {imputed_df.shape}\n\tSchema features: {len(self.ordered_features)}\n\tCategorical info keys: {list(self.cat_info.keys())}\n{e}")
+            raise
+
+    def _run_mice(self,
+                  df: pd.DataFrame,
+                  df_name: str) -> tuple[mf.ImputationKernel, list[pd.DataFrame], list[str]]:
+        """
+        Runs the MICE kernel and applies schema-based post-processing.
+
+        Parameters:
+            df (pd.DataFrame): The input dataframe *with NaNs*. Should only contain feature columns.
+            df_name (str): The base name for the dataset.
+
+        Returns:
+            tuple[mf.ImputationKernel, list[pd.DataFrame], list[str]]:
+            - The trained MICE kernel
+            - A list of imputed and processed DataFrames
+            - A list of names for the new DataFrames
+        """
+        # Ensure input df only contains features from the schema and is in the correct order.
+        try:
+            df_feats = df[self.ordered_features]
+        except KeyError as e:
+            _LOGGER.error(f"Input DataFrame is missing required schema columns: {e}")
+            raise
+
+        # 1. Initialize kernel
+        kernel = mf.ImputationKernel(
+            data=df_feats,
+            num_datasets=self.resulting_datasets,
+            random_state=self.random_state
+        )
+
+        _LOGGER.info("➡️ Schema-based MICE imputation running...")
+
+        # 2. Perform MICE
+        kernel.mice(self.iterations)
+
+        # 3. Retrieve, process, and collect datasets
+        imputed_datasets = []
+        for i in range(self.resulting_datasets):
+            # complete_data returns a pd.DataFrame
+            completed_df = kernel.complete_data(dataset=i)
+
+            # Apply our new discretization and ordering
+            processed_df = self._post_process(completed_df)
+            imputed_datasets.append(processed_df)
+
+        if not imputed_datasets:
+            _LOGGER.error("No imputed datasets were generated.")
+            raise ValueError()
+
+        # 4. Generate names
+        if self.resulting_datasets == 1:
+            imputed_dataset_names = [f"{df_name}_MICE"]
+        else:
+            imputed_dataset_names = [f"{df_name}_MICE_{i+1}" for i in range(self.resulting_datasets)]
+
+        # 5. Validate indexes
+        for imputed_df, subname in zip(imputed_datasets, imputed_dataset_names):
+            assert imputed_df.shape[0] == df.shape[0], f"❌ Row count mismatch in dataset {subname}"
+            assert all(imputed_df.index == df.index), f"❌ Index mismatch in dataset {subname}"
+
+        _LOGGER.info("Schema-based MICE imputation complete.")
+
+        return kernel, imputed_datasets, imputed_dataset_names
+
+    def run_pipeline(self,
+                     df_path_or_dir: Union[str,Path],
+                     save_datasets_dir: Union[str,Path],
+                     save_metrics_dir: Union[str,Path],
+                     ):
+        """
+        Runs the complete MICE imputation pipeline.
+
+        This method automates the entire workflow:
+        1. Loads data from a CSV file path or a directory with CSV files.
+        2. Separates features and targets based on the `FeatureSchema`.
+        3. Runs the MICE algorithm on the feature set.
+        4. Applies schema-based post-processing to discretize categorical features.
+        5. Saves the final, processed, and imputed dataset(s) (re-joined with targets) to `save_datasets_dir`.
+        6. Generates and saves convergence and distribution plots for all imputed columns to `save_metrics_dir`.
+
+        Parameters
+        ----------
+        df_path_or_dir :[str,Path]
+            Path to a single CSV file or a directory containing multiple CSV files to impute.
+        save_datasets_dir : [str,Path]
+            Directory where the final imputed and processed dataset(s) will be saved as CSVs.
+        save_metrics_dir : [str,Path]
+            Directory where convergence and distribution plots will be saved.
+        """
+        # Check paths
+        save_datasets_path = make_fullpath(save_datasets_dir, make=True)
+        save_metrics_path = make_fullpath(save_metrics_dir, make=True)
+
+        input_path = make_fullpath(df_path_or_dir)
+        if input_path.is_file():
+            all_file_paths = [input_path]
+        else:
+            all_file_paths = list(list_csv_paths(input_path).values())
+
+        for df_path in all_file_paths:
+
+            df, df_name = load_dataframe(df_path=df_path, kind="pandas")
+
+            df_features: pd.DataFrame = df[self.schema.feature_names] # type: ignore
+            df_targets = df.drop(columns=self.schema.feature_names)
+
+            imputed_column_names = _get_na_column_names(df=df_features)
+
+            kernel, imputed_datasets, imputed_dataset_names = self._run_mice(df=df_features, df_name=df_name)
+
+            save_imputed_datasets(save_dir=save_datasets_path, imputed_datasets=imputed_datasets, df_targets=df_targets, imputed_dataset_names=imputed_dataset_names)
+
+            get_convergence_diagnostic(kernel=kernel, imputed_dataset_names=imputed_dataset_names, column_names=imputed_column_names, root_dir=save_metrics_path)
+
+            get_imputed_distributions(kernel=kernel, df_name=df_name, root_dir=save_metrics_path, column_names=imputed_column_names)
+
+
 def info():
     _script_info(__all__)
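A minimal usage sketch of the new class, based only on the constructor and run_pipeline signatures visible in the hunk above; the import paths follow the ml_tools package layout shown in SOURCES.txt, and the directory arguments are placeholders. Building the FeatureSchema itself is out of scope here and is assumed to happen elsewhere (e.g. via the data_exploration helpers).

from pathlib import Path

from ml_tools.MICE_imputation import MiceImputer
from ml_tools._schema import FeatureSchema


def impute_folder(schema: FeatureSchema, data_dir: Path, out_dir: Path) -> None:
    """Impute every CSV under data_dir with the schema-aware MICE pipeline."""
    imputer = MiceImputer(
        schema=schema,           # feature order + categorical index map
        iterations=20,           # MICE iterations (constructor default)
        resulting_datasets=1,    # number of imputed datasets per input file
        random_state=101,
    )
    # Writes imputed CSVs and convergence/distribution plots, as described
    # in run_pipeline's docstring.
    imputer.run_pipeline(
        df_path_or_dir=data_dir,
        save_datasets_dir=out_dir / "datasets",
        save_metrics_dir=out_dir / "metrics",
    )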
{dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/ML_datasetmaster.py
RENAMED
@@ -1,13 +1,10 @@
 import torch
-from torch.utils.data import Dataset, Subset
+from torch.utils.data import Dataset
 import pandas
 import numpy
 from sklearn.model_selection import train_test_split
 from typing import Literal, Union, Tuple, List, Optional
 from abc import ABC, abstractmethod
-from PIL import Image, ImageOps
-from torchvision.datasets import ImageFolder
-from torchvision import transforms
 import matplotlib.pyplot as plt
 from pathlib import Path

@@ -23,9 +20,7 @@ from ._schema import FeatureSchema
 __all__ = [
     "DatasetMaker",
     "DatasetMakerMulti",
-    "VisionDatasetMaker",
-    "SequenceMaker",
-    "ResizeAspectFill",
+    "SequenceMaker"
 ]


@@ -126,8 +121,8 @@ class _BaseDatasetMaker(ABC):
         else:
             _LOGGER.info("No continuous features listed in schema. Scaler will not be fitted.")

-        X_train_values = X_train.values
-        X_test_values = X_test.values
+        X_train_values = X_train.to_numpy()
+        X_test_values = X_test.to_numpy()

         # continuous_feature_indices is derived
         if self.scaler is None and continuous_feature_indices:
@@ -253,26 +248,42 @@ class DatasetMaker(_BaseDatasetMaker):
                  pandas_df: pandas.DataFrame,
                  schema: FeatureSchema,
                  kind: Literal["regression", "classification"],
+                 scaler: Union[Literal["fit"], Literal["none"], PytorchScaler],
                  test_size: float = 0.2,
-                 random_state: int = 42,
-                 scaler: Optional[PytorchScaler] = None):
+                 random_state: int = 42):
         """
         Args:
             pandas_df (pandas.DataFrame):
                 The pre-processed input DataFrame containing all columns. (features and single target).
             schema (FeatureSchema):
                 The definitive schema object from data_exploration.
-            kind (
+            kind ("regression" | "classification"):
                 The type of ML task. This determines the data type of the labels.
+            scaler ("fit" | "none" | PytorchScaler):
+                Strategy for data scaling:
+                - "fit": Fit a new PytorchScaler on continuous features.
+                - "none": Do not scale data (e.g., for TabularTransformer).
+                - PytorchScaler instance: Use a pre-fitted scaler to transform data.
             test_size (float):
                 The proportion of the dataset to allocate to the test split.
             random_state (int):
                 The seed for the random number of generator for reproducibility.
-            scaler (PytorchScaler | None):
-                A pre-fitted PytorchScaler instance, if None a new scaler will be created.
+
         """
         super().__init__()
-
+
+        _apply_scaling: bool = False
+        if scaler == "fit":
+            self.scaler = None # To be created
+            _apply_scaling = True
+        elif scaler == "none":
+            self.scaler = None
+        elif isinstance(scaler, PytorchScaler):
+            self.scaler = scaler # Use the provided one
+            _apply_scaling = True
+        else:
+            _LOGGER.error(f"Invalid 'scaler' argument. Must be 'fit', 'none', or a PytorchScaler instance.")
+            raise ValueError()

         # --- 1. Identify features (from schema) ---
         self._feature_names = list(schema.feature_names)
@@ -310,9 +321,14 @@ class DatasetMaker(_BaseDatasetMaker):
         label_dtype = torch.float32 if kind == "regression" else torch.int64

         # --- 4. Scale (using the schema) ---
-
-
-
+        if _apply_scaling:
+            X_train_final, X_test_final = self._prepare_scaler(
+                X_train, y_train, X_test, label_dtype, schema
+            )
+        else:
+            _LOGGER.info("Features have not been scaled as specified.")
+            X_train_final = X_train.to_numpy()
+            X_test_final = X_test.to_numpy()

         # --- 5. Create Datasets ---
         self._train_ds = _PytorchDataset(X_train_final, y_train, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
@@ -336,9 +352,9 @@ class DatasetMakerMulti(_BaseDatasetMaker):
                  pandas_df: pandas.DataFrame,
                  target_columns: List[str],
                  schema: FeatureSchema,
+                 scaler: Union[Literal["fit"], Literal["none"], PytorchScaler],
                  test_size: float = 0.2,
-                 random_state: int = 42,
-                 scaler: Optional[PytorchScaler] = None):
+                 random_state: int = 42):
         """
         Args:
             pandas_df (pandas.DataFrame):
@@ -348,20 +364,35 @@
                 List of target column names.
             schema (FeatureSchema):
                 The definitive schema object from data_exploration.
+            scaler ("fit" | "none" | PytorchScaler):
+                Strategy for data scaling:
+                - "fit": Fit a new PytorchScaler on continuous features.
+                - "none": Do not scale data (e.g., for TabularTransformer).
+                - PytorchScaler instance: Use a pre-fitted scaler to transform data.
             test_size (float):
                 The proportion of the dataset to allocate to the test split.
             random_state (int):
                 The seed for the random number generator for reproducibility.
-            scaler (PytorchScaler | None):
-                A pre-fitted PytorchScaler instance.

         ## Note:
         For multi-binary classification, the most common PyTorch loss function is nn.BCEWithLogitsLoss.
         This loss function requires the labels to be torch.float32 which is the same type required for regression (multi-regression) tasks.
         """
         super().__init__()
-
-
+
+
+        _apply_scaling: bool = False
+        if scaler == "fit":
+            self.scaler = None
+            _apply_scaling = True
+        elif scaler == "none":
+            self.scaler = None
+        elif isinstance(scaler, PytorchScaler):
+            self.scaler = scaler # Use the provided one
+            _apply_scaling = True
+        else:
+            _LOGGER.error(f"Invalid 'scaler' argument. Must be 'fit', 'none', or a PytorchScaler instance.")
+            raise ValueError()

         # --- 1. Get features and targets from schema/args ---
         self._feature_names = list(schema.feature_names)
         self._target_names = target_columns
@@ -403,9 +434,14 @@ class DatasetMakerMulti(_BaseDatasetMaker):
         label_dtype = torch.float32

         # --- 4. Scale (using the schema) ---
-
-
-
+        if _apply_scaling:
+            X_train_final, X_test_final = self._prepare_scaler(
+                X_train, y_train, X_test, label_dtype, schema
+            )
+        else:
+            _LOGGER.info("Features have not been scaled as specified.")
+            X_train_final = X_train.to_numpy()
+            X_test_final = X_test.to_numpy()

         # --- 5. Create Datasets ---
         # _PytorchDataset now correctly handles y_train (a DataFrame)
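A short sketch of the new required scaler argument, based on the constructor signatures and docstrings in the hunks above; the DataFrame contents, target column names, and the helper function are placeholders, and the import paths follow the ml_tools layout shown in SOURCES.txt.

import pandas as pd

from ml_tools.ML_datasetmaster import DatasetMaker, DatasetMakerMulti
from ml_tools._schema import FeatureSchema


def build_datasets(df: pd.DataFrame, schema: FeatureSchema):
    # Single-target task: fit a fresh PytorchScaler on the continuous features.
    single = DatasetMaker(
        pandas_df=df,
        schema=schema,
        kind="regression",
        scaler="fit",
        test_size=0.2,
        random_state=42,
    )
    # Multi-target task: skip scaling, e.g. when feeding a TabularTransformer.
    multi = DatasetMakerMulti(
        pandas_df=df,
        target_columns=["target_a", "target_b"],  # hypothetical column names
        schema=schema,
        scaler="none",
    )
    return single, multi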
@@ -432,149 +468,6 @@ class _BaseMaker(ABC):
         pass


-# --- VisionDatasetMaker ---
-class VisionDatasetMaker(_BaseMaker):
-    """
-    Creates processed PyTorch datasets for computer vision tasks from an
-    image folder directory.
-
-    Uses online augmentations per epoch (image augmentation without creating new files).
-    """
-    def __init__(self, full_dataset: ImageFolder):
-        super().__init__()
-        self.full_dataset = full_dataset
-        self.labels = [s[1] for s in self.full_dataset.samples]
-        self.class_map = full_dataset.class_to_idx
-
-        self._is_split = False
-        self._are_transforms_configured = False
-
-    @classmethod
-    def from_folder(cls, root_dir: str) -> 'VisionDatasetMaker':
-        """Creates a maker instance from a root directory of images."""
-        initial_transform = transforms.Compose([transforms.ToTensor()])
-        full_dataset = ImageFolder(root=root_dir, transform=initial_transform)
-        _LOGGER.info(f"Found {len(full_dataset)} images in {len(full_dataset.classes)} classes.")
-        return cls(full_dataset)
-
-    @staticmethod
-    def inspect_folder(path: Union[str, Path]):
-        """
-        Logs a report of the types, sizes, and channels of image files
-        found in the directory and its subdirectories.
-        """
-        path_obj = make_fullpath(path)
-
-        non_image_files = set()
-        img_types = set()
-        img_sizes = set()
-        img_channels = set()
-        img_counter = 0
-
-        _LOGGER.info(f"Inspecting folder: {path_obj}...")
-        # Use rglob to recursively find all files
-        for filepath in path_obj.rglob('*'):
-            if filepath.is_file():
-                try:
-                    # Using PIL to open is a more reliable check
-                    with Image.open(filepath) as img:
-                        img_types.add(img.format)
-                        img_sizes.add(img.size)
-                        img_channels.update(img.getbands())
-                        img_counter += 1
-                except (IOError, SyntaxError):
-                    non_image_files.add(filepath.name)
-
-        if non_image_files:
-            _LOGGER.warning(f"Non-image or corrupted files found and ignored: {non_image_files}")
-
-        report = (
-            f"\n--- Inspection Report for '{path_obj.name}' ---\n"
-            f"Total images found: {img_counter}\n"
-            f"Image formats: {img_types or 'None'}\n"
-            f"Image sizes (WxH): {img_sizes or 'None'}\n"
-            f"Image channels (bands): {img_channels or 'None'}\n"
-            f"--------------------------------------"
-        )
-        print(report)
-
-    def split_data(self, val_size: float = 0.2, test_size: float = 0.0,
-                   stratify: bool = True, random_state: Optional[int] = None) -> 'VisionDatasetMaker':
-        """Splits the dataset into training, validation, and optional test sets."""
-        if self._is_split:
-            _LOGGER.warning("Data has already been split.")
-            return self
-
-        if val_size + test_size >= 1.0:
-            _LOGGER.error("The sum of val_size and test_size must be less than 1.")
-            raise ValueError()
-
-        indices = list(range(len(self.full_dataset)))
-        labels_for_split = self.labels if stratify else None
-
-        train_indices, val_test_indices = train_test_split(
-            indices, test_size=(val_size + test_size), random_state=random_state, stratify=labels_for_split
-        )
-
-        if test_size > 0:
-            val_test_labels = [self.labels[i] for i in val_test_indices]
-            stratify_val_test = val_test_labels if stratify else None
-            val_indices, test_indices = train_test_split(
-                val_test_indices, test_size=(test_size / (val_size + test_size)),
-                random_state=random_state, stratify=stratify_val_test
-            )
-            self._test_dataset = Subset(self.full_dataset, test_indices)
-            _LOGGER.info(f"Test set created with {len(self._test_dataset)} images.")
-        else:
-            val_indices = val_test_indices
-
-        self._train_dataset = Subset(self.full_dataset, train_indices)
-        self._val_dataset = Subset(self.full_dataset, val_indices)
-        self._is_split = True
-
-        _LOGGER.info(f"Data split into: \n- Training: {len(self._train_dataset)} images \n- Validation: {len(self._val_dataset)} images")
-        return self
-
-    def configure_transforms(self, resize_size: int = 256, crop_size: int = 224,
-                             mean: List[float] = [0.485, 0.456, 0.406],
-                             std: List[float] = [0.229, 0.224, 0.225],
-                             extra_train_transforms: Optional[List] = None) -> 'VisionDatasetMaker':
-        """Configures and applies the image transformations (augmentations)."""
-        if not self._is_split:
-            _LOGGER.error("Transforms must be configured AFTER splitting data. Call .split_data() first.")
-            raise RuntimeError()
-
-        base_train_transforms = [transforms.RandomResizedCrop(crop_size), transforms.RandomHorizontalFlip()]
-        if extra_train_transforms:
-            base_train_transforms.extend(extra_train_transforms)
-
-        final_transforms = [transforms.ToTensor(), transforms.Normalize(mean=mean, std=std)]
-
-        val_transform = transforms.Compose([transforms.Resize(resize_size), transforms.CenterCrop(crop_size), *final_transforms])
-        train_transform = transforms.Compose([*base_train_transforms, *final_transforms])
-
-        self._train_dataset.dataset.transform = train_transform # type: ignore
-        self._val_dataset.dataset.transform = val_transform # type: ignore
-        if self._test_dataset:
-            self._test_dataset.dataset.transform = val_transform # type: ignore
-
-        self._are_transforms_configured = True
-        _LOGGER.info("Image transforms configured and applied.")
-        return self
-
-    def get_datasets(self) -> Tuple[Dataset, ...]:
-        """Returns the final train, validation, and optional test datasets."""
-        if not self._is_split:
-            _LOGGER.error("Data has not been split. Call .split_data() first.")
-            raise RuntimeError()
-        if not self._are_transforms_configured:
-            _LOGGER.warning("Transforms have not been configured. Using default ToTensor only.")
-
-        if self._test_dataset:
-            return self._train_dataset, self._val_dataset, self._test_dataset
-        return self._train_dataset, self._val_dataset
-
-
 # --- SequenceMaker ---
 class SequenceMaker(_BaseMaker):
     """
@@ -763,40 +656,5 @@ class SequenceMaker(_BaseMaker):
         return self._train_dataset, self._test_dataset


-# --- Custom Vision Transform Class ---
-class ResizeAspectFill:
-    """
-    Custom transformation to make an image square by padding it to match the
-    longest side, preserving the aspect ratio. The image is finally centered.
-
-    Args:
-        pad_color (Union[str, int]): Color to use for the padding.
-            Defaults to "black".
-    """
-    def __init__(self, pad_color: Union[str, int] = "black") -> None:
-        self.pad_color = pad_color
-
-    def __call__(self, image: Image.Image) -> Image.Image:
-        if not isinstance(image, Image.Image):
-            _LOGGER.error(f"Expected PIL.Image.Image, got {type(image).__name__}")
-            raise TypeError()
-
-        w, h = image.size
-        if w == h:
-            return image
-
-        # Determine padding to center the image
-        if w > h:
-            top_padding = (w - h) // 2
-            bottom_padding = w - h - top_padding
-            padding = (0, top_padding, 0, bottom_padding)
-        else: # h > w
-            left_padding = (h - w) // 2
-            right_padding = h - w - left_padding
-            padding = (left_padding, 0, right_padding, 0)
-
-        return ImageOps.expand(image, padding, fill=self.pad_color)
-
-
 def info():
     _script_info(__all__)