dragon-ml-toolbox 12.5.0__tar.gz → 12.7.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dragon-ml-toolbox might be problematic. Click here for more details.
- {dragon_ml_toolbox-12.5.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-12.7.0}/PKG-INFO +1 -1
- {dragon_ml_toolbox-12.5.0 → dragon_ml_toolbox-12.7.0/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
- {dragon_ml_toolbox-12.5.0 → dragon_ml_toolbox-12.7.0}/ml_tools/ETL_cleaning.py +4 -4
- {dragon_ml_toolbox-12.5.0 → dragon_ml_toolbox-12.7.0}/ml_tools/ETL_engineering.py +2 -2
- {dragon_ml_toolbox-12.5.0 → dragon_ml_toolbox-12.7.0}/ml_tools/MICE_imputation.py +2 -2
- {dragon_ml_toolbox-12.5.0 → dragon_ml_toolbox-12.7.0}/ml_tools/ML_optimization.py +2 -2
- {dragon_ml_toolbox-12.5.0 → dragon_ml_toolbox-12.7.0}/ml_tools/ML_simple_optimization.py +2 -2
- {dragon_ml_toolbox-12.5.0 → dragon_ml_toolbox-12.7.0}/ml_tools/VIF_factor.py +2 -2
- {dragon_ml_toolbox-12.5.0 → dragon_ml_toolbox-12.7.0}/ml_tools/data_exploration.py +3 -3
- {dragon_ml_toolbox-12.5.0 → dragon_ml_toolbox-12.7.0}/ml_tools/ensemble_learning.py +2 -2
- {dragon_ml_toolbox-12.5.0 → dragon_ml_toolbox-12.7.0}/ml_tools/utilities.py +57 -8
- {dragon_ml_toolbox-12.5.0 → dragon_ml_toolbox-12.7.0}/pyproject.toml +1 -1
- {dragon_ml_toolbox-12.5.0 → dragon_ml_toolbox-12.7.0}/LICENSE +0 -0
- {dragon_ml_toolbox-12.5.0 → dragon_ml_toolbox-12.7.0}/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-12.5.0 → dragon_ml_toolbox-12.7.0}/README.md +0 -0
- {dragon_ml_toolbox-12.5.0 → dragon_ml_toolbox-12.7.0}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
- {dragon_ml_toolbox-12.5.0 → dragon_ml_toolbox-12.7.0}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
- {dragon_ml_toolbox-12.5.0 → dragon_ml_toolbox-12.7.0}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
- {dragon_ml_toolbox-12.5.0 → dragon_ml_toolbox-12.7.0}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
- {dragon_ml_toolbox-12.5.0 → dragon_ml_toolbox-12.7.0}/ml_tools/GUI_tools.py +0 -0
- {dragon_ml_toolbox-12.5.0 → dragon_ml_toolbox-12.7.0}/ml_tools/ML_callbacks.py +0 -0
- {dragon_ml_toolbox-12.5.0 → dragon_ml_toolbox-12.7.0}/ml_tools/ML_datasetmaster.py +0 -0
- {dragon_ml_toolbox-12.5.0 → dragon_ml_toolbox-12.7.0}/ml_tools/ML_evaluation.py +0 -0
- {dragon_ml_toolbox-12.5.0 → dragon_ml_toolbox-12.7.0}/ml_tools/ML_evaluation_multi.py +0 -0
- {dragon_ml_toolbox-12.5.0 → dragon_ml_toolbox-12.7.0}/ml_tools/ML_inference.py +0 -0
- {dragon_ml_toolbox-12.5.0 → dragon_ml_toolbox-12.7.0}/ml_tools/ML_models.py +0 -0
- {dragon_ml_toolbox-12.5.0 → dragon_ml_toolbox-12.7.0}/ml_tools/ML_scaler.py +0 -0
- {dragon_ml_toolbox-12.5.0 → dragon_ml_toolbox-12.7.0}/ml_tools/ML_trainer.py +0 -0
- {dragon_ml_toolbox-12.5.0 → dragon_ml_toolbox-12.7.0}/ml_tools/ML_utilities.py +0 -0
- {dragon_ml_toolbox-12.5.0 → dragon_ml_toolbox-12.7.0}/ml_tools/PSO_optimization.py +0 -0
- {dragon_ml_toolbox-12.5.0 → dragon_ml_toolbox-12.7.0}/ml_tools/RNN_forecast.py +0 -0
- {dragon_ml_toolbox-12.5.0 → dragon_ml_toolbox-12.7.0}/ml_tools/SQL.py +0 -0
- {dragon_ml_toolbox-12.5.0 → dragon_ml_toolbox-12.7.0}/ml_tools/__init__.py +0 -0
- {dragon_ml_toolbox-12.5.0 → dragon_ml_toolbox-12.7.0}/ml_tools/_logger.py +0 -0
- {dragon_ml_toolbox-12.5.0 → dragon_ml_toolbox-12.7.0}/ml_tools/_script_info.py +0 -0
- {dragon_ml_toolbox-12.5.0 → dragon_ml_toolbox-12.7.0}/ml_tools/constants.py +0 -0
- {dragon_ml_toolbox-12.5.0 → dragon_ml_toolbox-12.7.0}/ml_tools/custom_logger.py +0 -0
- {dragon_ml_toolbox-12.5.0 → dragon_ml_toolbox-12.7.0}/ml_tools/ensemble_evaluation.py +0 -0
- {dragon_ml_toolbox-12.5.0 → dragon_ml_toolbox-12.7.0}/ml_tools/ensemble_inference.py +0 -0
- {dragon_ml_toolbox-12.5.0 → dragon_ml_toolbox-12.7.0}/ml_tools/handle_excel.py +0 -0
- {dragon_ml_toolbox-12.5.0 → dragon_ml_toolbox-12.7.0}/ml_tools/keys.py +0 -0
- {dragon_ml_toolbox-12.5.0 → dragon_ml_toolbox-12.7.0}/ml_tools/math_utilities.py +0 -0
- {dragon_ml_toolbox-12.5.0 → dragon_ml_toolbox-12.7.0}/ml_tools/optimization_tools.py +0 -0
- {dragon_ml_toolbox-12.5.0 → dragon_ml_toolbox-12.7.0}/ml_tools/path_manager.py +0 -0
- {dragon_ml_toolbox-12.5.0 → dragon_ml_toolbox-12.7.0}/ml_tools/serde.py +0 -0
- {dragon_ml_toolbox-12.5.0 → dragon_ml_toolbox-12.7.0}/setup.cfg +0 -0
|
@@ -5,7 +5,7 @@ from typing import Union, List, Dict
|
|
|
5
5
|
|
|
6
6
|
from .path_manager import sanitize_filename, make_fullpath
|
|
7
7
|
from .data_exploration import drop_macro
|
|
8
|
-
from .utilities import
|
|
8
|
+
from .utilities import save_dataframe_filename, load_dataframe
|
|
9
9
|
from ._script_info import _script_info
|
|
10
10
|
from ._logger import _LOGGER
|
|
11
11
|
|
|
@@ -263,7 +263,7 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
|
|
|
263
263
|
df_final = _cleaner_core(df_in=df, all_lowercase=all_lowercase)
|
|
264
264
|
|
|
265
265
|
# Save cleaned dataframe
|
|
266
|
-
|
|
266
|
+
save_dataframe_filename(df=df_final, save_dir=output_path.parent, filename=output_path.name)
|
|
267
267
|
|
|
268
268
|
_LOGGER.info(f"Data successfully cleaned.")
|
|
269
269
|
|
|
@@ -329,7 +329,7 @@ def basic_clean_drop(input_filepath: Union[str,Path], output_filepath: Union[str
|
|
|
329
329
|
threshold=threshold)
|
|
330
330
|
|
|
331
331
|
# Save cleaned dataframe
|
|
332
|
-
|
|
332
|
+
save_dataframe_filename(df=df_final, save_dir=output_path.parent, filename=output_path.name)
|
|
333
333
|
|
|
334
334
|
_LOGGER.info(f"Data successfully cleaned.")
|
|
335
335
|
|
|
@@ -494,7 +494,7 @@ class DataFrameCleaner:
|
|
|
494
494
|
if isinstance(output_filepath, str):
|
|
495
495
|
output_filepath = make_fullpath(input_path=output_filepath, enforce="file")
|
|
496
496
|
|
|
497
|
-
|
|
497
|
+
save_dataframe_filename(df=df_clean, save_dir=output_filepath.parent, filename=output_filepath.name)
|
|
498
498
|
|
|
499
499
|
return None
|
|
500
500
|
|
|
@@ -3,7 +3,7 @@ import re
|
|
|
3
3
|
from pathlib import Path
|
|
4
4
|
from typing import Literal, Union, Optional, Any, Callable, List, Dict, Tuple
|
|
5
5
|
|
|
6
|
-
from .utilities import load_dataframe,
|
|
6
|
+
from .utilities import load_dataframe, save_dataframe_filename
|
|
7
7
|
from .path_manager import make_fullpath
|
|
8
8
|
from ._script_info import _script_info
|
|
9
9
|
from ._logger import _LOGGER
|
|
@@ -230,7 +230,7 @@ class DataProcessor:
|
|
|
230
230
|
df_processed = self.transform(df)
|
|
231
231
|
|
|
232
232
|
# save processed df
|
|
233
|
-
|
|
233
|
+
save_dataframe_filename(df=df_processed, save_dir=out_path.parent, filename=out_path.name)
|
|
234
234
|
|
|
235
235
|
def __str__(self) -> str:
|
|
236
236
|
"""
|
|
@@ -6,7 +6,7 @@ import numpy as np
|
|
|
6
6
|
from plotnine import ggplot, labs, theme, element_blank # type: ignore
|
|
7
7
|
from typing import Optional, Union
|
|
8
8
|
|
|
9
|
-
from .utilities import load_dataframe, merge_dataframes,
|
|
9
|
+
from .utilities import load_dataframe, merge_dataframes, save_dataframe_filename
|
|
10
10
|
from .math_utilities import threshold_binary_values
|
|
11
11
|
from .path_manager import sanitize_filename, make_fullpath, list_csv_paths
|
|
12
12
|
from ._logger import _LOGGER
|
|
@@ -75,7 +75,7 @@ def apply_mice(df: pd.DataFrame, df_name: str, binary_columns: Optional[list[str
|
|
|
75
75
|
def save_imputed_datasets(save_dir: Union[str, Path], imputed_datasets: list, df_targets: pd.DataFrame, imputed_dataset_names: list[str]):
|
|
76
76
|
for imputed_df, subname in zip(imputed_datasets, imputed_dataset_names):
|
|
77
77
|
merged_df = merge_dataframes(imputed_df, df_targets, direction="horizontal", verbose=False)
|
|
78
|
-
|
|
78
|
+
save_dataframe_filename(df=merged_df, save_dir=save_dir, filename=subname)
|
|
79
79
|
|
|
80
80
|
|
|
81
81
|
#Get names of features that had missing values before imputation
|
|
@@ -18,7 +18,7 @@ from .ML_inference import PyTorchInferenceHandler
|
|
|
18
18
|
from .keys import PyTorchInferenceKeys
|
|
19
19
|
from .SQL import DatabaseManager
|
|
20
20
|
from .optimization_tools import _save_result
|
|
21
|
-
from .utilities import
|
|
21
|
+
from .utilities import save_dataframe_filename
|
|
22
22
|
from .math_utilities import discretize_categorical_values
|
|
23
23
|
|
|
24
24
|
|
|
@@ -513,7 +513,7 @@ def _run_single_optimization_rep(
|
|
|
513
513
|
|
|
514
514
|
def _handle_pandas_log(logger: PandasLogger, save_path: Path, target_name: str):
|
|
515
515
|
log_dataframe = logger.to_dataframe()
|
|
516
|
-
|
|
516
|
+
save_dataframe_filename(df=log_dataframe, save_dir=save_path / "EvolutionLogs", filename=target_name)
|
|
517
517
|
|
|
518
518
|
|
|
519
519
|
def info():
|
|
@@ -18,7 +18,7 @@ from .ML_inference import PyTorchInferenceHandler
|
|
|
18
18
|
from .keys import PyTorchInferenceKeys
|
|
19
19
|
from .SQL import DatabaseManager
|
|
20
20
|
from .optimization_tools import _save_result
|
|
21
|
-
from .utilities import
|
|
21
|
+
from .utilities import save_dataframe_filename
|
|
22
22
|
from .math_utilities import threshold_binary_values
|
|
23
23
|
|
|
24
24
|
"""
|
|
@@ -406,7 +406,7 @@ def s_run_optimization(
|
|
|
406
406
|
|
|
407
407
|
def _handle_pandas_log(logger: PandasLogger, save_path: Path, target_name: str):
|
|
408
408
|
log_dataframe = logger.to_dataframe()
|
|
409
|
-
|
|
409
|
+
save_dataframe_filename(df=log_dataframe, save_dir=save_path / "EvolutionLogs", filename=target_name)
|
|
410
410
|
|
|
411
411
|
|
|
412
412
|
def info():
|
|
@@ -7,7 +7,7 @@ from statsmodels.tools.tools import add_constant
|
|
|
7
7
|
import warnings
|
|
8
8
|
from pathlib import Path
|
|
9
9
|
|
|
10
|
-
from .utilities import yield_dataframes_from_dir,
|
|
10
|
+
from .utilities import yield_dataframes_from_dir, save_dataframe_filename
|
|
11
11
|
from .path_manager import sanitize_filename, make_fullpath
|
|
12
12
|
from ._logger import _LOGGER
|
|
13
13
|
from ._script_info import _script_info
|
|
@@ -229,7 +229,7 @@ def compute_vif_multi(input_directory: Union[str, Path],
|
|
|
229
229
|
result_df, dropped_cols = drop_vif_based(df=df, vif_df=vif_dataframe)
|
|
230
230
|
|
|
231
231
|
if len(dropped_cols) > 0:
|
|
232
|
-
|
|
232
|
+
save_dataframe_filename(df=result_df, save_dir=output_dataset_path, filename=new_filename)
|
|
233
233
|
|
|
234
234
|
|
|
235
235
|
def info():
|
|
@@ -10,7 +10,7 @@ import re
|
|
|
10
10
|
from .path_manager import sanitize_filename, make_fullpath
|
|
11
11
|
from ._script_info import _script_info
|
|
12
12
|
from ._logger import _LOGGER
|
|
13
|
-
from .utilities import
|
|
13
|
+
from .utilities import save_dataframe_filename
|
|
14
14
|
|
|
15
15
|
|
|
16
16
|
# Keep track of all available tools, show using `info()`
|
|
@@ -269,7 +269,7 @@ def drop_macro(df: pd.DataFrame,
|
|
|
269
269
|
|
|
270
270
|
# Log initial state
|
|
271
271
|
missing_data = show_null_columns(df=df_clean)
|
|
272
|
-
|
|
272
|
+
save_dataframe_filename(df=missing_data.reset_index(drop=False),
|
|
273
273
|
save_dir=log_directory,
|
|
274
274
|
filename="Missing_Data_start")
|
|
275
275
|
|
|
@@ -298,7 +298,7 @@ def drop_macro(df: pd.DataFrame,
|
|
|
298
298
|
|
|
299
299
|
# log final state
|
|
300
300
|
missing_data = show_null_columns(df=df_clean)
|
|
301
|
-
|
|
301
|
+
save_dataframe_filename(df=missing_data.reset_index(drop=False),
|
|
302
302
|
save_dir=log_directory,
|
|
303
303
|
filename="Missing_Data_final")
|
|
304
304
|
|
|
@@ -14,7 +14,7 @@ from sklearn.model_selection import train_test_split
|
|
|
14
14
|
from sklearn.base import clone
|
|
15
15
|
|
|
16
16
|
from .utilities import yield_dataframes_from_dir, train_dataset_yielder
|
|
17
|
-
from .serde import
|
|
17
|
+
from .serde import serialize_object_filename
|
|
18
18
|
from .path_manager import sanitize_filename, make_fullpath
|
|
19
19
|
from ._script_info import _script_info
|
|
20
20
|
from .keys import EnsembleKeys
|
|
@@ -411,7 +411,7 @@ def _save_model(trained_model, model_name: str, target_name:str, feature_names:
|
|
|
411
411
|
EnsembleKeys.FEATURES: feature_names,
|
|
412
412
|
EnsembleKeys.TARGET: target_name}
|
|
413
413
|
|
|
414
|
-
|
|
414
|
+
serialize_object_filename(obj=to_save, save_dir=save_directory, filename=filename, verbose=False, raise_on_error=True)
|
|
415
415
|
|
|
416
416
|
|
|
417
417
|
# TRAIN EVALUATE PIPELINE
|
|
@@ -12,10 +12,11 @@ from ._logger import _LOGGER
|
|
|
12
12
|
# Keep track of available tools
|
|
13
13
|
__all__ = [
|
|
14
14
|
"load_dataframe",
|
|
15
|
+
"load_dataframe_greedy",
|
|
15
16
|
"yield_dataframes_from_dir",
|
|
16
17
|
"merge_dataframes",
|
|
18
|
+
"save_dataframe_filename",
|
|
17
19
|
"save_dataframe",
|
|
18
|
-
"save_dataframe_path",
|
|
19
20
|
"distribute_dataset_by_target",
|
|
20
21
|
"train_dataset_orchestrator",
|
|
21
22
|
"train_dataset_yielder"
|
|
@@ -124,6 +125,54 @@ def load_dataframe(
|
|
|
124
125
|
return df, df_name # type: ignore
|
|
125
126
|
|
|
126
127
|
|
|
128
|
+
def load_dataframe_greedy(directory: Union[str, Path],
|
|
129
|
+
use_columns: Optional[list[str]] = None,
|
|
130
|
+
all_strings: bool = False,
|
|
131
|
+
verbose: bool = True) -> pd.DataFrame:
|
|
132
|
+
"""
|
|
133
|
+
Greedily loads the first found CSV file from a directory into a Pandas DataFrame.
|
|
134
|
+
|
|
135
|
+
This function scans the specified directory for any CSV files. It will
|
|
136
|
+
attempt to load the *first* CSV file it finds using the `load_dataframe`
|
|
137
|
+
function as a Pandas DataFrame.
|
|
138
|
+
|
|
139
|
+
Args:
|
|
140
|
+
directory (str, Path):
|
|
141
|
+
The path to the directory to search for a CSV file.
|
|
142
|
+
use_columns (list[str] | None):
|
|
143
|
+
A list of column names to load. If None, all columns are loaded.
|
|
144
|
+
all_strings (bool):
|
|
145
|
+
If True, loads all columns as string data types.
|
|
146
|
+
|
|
147
|
+
Returns:
|
|
148
|
+
pd.DataFrame:
|
|
149
|
+
A pandas DataFrame loaded from the first CSV file found.
|
|
150
|
+
|
|
151
|
+
Raises:
|
|
152
|
+
FileNotFoundError:
|
|
153
|
+
If the specified directory does not exist or the CSV file path
|
|
154
|
+
found is invalid.
|
|
155
|
+
ValueError:
|
|
156
|
+
If the loaded DataFrame is empty or `use_columns` contains
|
|
157
|
+
invalid column names.
|
|
158
|
+
"""
|
|
159
|
+
# validate directory
|
|
160
|
+
dir_path = make_fullpath(directory, enforce="directory")
|
|
161
|
+
|
|
162
|
+
# list all csv files and grab one (should be the only one)
|
|
163
|
+
csv_dict = list_csv_paths(directory=dir_path, verbose=False)
|
|
164
|
+
|
|
165
|
+
for df_path in csv_dict.values():
|
|
166
|
+
df , _df_name = load_dataframe(df_path=df_path,
|
|
167
|
+
use_columns=use_columns,
|
|
168
|
+
kind="pandas",
|
|
169
|
+
all_strings=all_strings,
|
|
170
|
+
verbose=verbose)
|
|
171
|
+
break
|
|
172
|
+
|
|
173
|
+
return df
|
|
174
|
+
|
|
175
|
+
|
|
127
176
|
def yield_dataframes_from_dir(datasets_dir: Union[str,Path], verbose: bool=True):
|
|
128
177
|
"""
|
|
129
178
|
Iterates over all CSV files in a given directory, loading each into a Pandas DataFrame.
|
|
@@ -210,7 +259,7 @@ def merge_dataframes(
|
|
|
210
259
|
return merged_df
|
|
211
260
|
|
|
212
261
|
|
|
213
|
-
def
|
|
262
|
+
def save_dataframe_filename(df: Union[pd.DataFrame, pl.DataFrame], save_dir: Union[str,Path], filename: str) -> None:
|
|
214
263
|
"""
|
|
215
264
|
Saves a pandas or polars DataFrame to a CSV file.
|
|
216
265
|
|
|
@@ -250,11 +299,11 @@ def save_dataframe(df: Union[pd.DataFrame, pl.DataFrame], save_dir: Union[str,Pa
|
|
|
250
299
|
_LOGGER.info(f"Saved dataset: '{filename}' with shape: {df.shape}")
|
|
251
300
|
|
|
252
301
|
|
|
253
|
-
def
|
|
302
|
+
def save_dataframe(df: Union[pd.DataFrame, pl.DataFrame], full_path: Path):
|
|
254
303
|
"""
|
|
255
304
|
Saves a DataFrame to a specified full path.
|
|
256
305
|
|
|
257
|
-
This function is a
|
|
306
|
+
This function is a wrapper for `save_dataframe_filename()`. It takes a
|
|
258
307
|
single `pathlib.Path` object pointing to a `.csv` file.
|
|
259
308
|
|
|
260
309
|
Args:
|
|
@@ -265,9 +314,9 @@ def save_dataframe_path(df: Union[pd.DataFrame, pl.DataFrame], full_path: Path):
|
|
|
265
314
|
_LOGGER.error('A path object pointing to a .csv file must be provided.')
|
|
266
315
|
raise ValueError()
|
|
267
316
|
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
317
|
+
save_dataframe_filename(df=df,
|
|
318
|
+
save_dir=full_path.parent,
|
|
319
|
+
filename=full_path.name)
|
|
271
320
|
|
|
272
321
|
|
|
273
322
|
def distribute_dataset_by_target(
|
|
@@ -351,7 +400,7 @@ def train_dataset_orchestrator(list_of_dirs: list[Union[str,Path]],
|
|
|
351
400
|
filename = df_dir.name + '_' + target_name + '_' + df_name
|
|
352
401
|
else:
|
|
353
402
|
filename = target_name + '_' + df_name
|
|
354
|
-
|
|
403
|
+
save_dataframe_filename(df=df, save_dir=save_dir, filename=filename)
|
|
355
404
|
total_saved += 1
|
|
356
405
|
except Exception as e:
|
|
357
406
|
_LOGGER.error(f"Failed to process file '{df_path}'. Reason: {e}")
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dragon_ml_toolbox-12.5.0 → dragon_ml_toolbox-12.7.0}/dragon_ml_toolbox.egg-info/SOURCES.txt
RENAMED
|
File without changes
|
|
File without changes
|
{dragon_ml_toolbox-12.5.0 → dragon_ml_toolbox-12.7.0}/dragon_ml_toolbox.egg-info/requires.txt
RENAMED
|
File without changes
|
{dragon_ml_toolbox-12.5.0 → dragon_ml_toolbox-12.7.0}/dragon_ml_toolbox.egg-info/top_level.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|