dragon-ml-toolbox 2.0.0__py3-none-any.whl → 2.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dragon_ml_toolbox-2.0.0.dist-info → dragon_ml_toolbox-2.2.0.dist-info}/METADATA +1 -1
- dragon_ml_toolbox-2.2.0.dist-info/RECORD +21 -0
- ml_tools/ETL_engineering.py +543 -0
- ml_tools/MICE_imputation.py +27 -28
- ml_tools/PSO_optimization.py +15 -15
- ml_tools/VIF_factor.py +20 -17
- ml_tools/data_exploration.py +58 -32
- ml_tools/ensemble_learning.py +40 -42
- ml_tools/handle_excel.py +98 -78
- ml_tools/logger.py +13 -11
- ml_tools/utilities.py +165 -60
- dragon_ml_toolbox-2.0.0.dist-info/RECORD +0 -20
- {dragon_ml_toolbox-2.0.0.dist-info → dragon_ml_toolbox-2.2.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-2.0.0.dist-info → dragon_ml_toolbox-2.2.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-2.0.0.dist-info → dragon_ml_toolbox-2.2.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-2.0.0.dist-info → dragon_ml_toolbox-2.2.0.dist-info}/top_level.txt +0 -0
ml_tools/logger.py
CHANGED

@@ -1,11 +1,11 @@
-import …
+from pathlib import Path
 from datetime import datetime
 from typing import Union, List, Dict, Any
 import pandas as pd
 from openpyxl.styles import Font, PatternFill
 import traceback
 import json
-from .utilities import sanitize_filename, _script_info
+from .utilities import sanitize_filename, _script_info, make_fullpath


 __all__ = [
@@ -21,7 +21,7 @@ def custom_logger(
         str,
         BaseException
     ],
-    save_directory: str,
+    save_directory: Union[str, Path],
     log_name: str,
 ) -> None:
     """
@@ -54,10 +54,12 @@ def custom_logger(
         ValueError: If the data type is unsupported.
     """
     try:
-…
+        save_path = make_fullpath(save_directory, make=True)
+
         timestamp = datetime.now().strftime(r"%Y%m%d_%H%M%S")
         log_name = sanitize_filename(log_name)
-…
+
+        base_path = save_path / f"{log_name}_{timestamp}"

         if isinstance(data, list):
             _log_list_to_txt(data, base_path + ".txt")
@@ -86,7 +88,7 @@ def custom_logger(
         print(f"Error in custom_logger: {e}")


-def _log_list_to_txt(data: List[Any], path: str) -> None:
+def _log_list_to_txt(data: List[Any], path: Path) -> None:
     log_lines = []
     for item in data:
         try:
@@ -98,7 +100,7 @@ def _log_list_to_txt(data: List[Any], path: str) -> None:
         f.write('\n'.join(log_lines))


-def _log_dict_to_csv(data: Dict[Any, List[Any]], path: str) -> None:
+def _log_dict_to_csv(data: Dict[Any, List[Any]], path: Path) -> None:
     sanitized_dict = {}
     max_length = max(len(v) for v in data.values()) if data else 0

@@ -113,7 +115,7 @@ def _log_dict_to_csv(data: Dict[Any, List[Any]], path: str) -> None:
     df.to_csv(path, index=False)


-def _log_dataframe_to_xlsx(data: pd.DataFrame, path: str) -> None:
+def _log_dataframe_to_xlsx(data: pd.DataFrame, path: Path) -> None:
     writer = pd.ExcelWriter(path, engine='openpyxl')
     data.to_excel(writer, index=True, sheet_name='Data')

@@ -134,18 +136,18 @@ def _log_dataframe_to_xlsx(data: pd.DataFrame, path: str) -> None:
     writer.close()


-def _log_string_to_log(data: str, path: str) -> None:
+def _log_string_to_log(data: str, path: Path) -> None:
     with open(path, 'w', encoding='utf-8') as f:
         f.write(data.strip() + '\n')


-def _log_exception_to_log(exc: BaseException, path: str) -> None:
+def _log_exception_to_log(exc: BaseException, path: Path) -> None:
     with open(path, 'w', encoding='utf-8') as f:
         f.write("Exception occurred:\n")
         traceback.print_exception(type(exc), exc, exc.__traceback__, file=f)


-def _log_dict_to_json(data: Dict[Any, Any], path: str) -> None:
+def _log_dict_to_json(data: Dict[Any, Any], path: Path) -> None:
     with open(path, 'w', encoding='utf-8') as f:
         json.dump(data, f, indent=4, ensure_ascii=False)
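In practice, custom_logger now accepts either a str or a pathlib.Path for save_directory and creates the directory on demand via make_fullpath(..., make=True). A minimal usage sketch (data values and paths are illustrative, not taken from the package):

from pathlib import Path
from ml_tools.logger import custom_logger

# A dict of lists is written out as CSV (see _log_dict_to_csv above).
history = {"epoch": [1, 2, 3], "loss": [0.91, 0.74, 0.62]}

custom_logger(
    data=history,
    save_directory=Path("logs") / "run_01",  # plain strings still work
    log_name="training_history",             # sanitized, then timestamped
)

One caveat visible in the hunks above: base_path is now a Path, but the unchanged call _log_list_to_txt(data, base_path + ".txt") still concatenates it with a str, which raises TypeError; judging by this diff alone, list input would land in the function's catch-all handler that prints "Error in custom_logger".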
ml_tools/utilities.py
CHANGED

@@ -2,7 +2,6 @@ import math
 import numpy as np
 import pandas as pd
 import polars as pl
-import os
 from pathlib import Path
 import re
 from typing import Literal, Union, Sequence, Optional, Any, Iterator, Tuple
@@ -12,6 +11,7 @@ from joblib.externals.loky.process_executor import TerminatedWorkerError

 # Keep track of available tools
 __all__ = [
+    "make_fullpath",
     "list_csv_paths",
     "list_files_by_extension",
     "load_dataframe",
@@ -28,27 +28,83 @@ __all__ = [
 ]


-def list_csv_paths(directory: str) -> dict[str, str]:
+def make_fullpath(
+    input_path: Union[str, Path],
+    make: bool = False,
+    verbose: bool = False
+) -> Path:
     """
-…
+    Resolves a string or Path into an absolute Path.
+
+    - If the path exists, it is returned.
+    - If the path does not exist and `make=True`, it will:
+        - Create the file if the path has a suffix (i.e., is treated as a file)
+        - Create the directory if it has no suffix
+    - If `make=False` and the path does not exist, an error is raised.
+    - Optionally prints whether the resolved path is a file or directory.

     Parameters:
-…
+        input_path (str | Path): Path to resolve.
+        make (bool): If True, attempt to create file or directory.
+        verbose (bool): Print classification after resolution.

     Returns:
-…
+        Path: Resolved absolute path.
+
+    Raises:
+        ValueError: If the path doesn't exist and can't be created.
+    """
+    path = Path(input_path).expanduser()
+
+    is_file = path.suffix != ""
+
+    try:
+        resolved = path.resolve(strict=True)
+    except FileNotFoundError:
+        if not make:
+            raise ValueError(f"❌ Path does not exist: '{path}'")
+
+        try:
+            if is_file:
+                # Create parent directories first
+                path.parent.mkdir(parents=True, exist_ok=True)
+                path.touch(exist_ok=False)
+            else:
+                path.mkdir(parents=True, exist_ok=True)
+            resolved = path.resolve(strict=True)
+        except Exception as e:
+            raise ValueError(f"❌ Failed to create {'file' if is_file else 'directory'} '{path}': {e}")
+
+    if verbose:
+        if resolved.is_file():
+            print("📄 Path is a File")
+        elif resolved.is_dir():
+            print("📁 Path is a Directory")
+        else:
+            print("❓ Path exists but is neither file nor directory")
+
+    return resolved
+
+
+
+def list_csv_paths(directory: Union[str,Path]) -> dict[str, Path]:
     """
-…
+    Lists all `.csv` files in the specified directory and returns a mapping: filenames (without extensions) to their absolute paths.

-…
-…
+    Parameters:
+        directory (str | Path): Path to the directory containing `.csv` files.
+
+    Returns:
+        (dict[str, Path]): Dictionary mapping {filename: filepath}.
+    """
+    dir_path = make_fullpath(directory)

     csv_paths = list(dir_path.glob("*.csv"))
     if not csv_paths:
-        raise IOError(f"No CSV files found in directory: {dir_path}")
+        raise IOError(f"No CSV files found in directory: {dir_path.name}")

     # make a dictionary of paths and names
-    name_path_dict = {p.stem: …
+    name_path_dict = {p.stem: p for p in csv_paths}

     print("\n🗂️ CSV files found:")
     for name in name_path_dict.keys():
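The behavior of the new helper, restated as a short sketch based on the implementation above (all paths illustrative):

from ml_tools.utilities import make_fullpath

# An existing directory is resolved to an absolute Path and returned.
data_dir = make_fullpath("~/datasets", verbose=True)  # expanduser() handles the '~'

# A missing path with make=True: 'metrics.csv' has a suffix, so it is treated
# as a file; parent directories are created first, then the file is touched.
log_file = make_fullpath("outputs/run_01/metrics.csv", make=True)

# A missing path with make=False (the default) raises ValueError.
try:
    make_fullpath("does/not/exist")
except ValueError as err:
    print(err)

Note the design choice: file-versus-directory classification rests entirely on path.suffix, so a directory named "v2.0" would be created as a file, and an extensionless file name such as "Makefile" as a directory.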
@@ -57,22 +113,19 @@ def list_csv_paths(directory: str) -> dict[str, str]:
     return name_path_dict


-def list_files_by_extension(directory: str, extension: str) -> dict[str, str]:
+def list_files_by_extension(directory: Union[str,Path], extension: str) -> dict[str, Path]:
     """
     Lists all files with the specified extension in the given directory and returns a mapping:
     filenames (without extensions) to their absolute paths.

     Parameters:
-        directory (str): Path to the directory to search in.
+        directory (str | Path): Path to the directory to search in.
         extension (str): File extension to search for (e.g., 'json', 'txt').

     Returns:
-        (dict[str, str]): Dictionary mapping {filename: filepath}.
+        (dict[str, Path]): Dictionary mapping {filename: filepath}.
     """
-    dir_path = …
-…
-    if not dir_path.is_dir():
-        raise FileNotFoundError(f"Directory not found: {dir_path}")
+    dir_path = make_fullpath(directory)

     # Normalize the extension (remove leading dot if present)
     normalized_ext = extension.lstrip(".").lower()
@@ -82,7 +135,7 @@ def list_files_by_extension(directory: str, extension: str) -> dict[str, str]:
     if not matched_paths:
         raise IOError(f"No '.{normalized_ext}' files found in directory: {dir_path}")

-    name_path_dict = {p.stem: …
+    name_path_dict = {p.stem: p for p in matched_paths}

     print(f"\n📂 '{normalized_ext.upper()}' files found:")
     for name in name_path_dict:
@@ -91,32 +144,70 @@ def list_files_by_extension(directory: str, extension: str) -> dict[str, str]:
     return name_path_dict


-def load_dataframe(…
+def load_dataframe(
+    df_path: Union[str, Path],
+    kind: Literal["pandas", "polars"] = "pandas",
+    all_strings: bool = False
+) -> Tuple[Union[pd.DataFrame, pl.DataFrame], str]:
     """
-    Load a CSV file into a …
+    Load a CSV file into a DataFrame and extract its base name.
+
+    Can load data as either a pandas or a polars DataFrame. Allows for loading all
+    columns as string types to prevent type inference errors.

     Args:
-        df_path (str):
+        df_path (Union[str, Path]):
+            The path to the CSV file.
+        kind (Literal["pandas", "polars"], optional):
+            The type of DataFrame to load. Defaults to "pandas".
+        all_strings (bool, optional):
+            If True, loads all columns as string data types. This is useful for
+            ETL tasks and to avoid type-inference errors. Defaults to False.

     Returns:
-        Tuple…
-…
+        (Tuple[DataFrameType, str]):
+            A tuple containing the loaded DataFrame (either pandas or polars)
+            and the base name of the file (without extension).
+
+    Raises:
+        FileNotFoundError: If the file does not exist at the given path.
+        ValueError: If the DataFrame is empty or an invalid 'kind' is provided.
     """
-    path = …
-…
+    path = make_fullpath(df_path)
+
     df_name = path.stem
-…
-…
-…
+
+    if kind == "pandas":
+        if all_strings:
+            df = pd.read_csv(path, encoding='utf-8', dtype=str)
+        else:
+            df = pd.read_csv(path, encoding='utf-8')
+
+    elif kind == "polars":
+        if all_strings:
+            df = pl.read_csv(path, infer_schema=False)
+        else:
+            # Default behavior: infer the schema.
+            df = pl.read_csv(path, infer_schema_length=1000)
+
+    else:
+        raise ValueError(f"Invalid kind '{kind}'. Must be one of 'pandas' or 'polars'.")
+
+    # This check works for both pandas and polars DataFrames
+    if df.shape[0] == 0:
+        raise ValueError(f"DataFrame '{df_name}' loaded from '{path}' is empty.")
+
+    print(f"\n💿 Loaded {kind} dataset: '{df_name}' with shape: {df.shape}")
+
     return df, df_name


-def yield_dataframes_from_dir(datasets_dir: str):
+def yield_dataframes_from_dir(datasets_dir: Union[str,Path]):
     """
     Iterates over all CSV files in a given directory, loading each into a pandas DataFrame.

     Parameters:
-        datasets_dir (str):
+        datasets_dir (str | Path):
             The path to the directory containing `.csv` dataset files.

     Yields:
@@ -129,7 +220,8 @@ def yield_dataframes_from_dir(datasets_dir: str):
     - CSV files are read using UTF-8 encoding.
     - Output is streamed via a generator to support lazy loading of multiple datasets.
     """
-…
+    datasets_path = make_fullpath(datasets_dir)
+    for df_name, df_path in list_csv_paths(datasets_path).items():
         df, _ = load_dataframe(df_path)
         yield df, df_name
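A sketch of the reworked loader (the CSV path is illustrative):

from ml_tools.utilities import load_dataframe

# Default: pandas with inferred dtypes; the file's stem comes back as its name.
df, name = load_dataframe("data/measurements.csv")

# Polars with schema inference disabled: every column arrives as a string,
# which avoids type-inference errors during ETL.
raw, _ = load_dataframe("data/measurements.csv", kind="polars", all_strings=True)

Both branches now raise ValueError on a zero-row result, so callers no longer need their own emptiness check.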
@@ -193,29 +285,42 @@ def merge_dataframes(
     return merged_df


-def save_dataframe(df: pd.DataFrame, save_dir: str, filename: str) -> None:
+def save_dataframe(df: Union[pd.DataFrame, pl.DataFrame], save_dir: Union[str,Path], filename: str) -> None:
     """
-…
+    Saves a pandas or polars DataFrame to a CSV file.

-…
-    df …
-…
-…
+    Args:
+        df (Union[pd.DataFrame, pl.DataFrame]):
+            The DataFrame to save.
+        save_dir (Union[str, Path]):
+            The directory where the CSV file will be saved.
+        filename (str):
+            The CSV filename. The '.csv' extension will be added if missing.
     """
-…
+    # This check works for both pandas and polars
+    if df.shape[0] == 0:
         print(f"⚠️ Attempting to save an empty DataFrame: '{filename}'. Process Skipped.")
         return

-…
+    # Create the directory if it doesn't exist
+    save_path = make_fullpath(save_dir, make=True)

+    # Clean the filename
     filename = sanitize_filename(filename)
-…
     if not filename.endswith('.csv'):
         filename += '.csv'

-    output_path = …
+    output_path = save_path / filename

-…
+    # --- Type-specific saving logic ---
+    if isinstance(df, pd.DataFrame):
+        df.to_csv(output_path, index=False, encoding='utf-8')
+    elif isinstance(df, pl.DataFrame):
+        df.write_csv(output_path) # Polars defaults to utf8 and no index
+    else:
+        # This error handles cases where an unsupported type is passed
+        raise TypeError(f"Unsupported DataFrame type: {type(df)}. Must be pandas or polars.")
+
     print(f"✅ Saved dataset: '{filename}' with shape: {df.shape}")
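With the type dispatch above, saving works the same way for both backends; a sketch with illustrative names:

import polars as pl
from ml_tools.utilities import save_dataframe

df = pl.DataFrame({"feature": [0.1, 0.2], "target": [1, 0]})

# The directory is created if needed, the name goes through sanitize_filename,
# and '.csv' is appended when missing.
save_dataframe(df, save_dir="outputs/processed", filename="training_set")

Anything other than a pandas or polars DataFrame now fails fast with an explicit TypeError.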
@@ -392,24 +497,24 @@ def threshold_binary_values_batch(
     return np.hstack([cont_part, bin_part])


-def serialize_object(obj: Any, save_dir: str, filename: str, verbose: bool=True, raise_on_error: bool=False) -> Optional[str]:
+def serialize_object(obj: Any, save_dir: Union[str,Path], filename: str, verbose: bool=True, raise_on_error: bool=False) -> Optional[Path]:
     """
     Serializes a Python object using joblib; suitable for Python built-ins, numpy, and pandas.

     Parameters:
         obj (Any) : The Python object to serialize.
-        save_dir (str) : Directory path where the serialized object will be saved.
+        save_dir (str | Path) : Directory path where the serialized object will be saved.
         filename (str) : Name for the output file, extension will be appended if needed.

     Returns:
-        (str | None) : The full file path where the object was saved if successful; otherwise, None.
+        (Path | None) : The full file path where the object was saved if successful; otherwise, None.
     """
     try:
-…
+        save_path = make_fullpath(save_dir, make=True)
         sanitized_name = sanitize_filename(filename)
         if not sanitized_name.endswith('.joblib'):
             sanitized_name = sanitized_name + ".joblib"
-        full_path = …
+        full_path = save_path / sanitized_name
         joblib.dump(obj, full_path)
     except (IOError, OSError, TypeError, TerminatedWorkerError) as e:
         message = f"❌ Failed to serialize object of type '{type(obj)}': {e}"
@@ -424,23 +529,22 @@ def serialize_object(obj: Any, save_dir: str, filename: str, verbose: bool=True,
     return full_path


-def deserialize_object(filepath: str, verbose: bool=True, raise_on_error: bool=True) -> Optional[Any]:
+def deserialize_object(filepath: Union[str,Path], verbose: bool=True, raise_on_error: bool=True) -> Optional[Any]:
     """
     Loads a serialized object from a .joblib file.

     Parameters:
-        filepath (str): Full path to the serialized .joblib file.
+        filepath (str | Path): Full path to the serialized .joblib file.

     Returns:
         (Any | None): The deserialized Python object, or None if loading fails.
     """
-…
-…
-        return None
+    true_filepath = make_fullpath(filepath)
+
     try:
-        obj = joblib.load(filepath)
+        obj = joblib.load(true_filepath)
     except (IOError, OSError, EOFError, TypeError, ValueError) as e:
-        message = f"❌ Failed to deserialize object from '{filepath}': {e}"
+        message = f"❌ Failed to deserialize object from '{true_filepath}': {e}"
         if raise_on_error:
             raise Exception(message)
         else:
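The two helpers now compose into a Path-based round trip; a sketch with an illustrative object:

from ml_tools.utilities import serialize_object, deserialize_object

params = {"n_estimators": 300, "max_depth": 8}

# Returns the full Path on success; with raise_on_error=False (the default),
# a failure is reported and None is returned instead.
saved_path = serialize_object(params, save_dir="artifacts", filename="params")

if saved_path is not None:
    restored = deserialize_object(saved_path)  # Path input is accepted directly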
@@ -453,7 +557,7 @@ def deserialize_object(filepath: str, verbose: bool=True, raise_on_error: bool=True) -> Optional[Any]:


 def distribute_datasets_by_target(
-    df_or_path: Union[pd.DataFrame, str],
+    df_or_path: Union[pd.DataFrame, str, Path],
     target_columns: list[str],
     verbose: bool = False
 ) -> Iterator[Tuple[str, pd.DataFrame]]:
@@ -463,7 +567,7 @@ def distribute_datasets_by_target(

     Parameters
     ----------
-    df_or_path : [pd.DataFrame | str]
+    df_or_path : [pd.DataFrame | str | Path]
         Dataframe or path to Dataframe with all feature and target columns ready to split and train a model.
     target_columns : List[str]
         List of target column names to generate per-target DataFrames.
@@ -476,9 +580,10 @@ def distribute_datasets_by_target(
         * Target name.
         * Pandas DataFrame.
     """
-    # Validate path
-    if isinstance(df_or_path, str):
-…
+    # Validate path or dataframe
+    if isinstance(df_or_path, str) or isinstance(df_or_path, Path):
+        df_path = make_fullpath(df_or_path)
+        df, _ = load_dataframe(df_path)
     else:
         df = df_or_path

@@ -486,7 +591,7 @@ def distribute_datasets_by_target(
     feature_columns = [col for col in df.columns if col not in valid_targets]

     for target in valid_targets:
-        subset = df[feature_columns + [target]].dropna(subset=[target])
+        subset = df[feature_columns + [target]].dropna(subset=[target]) # type: ignore
         if verbose:
             print(f"Target: '{target}' - Dataframe shape: {subset.shape}")
         yield target, subset
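Since the function is a generator, the per-target frames are built lazily; a usage sketch with illustrative column names:

from ml_tools.utilities import distribute_datasets_by_target

# A DataFrame, a str, or (new in this release) a Path is accepted.
for target, frame in distribute_datasets_by_target(
    "data/full_dataset.csv",
    target_columns=["yield", "purity"],
    verbose=True,
):
    print(target, frame.shape)  # all features plus one target; rows with a NaN target are dropped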
dragon_ml_toolbox-2.0.0.dist-info/RECORD
DELETED

@@ -1,20 +0,0 @@
-dragon_ml_toolbox-2.0.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
-dragon_ml_toolbox-2.0.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=6cfpIeQ6D4Mcs10nkogQrkVyq1T7i2qXjjNHFoUMOyE,1892
-ml_tools/MICE_imputation.py,sha256=wIfl8I3SyHUett-0vizaCiv0y_q43-zij8VczsbEIOI,11088
-ml_tools/PSO_optimization.py,sha256=bNiuKqyVoShGM4VBx4exJ8jjVVxQjlunkVpzaMb7fwY,20850
-ml_tools/VIF_factor.py,sha256=HEBsLJy_qSDaPw1Btha5B7omxN4wjJXg-sqoetCjCJw,10016
-ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ml_tools/_particle_swarm_optimization.py,sha256=b_eNNkA89Y40hj76KauivT8KLScH1B9wF2IXptOqkOw,22220
-ml_tools/data_exploration.py,sha256=NfPuN57wL5CXBnRyvIayxaYMe_ZKieHT3ZIcmtO_XIQ,20115
-ml_tools/datasetmaster.py,sha256=EFUEX-tqq94Ak1rXXYR-XaX85olrxvF2EuytdzUK7y0,29131
-ml_tools/ensemble_learning.py,sha256=v_btCkVthuEl3Pu1WipASvU5lGAVbXxxKEMq3boF-HI,37305
-ml_tools/handle_excel.py,sha256=NrCOWSENgb1HdqId_QOdPTjBUIJPePI9a2pnmmBd3lw,12613
-ml_tools/logger.py,sha256=WI7wiGmmALCQPl0AIauw_mPzFNTbaQf0v9J8pojvHUg,4708
-ml_tools/pytorch_models.py,sha256=bpWZsrSwCvHJQkR6UfoPpElsMv9AvmiNErNHC8NYB_I,10132
-ml_tools/trainer.py,sha256=WAZ4EdrZuTOAnGXRWV3XcLNce4s7EKGf2-qchLC08Ik,15702
-ml_tools/utilities.py,sha256=_7RDgk9uBxPuHJRVOOFYFUOZyJ1o9QILnxYsKdGCfLQ,16772
-ml_tools/vision_helpers.py,sha256=idQ-Ugp1IdsvwXiYyhYa9G3rTRTm37YRpkQDLEpANHM,7701
-dragon_ml_toolbox-2.0.0.dist-info/METADATA,sha256=7MHJGUXvWThm8-Rv9NZyogTQKBBMH4x0EXLsHel9Dns,2974
-dragon_ml_toolbox-2.0.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-dragon_ml_toolbox-2.0.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
-dragon_ml_toolbox-2.0.0.dist-info/RECORD,,
{dragon_ml_toolbox-2.0.0.dist-info → dragon_ml_toolbox-2.2.0.dist-info}/WHEEL
File without changes

{dragon_ml_toolbox-2.0.0.dist-info → dragon_ml_toolbox-2.2.0.dist-info}/licenses/LICENSE
File without changes

{dragon_ml_toolbox-2.0.0.dist-info → dragon_ml_toolbox-2.2.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md
File without changes

{dragon_ml_toolbox-2.0.0.dist-info → dragon_ml_toolbox-2.2.0.dist-info}/top_level.txt
File without changes