dragon-ml-toolbox 10.1.1__py3-none-any.whl → 14.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dragon-ml-toolbox might be problematic.
- {dragon_ml_toolbox-10.1.1.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/METADATA +38 -63
- dragon_ml_toolbox-14.2.0.dist-info/RECORD +48 -0
- {dragon_ml_toolbox-10.1.1.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/licenses/LICENSE +1 -1
- {dragon_ml_toolbox-10.1.1.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +11 -0
- ml_tools/ETL_cleaning.py +175 -59
- ml_tools/ETL_engineering.py +506 -70
- ml_tools/GUI_tools.py +2 -1
- ml_tools/MICE_imputation.py +212 -7
- ml_tools/ML_callbacks.py +73 -40
- ml_tools/ML_datasetmaster.py +267 -284
- ml_tools/ML_evaluation.py +119 -58
- ml_tools/ML_evaluation_multi.py +107 -32
- ml_tools/ML_inference.py +15 -5
- ml_tools/ML_models.py +234 -170
- ml_tools/ML_models_advanced.py +323 -0
- ml_tools/ML_optimization.py +321 -97
- ml_tools/ML_scaler.py +10 -5
- ml_tools/ML_trainer.py +585 -40
- ml_tools/ML_utilities.py +528 -0
- ml_tools/ML_vision_datasetmaster.py +1315 -0
- ml_tools/ML_vision_evaluation.py +260 -0
- ml_tools/ML_vision_inference.py +428 -0
- ml_tools/ML_vision_models.py +627 -0
- ml_tools/ML_vision_transformers.py +58 -0
- ml_tools/PSO_optimization.py +10 -7
- ml_tools/RNN_forecast.py +2 -0
- ml_tools/SQL.py +22 -9
- ml_tools/VIF_factor.py +4 -3
- ml_tools/_ML_vision_recipe.py +88 -0
- ml_tools/__init__.py +1 -0
- ml_tools/_logger.py +0 -2
- ml_tools/_schema.py +96 -0
- ml_tools/constants.py +79 -0
- ml_tools/custom_logger.py +164 -16
- ml_tools/data_exploration.py +1092 -109
- ml_tools/ensemble_evaluation.py +48 -1
- ml_tools/ensemble_inference.py +6 -7
- ml_tools/ensemble_learning.py +4 -3
- ml_tools/handle_excel.py +1 -0
- ml_tools/keys.py +80 -0
- ml_tools/math_utilities.py +259 -0
- ml_tools/optimization_tools.py +198 -24
- ml_tools/path_manager.py +144 -45
- ml_tools/serde.py +192 -0
- ml_tools/utilities.py +287 -227
- dragon_ml_toolbox-10.1.1.dist-info/RECORD +0 -36
- {dragon_ml_toolbox-10.1.1.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-10.1.1.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/top_level.txt +0 -0
ml_tools/utilities.py
CHANGED
@@ -1,27 +1,25 @@
-import math
 import numpy as np
 import pandas as pd
 import polars as pl
 from pathlib import Path
-from typing import Literal, Union,
-
-from joblib.externals.loky.process_executor import TerminatedWorkerError
+from typing import Literal, Union, Optional, Any, Iterator, Tuple, overload
+
 from .path_manager import sanitize_filename, make_fullpath, list_csv_paths
 from ._script_info import _script_info
 from ._logger import _LOGGER
+from ._schema import FeatureSchema
 
 
 # Keep track of available tools
 __all__ = [
     "load_dataframe",
+    "load_dataframe_greedy",
+    "load_dataframe_with_schema",
     "yield_dataframes_from_dir",
     "merge_dataframes",
+    "save_dataframe_filename",
     "save_dataframe",
-    "
-    "threshold_binary_values",
-    "threshold_binary_values_batch",
-    "serialize_object",
-    "deserialize_object",
+    "save_dataframe_with_schema",
     "distribute_dataset_by_target",
     "train_dataset_orchestrator",
     "train_dataset_yielder"
@@ -32,6 +30,7 @@ __all__ = [
 @overload
 def load_dataframe(
     df_path: Union[str, Path],
+    use_columns: Optional[list[str]] = None,
     kind: Literal["pandas"] = "pandas",
     all_strings: bool = False,
     verbose: bool = True
@@ -42,7 +41,8 @@ def load_dataframe(
 @overload
 def load_dataframe(
     df_path: Union[str, Path],
-
+    use_columns: Optional[list[str]] = None,
+    kind: Literal["polars"] = "polars",
     all_strings: bool = False,
     verbose: bool = True
 ) -> Tuple[pl.DataFrame, str]:
@@ -50,6 +50,7 @@ def load_dataframe(
 
 def load_dataframe(
     df_path: Union[str, Path],
+    use_columns: Optional[list[str]] = None,
     kind: Literal["pandas", "polars"] = "pandas",
     all_strings: bool = False,
     verbose: bool = True
@@ -58,11 +59,13 @@
     Load a CSV file into a DataFrame and extract its base name.
 
     Can load data as either a pandas or a polars DataFrame. Allows for loading all
-    columns as string types to prevent type inference errors.
+    columns or a subset of columns as string types to prevent type inference errors.
 
     Args:
         df_path (str, Path):
             The path to the CSV file.
+        use_columns (list[str] | None):
+            If provided, only these columns will be loaded from the CSV.
         kind ("pandas", "polars"):
             The type of DataFrame to load. Defaults to "pandas".
         all_strings (bool):
@@ -76,28 +79,44 @@ def load_dataframe(
 
     Raises:
         FileNotFoundError: If the file does not exist at the given path.
-        ValueError: If the DataFrame is empty
+        ValueError: If the DataFrame is empty, an invalid 'kind' is provided, or a column in 'use_columns' is not found in the file.
     """
     path = make_fullpath(df_path)
 
     df_name = path.stem
 
-
-    if
-
-
-
-
-
-
-
+    try:
+        if kind == "pandas":
+            pd_kwargs: dict[str,Any]
+            pd_kwargs = {'encoding': 'utf-8'}
+            if use_columns:
+                pd_kwargs['usecols'] = use_columns
+            if all_strings:
+                pd_kwargs['dtype'] = str
+
+            df = pd.read_csv(path, **pd_kwargs)
+
+        elif kind == "polars":
+            pl_kwargs: dict[str,Any]
+            pl_kwargs = {}
+            pl_kwargs['null_values'] = ["", " "]
+            if use_columns:
+                pl_kwargs['columns'] = use_columns
+
+            if all_strings:
+                pl_kwargs['infer_schema'] = False
+            else:
+                pl_kwargs['infer_schema_length'] = 1000
+
+            df = pl.read_csv(path, **pl_kwargs)
+
         else:
-
-
+            _LOGGER.error(f"Invalid kind '{kind}'. Must be one of 'pandas' or 'polars'.")
+            raise ValueError()
 
-
-    _LOGGER.error(f"
-    raise
+    except (ValueError, pl.exceptions.ColumnNotFoundError) as e:
+        _LOGGER.error(f"Failed to load '{df_name}'. A specified column may not exist in the file.")
+        raise e
 
     # This check works for both pandas and polars DataFrames
     if df.shape[0] == 0:
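For orientation, a minimal usage sketch of the updated loader follows; the file path and column names are hypothetical and not taken from the package.

    from pathlib import Path
    from ml_tools.utilities import load_dataframe

    # Load only two columns as pandas, forcing string dtype to avoid type-inference surprises.
    df, df_name = load_dataframe(
        df_path=Path("data/measurements.csv"),    # hypothetical file
        use_columns=["sample_id", "pressure"],    # hypothetical column names
        kind="pandas",
        all_strings=True,
    )
    print(df_name, df.shape)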
@@ -110,6 +129,116 @@ def load_dataframe(
     return df, df_name # type: ignore
 
 
+def load_dataframe_greedy(directory: Union[str, Path],
+                          use_columns: Optional[list[str]] = None,
+                          all_strings: bool = False,
+                          verbose: bool = True) -> pd.DataFrame:
+    """
+    Greedily loads the first found CSV file from a directory into a Pandas DataFrame.
+
+    This function scans the specified directory for any CSV files. It will
+    attempt to load the *first* CSV file it finds using the `load_dataframe`
+    function as a Pandas DataFrame.
+
+    Args:
+        directory (str, Path):
+            The path to the directory to search for a CSV file.
+        use_columns (list[str] | None):
+            A list of column names to load. If None, all columns are loaded.
+        all_strings (bool):
+            If True, loads all columns as string data types.
+
+    Returns:
+        pd.DataFrame:
+            A pandas DataFrame loaded from the first CSV file found.
+
+    Raises:
+        FileNotFoundError:
+            If the specified directory does not exist or the CSV file path
+            found is invalid.
+        ValueError:
+            If the loaded DataFrame is empty or `use_columns` contains
+            invalid column names.
+    """
+    # validate directory
+    dir_path = make_fullpath(directory, enforce="directory")
+
+    # list all csv files and grab one (should be the only one)
+    csv_dict = list_csv_paths(directory=dir_path, verbose=False)
+
+    for df_path in csv_dict.values():
+        df , _df_name = load_dataframe(df_path=df_path,
+                                       use_columns=use_columns,
+                                       kind="pandas",
+                                       all_strings=all_strings,
+                                       verbose=verbose)
+        break
+
+    return df
+
+
+def load_dataframe_with_schema(
+    df_path: Union[str, Path],
+    schema: "FeatureSchema",
+    all_strings: bool = False,
+) -> Tuple[pd.DataFrame, str]:
+    """
+    Loads a CSV file into a Pandas DataFrame, strictly validating its
+    feature columns against a FeatureSchema.
+
+    This function wraps `load_dataframe`. After loading, it validates
+    that the first N columns of the DataFrame (where N =
+    len(schema.feature_names)) contain *exactly* the set of features
+    specified in the schema.
+
+    - If the columns are present but out of order, they are reordered.
+    - If any required feature is missing from the first N columns, it fails.
+    - If any extra column is found within the first N columns, it fails.
+
+    Columns *after* the first N are considered target columns and are
+    logged for verification.
+
+    Args:
+        df_path (str, Path):
+            The path to the CSV file.
+        schema (FeatureSchema):
+            The schema object to validate against.
+        all_strings (bool):
+            If True, loads all columns as string data types.
+
+    Returns:
+        (Tuple[pd.DataFrame, str]):
+            A tuple containing the loaded, validated (and possibly
+            reordered) pandas DataFrame and the base name of the file.
+
+    Raises:
+        ValueError:
+            - If the DataFrame is missing columns required by the schema
+              within its first N columns.
+            - If the DataFrame's first N columns contain unexpected
+              columns that are not in the schema.
+        FileNotFoundError:
+            If the file does not exist at the given path.
+    """
+    # Step 1: Load the dataframe using the original function
+    try:
+        df, df_name = load_dataframe(
+            df_path=df_path,
+            use_columns=None, # Load all columns for validation
+            kind="pandas",
+            all_strings=all_strings,
+            verbose=True
+        )
+    except Exception as e:
+        _LOGGER.error(f"Failed during initial load for schema validation: {e}")
+        raise e
+
+    # Step 2: Call the helper to validate and reorder
+    df_validated = _validate_and_reorder_schema(df=df, schema=schema)
+
+    return df_validated, df_name
+
+
 def yield_dataframes_from_dir(datasets_dir: Union[str,Path], verbose: bool=True):
     """
     Iterates over all CSV files in a given directory, loading each into a Pandas DataFrame.
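A sketch of how the two new loaders might be combined; the directory layout and file name are hypothetical, and the FeatureSchema instance is assumed to be built elsewhere (its constructor is not part of this diff).

    from pathlib import Path
    from ml_tools._schema import FeatureSchema
    from ml_tools.utilities import load_dataframe_greedy, load_dataframe_with_schema

    def load_run(run_dir: Path, schema: FeatureSchema):
        # Greedy load: grab the first (ideally only) CSV found in the directory.
        preview = load_dataframe_greedy(run_dir, all_strings=True)
        # Strict load: the first len(schema.feature_names) columns must match the schema exactly.
        df, name = load_dataframe_with_schema(run_dir / "data.csv", schema=schema)  # hypothetical file name
        return preview, df, name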
@@ -196,7 +325,7 @@ def merge_dataframes(
     return merged_df
 
 
-def
+def save_dataframe_filename(df: Union[pd.DataFrame, pl.DataFrame], save_dir: Union[str,Path], filename: str) -> None:
     """
     Saves a pandas or polars DataFrame to a CSV file.
 
@@ -214,7 +343,7 @@ def save_dataframe(df: Union[pd.DataFrame, pl.DataFrame], save_dir: Union[str,Pa
         return
 
     # Create the directory if it doesn't exist
-    save_path = make_fullpath(save_dir, make=True)
+    save_path = make_fullpath(save_dir, make=True, enforce="directory")
 
     # Clean the filename
     filename = sanitize_filename(filename)
@@ -225,227 +354,91 @@ def save_dataframe(df: Union[pd.DataFrame, pl.DataFrame], save_dir: Union[str,Pa
 
     # --- Type-specific saving logic ---
     if isinstance(df, pd.DataFrame):
-
+        # Transform "" to np.nan before saving
+        df_to_save = df.replace(r'^\s*$', np.nan, regex=True)
+        # Save
+        df_to_save.to_csv(output_path, index=False, encoding='utf-8')
     elif isinstance(df, pl.DataFrame):
-
+        # Transform empty strings to Null
+        df_to_save = df.with_columns(
+            pl.when(pl.col(pl.Utf8).str.strip() == "")
+            .then(None)
+            .otherwise(pl.col(pl.Utf8))
+        )
+        # Save
+        df_to_save.write_csv(output_path)
     else:
         # This error handles cases where an unsupported type is passed
         _LOGGER.error(f"Unsupported DataFrame type: {type(df)}. Must be pandas or polars.")
         raise TypeError()
 
-    _LOGGER.info(f"Saved dataset: '{filename}' with shape: {
+    _LOGGER.info(f"Saved dataset: '{filename}' with shape: {df_to_save.shape}")
 
 
-def
+def save_dataframe(df: Union[pd.DataFrame, pl.DataFrame], full_path: Path):
     """
-
-    applying heuristic adjustments to correct for potential data entry scale mismatches.
+    Saves a DataFrame to a specified full path.
 
-
-
-            A list of values that may include strings, floats, integers, or None.
-            None values are treated as 0.0.
-
-        threshold (int, optional):
-            The number of log10 orders of magnitude below the median scale
-            at which a value is considered suspect and is scaled upward accordingly.
-            Default is 2.
-
-    Returns:
-        List[float]: A list of normalized float values summing to 1.0.
-
-    Notes:
-        - Zeros and None values remain zero.
-        - Input strings are automatically cast to floats if possible.
+    This function is a wrapper for `save_dataframe_filename()`. It takes a
+    single `pathlib.Path` object pointing to a `.csv` file.
 
-
-
-
+    Args:
+        df (Union[pd.DataFrame, pl.DataFrame]): The pandas or polars DataFrame to save.
+        full_path (Path): The complete file path, including the filename and `.csv` extension, where the DataFrame will be saved.
     """
-
-
-
-    # Raise for negative values
-    if any(x < 0 for x in float_list):
-        _LOGGER.error("Negative values are not allowed in the input list.")
+    if not isinstance(full_path, Path) or not full_path.suffix.endswith(".csv"):
+        _LOGGER.error('A path object pointing to a .csv file must be provided.')
         raise ValueError()
-
-
-
-
-        return [0.0 for _ in float_list]
-
-    log_scales = [math.log10(x) for x in nonzero]
-    log_median = np.median(log_scales)
-
-    # Step 3: Adjust values that are much smaller than median
-    adjusted = []
-    for x in float_list:
-        if x == 0.0:
-            adjusted.append(0.0)
-        else:
-            log_x = math.log10(x)
-            if log_median - log_x > threshold:
-                scale_diff = round(log_median - log_x)
-                adjusted.append(x * (10 ** scale_diff))
-            else:
-                adjusted.append(x)
-
-    # Step 4: Normalize to sum to 1.0
-    total = sum(adjusted)
-    if total == 0:
-        return [0.0 for _ in adjusted]
-
-    return [x / total for x in adjusted]
-
-
-def threshold_binary_values(
-    input_array: Union[Sequence[float], np.ndarray, pd.Series, pl.Series],
-    binary_values: Optional[int] = None
-) -> Union[np.ndarray, pd.Series, pl.Series, list[float], tuple[float]]:
-    """
-    Thresholds binary features in a 1D input. The number of binary features are counted starting from the end.
-
-    Binary elements are converted to 0 or 1 using a 0.5 threshold.
-
-    Parameters:
-        input_array: 1D sequence, NumPy array, pandas Series, or polars Series.
-        binary_values (Optional[int]) :
-            - If `None`, all values are treated as binary.
-            - If `int`, only this many last `binary_values` are thresholded.
-
-    Returns:
-        Any:
-            Same type as input
-    """
-    original_type = type(input_array)
-
-    if isinstance(input_array, pl.Series):
-        array = input_array.to_numpy()
-    elif isinstance(input_array, (pd.Series, np.ndarray)):
-        array = np.asarray(input_array)
-    elif isinstance(input_array, (list, tuple)):
-        array = np.array(input_array)
-    else:
-        _LOGGER.error("Unsupported input type")
-        raise TypeError()
-
-    array = array.flatten()
-    total = array.shape[0]
+
+    save_dataframe_filename(df=df,
+                            save_dir=full_path.parent,
+                            filename=full_path.name)
 
-    bin_count = total if binary_values is None else binary_values
-    if not (0 <= bin_count <= total):
-        _LOGGER.error("'binary_values' must be between 0 and the total number of elements")
-        raise ValueError()
 
-
-
-
-
-
-    result = np.concatenate([cont_part, bin_part])
-
-    if original_type is pd.Series:
-        return pd.Series(result, index=input_array.index if hasattr(input_array, 'index') else None) # type: ignore
-    elif original_type is pl.Series:
-        return pl.Series(input_array.name if hasattr(input_array, 'name') else "binary", result) # type: ignore
-    elif original_type is list:
-        return result.tolist()
-    elif original_type is tuple:
-        return tuple(result)
-    else:
-        return result
-
-
-def threshold_binary_values_batch(
-    input_array: np.ndarray,
-    binary_values: int
-) -> np.ndarray:
+def save_dataframe_with_schema(
+    df: pd.DataFrame,
+    full_path: Path,
+    schema: "FeatureSchema"
+) -> None:
     """
-
+    Saves a pandas DataFrame to a CSV, strictly enforcing that the
+    first N columns match the FeatureSchema.
 
-
-
-
-        2D array with shape (batch_size, n_features).
-    binary_values : int
-        Number of binary features located at the END of each row.
-
-    Returns
-    -------
-    np.ndarray
-        Thresholded array, same shape as input.
-    """
-    if input_array.ndim != 2:
-        _LOGGER.error(f"Expected 2D array, got {input_array.ndim}D array.")
-        raise AssertionError()
-
-    batch_size, total_features = input_array.shape
+    This function validates that the first N columns of the DataFrame
+    (where N = len(schema.feature_names)) contain *exactly* the set
+    of features specified in the schema.
 
-
-
-
+    - If the columns are present but out of order, they are reordered.
+    - If any required feature is missing from the first N columns, it fails.
+    - If any extra column is found within the first N columns, it fails.
 
-
-
+    Columns *after* the first N are considered target columns and are
+    logged for verification.
 
-
-
-
-
-
-
-
-
-def serialize_object(obj: Any, save_dir: Union[str,Path], filename: str, verbose: bool=True, raise_on_error: bool=False) -> None:
-    """
-    Serializes a Python object using joblib; suitable for Python built-ins, numpy, and pandas.
-
-    Parameters:
-        obj (Any) : The Python object to serialize.
-        save_dir (str | Path) : Directory path where the serialized object will be saved.
-        filename (str) : Name for the output file, extension will be appended if needed.
-    """
-    try:
-        save_path = make_fullpath(save_dir, make=True)
-        sanitized_name = sanitize_filename(filename)
-        if not sanitized_name.endswith('.joblib'):
-            sanitized_name = sanitized_name + ".joblib"
-        full_path = save_path / sanitized_name
-        joblib.dump(obj, full_path)
-    except (IOError, OSError, TypeError, TerminatedWorkerError) as e:
-        _LOGGER.error(f"Failed to serialize object of type '{type(obj)}'.")
-        if raise_on_error:
-            raise e
-        return None
-    else:
-        if verbose:
-            _LOGGER.info(f"Object of type '{type(obj)}' saved to '{full_path}'")
-        return None
-
-
-def deserialize_object(filepath: Union[str,Path], verbose: bool=True, raise_on_error: bool=True) -> Optional[Any]:
-    """
-    Loads a serialized object from a .joblib file.
-
-    Parameters:
-        filepath (str | Path): Full path to the serialized .joblib file.
+    Args:
+        df (pd.DataFrame):
+            The DataFrame to save.
+        full_path (Path):
+            The complete file path where the DataFrame will be saved.
+        schema (FeatureSchema):
+            The schema object to validate against.
 
-
-
+    Raises:
+        ValueError:
+            - If the DataFrame is missing columns required by the schema
+              within its first N columns.
+            - If the DataFrame's first N columns contain unexpected
+              columns that are not in the schema.
     """
-
+    if not isinstance(full_path, Path) or not full_path.suffix.endswith(".csv"):
+        _LOGGER.error('A path object pointing to a .csv file must be provided.')
+        raise ValueError()
 
-
-
-
-
-
-            raise e
-        return None
-    else:
-        if verbose:
-            _LOGGER.info(f"Loaded object of type '{type(obj)}'.")
-        return obj
+    # Call the helper to validate and reorder
+    df_to_save = _validate_and_reorder_schema(df=df, schema=schema)
+
+    # Call the original save function
+    save_dataframe(df=df_to_save, full_path=full_path)
 
 
 def distribute_dataset_by_target(
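A small sketch of the renamed save helpers; the directory and file names are hypothetical.

    from pathlib import Path
    import pandas as pd
    from ml_tools.utilities import save_dataframe, save_dataframe_filename

    df = pd.DataFrame({"a": [1, 2], "b": ["x", ""]})  # toy data; empty strings are written as missing values

    # Directory + filename form.
    save_dataframe_filename(df, save_dir="outputs", filename="toy_dataset.csv")

    # Single-Path form; the path must point to a .csv file.
    save_dataframe(df, full_path=Path("outputs") / "toy_dataset.csv")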
@@ -529,7 +522,7 @@ def train_dataset_orchestrator(list_of_dirs: list[Union[str,Path]],
                 filename = df_dir.name + '_' + target_name + '_' + df_name
             else:
                 filename = target_name + '_' + df_name
-
+            save_dataframe_filename(df=df, save_dir=save_dir, filename=filename)
             total_saved += 1
         except Exception as e:
             _LOGGER.error(f"Failed to process file '{df_path}'. Reason: {e}")
@@ -560,5 +553,72 @@ def train_dataset_yielder(
         yield (df_features, df_target, feature_names, target_col)
 
 
+def _validate_and_reorder_schema(
+    df: pd.DataFrame,
+    schema: "FeatureSchema"
+) -> pd.DataFrame:
+    """
+    Internal helper to validate and reorder a DataFrame against a schema.
+
+    Checks for missing, extra, and out-of-order feature columns
+    (the first N columns). Returns a reordered DataFrame if necessary.
+    Logs all actions.
+
+    Raises:
+        ValueError: If validation fails.
+    """
+    # Get schema and DataFrame column info
+    expected_features = list(schema.feature_names)
+    expected_set = set(expected_features)
+    n_features = len(expected_features)
+
+    all_df_columns = df.columns.to_list()
+
+    # --- Strict Validation ---
+
+    # 0. Check if DataFrame is long enough
+    if len(all_df_columns) < n_features:
+        _LOGGER.error(f"DataFrame has only {len(all_df_columns)} columns, but schema requires {n_features} features.")
+        raise ValueError()
+
+    df_feature_cols = all_df_columns[:n_features]
+    df_feature_set = set(df_feature_cols)
+    df_target_cols = all_df_columns[n_features:]
+
+    # 1. Check for missing features
+    missing_from_df = expected_set - df_feature_set
+    if missing_from_df:
+        _LOGGER.error(f"DataFrame's first {n_features} columns are missing required schema features: {missing_from_df}")
+        raise ValueError()
+
+    # 2. Check for extra (unexpected) features
+    extra_in_df = df_feature_set - expected_set
+    if extra_in_df:
+        _LOGGER.error(f"DataFrame's first {n_features} columns contain unexpected columns: {extra_in_df}")
+        raise ValueError()
+
+    # --- Reordering ---
+
+    df_to_process = df
+
+    # If we pass validation, the sets are equal. Now check order.
+    if df_feature_cols == expected_features:
+        _LOGGER.info("DataFrame feature columns already match schema order.")
+    else:
+        _LOGGER.warning("DataFrame feature columns do not match schema order. Reordering...")
+
+        # Rebuild the DataFrame with the correct feature order + target columns
+        new_order = expected_features + df_target_cols
+        df_to_process = df[new_order]
+
+    # Log the presumed target columns for user verification
+    if not df_target_cols:
+        _LOGGER.warning(f"No target columns were found after index {n_features-1}.")
+    else:
+        _LOGGER.info(f"Presumed Target Columns: {df_target_cols}")
+
+    return df_to_process # type: ignore
+
+
 def info():
     _script_info(__all__)
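The column rule enforced by _validate_and_reorder_schema, restated on a toy frame; the column names below are made up and only plain pandas is used.

    import pandas as pd

    # Schema features ("height", "width"): the first 2 columns must be exactly that set;
    # order is corrected if needed, and any later columns are treated as targets.
    df = pd.DataFrame({"width": [1.0], "height": [2.0], "target": [0]})
    expected = ["height", "width"]  # stands in for list(schema.feature_names)
    assert set(df.columns[:len(expected)]) == set(expected)
    reordered = df[expected + list(df.columns[len(expected):])]
    print(list(reordered.columns))  # ['height', 'width', 'target']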
dragon_ml_toolbox-10.1.1.dist-info/RECORD
DELETED
@@ -1,36 +0,0 @@
-dragon_ml_toolbox-10.1.1.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
-dragon_ml_toolbox-10.1.1.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837
-ml_tools/ETL_cleaning.py,sha256=i-hrafaAivg8wprcCmwHA5MkXFsUmHNR9RRGbIyw4ZE,15981
-ml_tools/ETL_engineering.py,sha256=sgpIhlFIeId4eSJ-a33MnVuPNXs50msxFWa8-kw2hOI,36369
-ml_tools/GUI_tools.py,sha256=kEQWg-bog3pB5tI22gMGKWaCGHnz9TB2Lvvfhf5F2CI,45412
-ml_tools/MICE_imputation.py,sha256=kVSythWfxJFR4-2mtcYCWQaQ1Oz5yyx_SJu5gjnS7H8,11670
-ml_tools/ML_callbacks.py,sha256=JPvEw_cW5tYNJ2rMSgnNrKLuni_UrmuhDFaOw-u2SvA,13926
-ml_tools/ML_datasetmaster.py,sha256=CBZFpvm0qiY-8gP89iKTkd7jvU-rGQcJwk-_mBJmRSg,29273
-ml_tools/ML_evaluation.py,sha256=28JJ2M71p4pxniwav2Hv3b1a5dsvaoIYNLm-UJQuXvY,16002
-ml_tools/ML_evaluation_multi.py,sha256=2jTSNFCu8cz5C05EusnrDyffs59M2Fq3UXSHxo2TR1A,12515
-ml_tools/ML_inference.py,sha256=SGDPiPxs_OYDKKRZziFMyaWcC8A37c70W9t-dMP5niI,23066
-ml_tools/ML_models.py,sha256=Dl2mTMgVCtnNCSRlyqvMnInsKJVldS7vnBPimD-TnHo,27999
-ml_tools/ML_optimization.py,sha256=a2Uxe1g-y4I-gFa8ENIM8QDS-Pz3hoPRRaVXAWMbyQA,13491
-ml_tools/ML_scaler.py,sha256=O8JzHr2551zPpKRRReEIMvq0lNAAPau6hV59KUMAySg,7420
-ml_tools/ML_trainer.py,sha256=xM-o-gbPhWXm2lOVXbeaTFotgJSDRSHyE7H0-9OOij4,23712
-ml_tools/PSO_optimization.py,sha256=q0VYpssQGbPum7xdnkDXlJQKhZMYZo8acHpKhajPK3c,22954
-ml_tools/RNN_forecast.py,sha256=8rNZr-eWOBXMiDQV22e_tQTPM5LM2IFggEAa1FaoXaI,1965
-ml_tools/SQL.py,sha256=WDgdZUYuLBUpv-4Am9XjVY_Aq_jxBWdLrbcgAIEwefI,10704
-ml_tools/VIF_factor.py,sha256=MkMh_RIdsN2XUPzKNGRiEcmB17R_MmvGV4ezpL5zD2E,10403
-ml_tools/__init__.py,sha256=q0y9faQ6e17XCQ7eUiCZ1FJ4Bg5EQqLjZ9f_l5REUUY,41
-ml_tools/_logger.py,sha256=wcImAiXEZKPNcwM30qBh3t7HvoPURonJY0nrgMGF0sM,4719
-ml_tools/_script_info.py,sha256=21r83LV3RubsNZ_RTEUON6RbDf7Mh4_udweNcvdF_Fk,212
-ml_tools/custom_logger.py,sha256=ry43hk54K6xKo8jRAgq1sFxUpOA9T0LIJ7sw0so2BW0,5880
-ml_tools/data_exploration.py,sha256=hKA_3U-piJ8TtDWhzX_T2Awkg-25e0DC5E8qloqPo6w,27206
-ml_tools/ensemble_evaluation.py,sha256=xMEMfXJ5MjTkTfr1LkFOeD7iUtnVDCW3S9lm3zT-6tY,24778
-ml_tools/ensemble_inference.py,sha256=EFHnbjbu31fcVp88NBx8lWAVdu2Gpg9MY9huVZJHFfM,9350
-ml_tools/ensemble_learning.py,sha256=3s0kH4i_naj0IVl_T4knst-Hwg4TScWjEdsXX5KAi7I,21929
-ml_tools/handle_excel.py,sha256=He4UT15sCGhaG-JKfs7uYVAubxWjrqgJ6U7OhMR2fuE,14005
-ml_tools/keys.py,sha256=HtPG8-MWh89C32A7eIlfuuA-DLwkxGkoDfwR2TGN9CQ,1074
-ml_tools/optimization_tools.py,sha256=P3I6lIpvZ8Xf2kX5FvvBKBmrK2pB6idBpkTzfUJxTeE,5073
-ml_tools/path_manager.py,sha256=TJgoqMAryc5F0dal8W_zvJgE1TpOzlskIyYJk614WW4,13809
-ml_tools/utilities.py,sha256=SVMaSDigh6SUoAeig2_sXLLIj5w5mUs5KuVWpHvFDec,19816
-dragon_ml_toolbox-10.1.1.dist-info/METADATA,sha256=wJ2byoP5azuIBrLRpUUQ96DkDAQuxVtgf2lFPafBUUQ,6968
-dragon_ml_toolbox-10.1.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-dragon_ml_toolbox-10.1.1.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
-dragon_ml_toolbox-10.1.1.dist-info/RECORD,,