dragon-ml-toolbox 3.12.0__tar.gz → 3.12.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dragon-ml-toolbox might be problematic.
- {dragon_ml_toolbox-3.12.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-3.12.1}/PKG-INFO +1 -1
- {dragon_ml_toolbox-3.12.0 → dragon_ml_toolbox-3.12.1/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
- {dragon_ml_toolbox-3.12.0 → dragon_ml_toolbox-3.12.1}/ml_tools/MICE_imputation.py +6 -6
- {dragon_ml_toolbox-3.12.0 → dragon_ml_toolbox-3.12.1}/ml_tools/PSO_optimization.py +2 -4
- {dragon_ml_toolbox-3.12.0 → dragon_ml_toolbox-3.12.1}/ml_tools/VIF_factor.py +15 -13
- {dragon_ml_toolbox-3.12.0 → dragon_ml_toolbox-3.12.1}/ml_tools/data_exploration.py +1 -1
- {dragon_ml_toolbox-3.12.0 → dragon_ml_toolbox-3.12.1}/ml_tools/handle_excel.py +9 -8
- {dragon_ml_toolbox-3.12.0 → dragon_ml_toolbox-3.12.1}/ml_tools/path_manager.py +1 -1
- {dragon_ml_toolbox-3.12.0 → dragon_ml_toolbox-3.12.1}/ml_tools/utilities.py +73 -41
- {dragon_ml_toolbox-3.12.0 → dragon_ml_toolbox-3.12.1}/pyproject.toml +1 -1
- {dragon_ml_toolbox-3.12.0 → dragon_ml_toolbox-3.12.1}/LICENSE +0 -0
- {dragon_ml_toolbox-3.12.0 → dragon_ml_toolbox-3.12.1}/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-3.12.0 → dragon_ml_toolbox-3.12.1}/README.md +0 -0
- {dragon_ml_toolbox-3.12.0 → dragon_ml_toolbox-3.12.1}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
- {dragon_ml_toolbox-3.12.0 → dragon_ml_toolbox-3.12.1}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
- {dragon_ml_toolbox-3.12.0 → dragon_ml_toolbox-3.12.1}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
- {dragon_ml_toolbox-3.12.0 → dragon_ml_toolbox-3.12.1}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
- {dragon_ml_toolbox-3.12.0 → dragon_ml_toolbox-3.12.1}/ml_tools/ETL_engineering.py +0 -0
- {dragon_ml_toolbox-3.12.0 → dragon_ml_toolbox-3.12.1}/ml_tools/GUI_tools.py +0 -0
- {dragon_ml_toolbox-3.12.0 → dragon_ml_toolbox-3.12.1}/ml_tools/ML_callbacks.py +0 -0
- {dragon_ml_toolbox-3.12.0 → dragon_ml_toolbox-3.12.1}/ml_tools/ML_evaluation.py +0 -0
- {dragon_ml_toolbox-3.12.0 → dragon_ml_toolbox-3.12.1}/ml_tools/ML_trainer.py +0 -0
- {dragon_ml_toolbox-3.12.0 → dragon_ml_toolbox-3.12.1}/ml_tools/ML_tutorial.py +0 -0
- {dragon_ml_toolbox-3.12.0 → dragon_ml_toolbox-3.12.1}/ml_tools/RNN_forecast.py +0 -0
- {dragon_ml_toolbox-3.12.0 → dragon_ml_toolbox-3.12.1}/ml_tools/__init__.py +0 -0
- {dragon_ml_toolbox-3.12.0 → dragon_ml_toolbox-3.12.1}/ml_tools/_pytorch_models.py +0 -0
- {dragon_ml_toolbox-3.12.0 → dragon_ml_toolbox-3.12.1}/ml_tools/datasetmaster.py +0 -0
- {dragon_ml_toolbox-3.12.0 → dragon_ml_toolbox-3.12.1}/ml_tools/ensemble_learning.py +0 -0
- {dragon_ml_toolbox-3.12.0 → dragon_ml_toolbox-3.12.1}/ml_tools/keys.py +0 -0
- {dragon_ml_toolbox-3.12.0 → dragon_ml_toolbox-3.12.1}/ml_tools/logger.py +0 -0
- {dragon_ml_toolbox-3.12.0 → dragon_ml_toolbox-3.12.1}/setup.cfg +0 -0
{dragon_ml_toolbox-3.12.0 → dragon_ml_toolbox-3.12.1}/ml_tools/MICE_imputation.py CHANGED

@@ -35,7 +35,7 @@ def apply_mice(df: pd.DataFrame, df_name: str, binary_columns: Optional[list[str
     imputed_datasets = [kernel.complete_data(dataset=i) for i in range(resulting_datasets)]

     if imputed_datasets is None or len(imputed_datasets) == 0:
-        raise ValueError("No imputed datasets were generated. Check the MICE process.")
+        raise ValueError("❌ No imputed datasets were generated. Check the MICE process.")

     # threshold binary columns
     if binary_columns is not None:
@@ -56,8 +56,8 @@ def apply_mice(df: pd.DataFrame, df_name: str, binary_columns: Optional[list[str

     # Ensure indexes match
     for imputed_df, subname in zip(imputed_datasets, imputed_dataset_names):
-        assert imputed_df.shape[0] == df.shape[0], f"Row count mismatch in dataset {subname}" # type: ignore
-        assert all(imputed_df.index == df.index), f"Index mismatch in dataset {subname}" # type: ignore
+        assert imputed_df.shape[0] == df.shape[0], f"❌ Row count mismatch in dataset {subname}" # type: ignore
+        assert all(imputed_df.index == df.index), f"❌ Index mismatch in dataset {subname}" # type: ignore
     # print("✅ All imputed datasets match the original DataFrame indexes.")

     return kernel, imputed_datasets, imputed_dataset_names
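For context, the `kernel.complete_data(dataset=i)` and `kernel.num_datasets` calls visible in these hunks come from the miceforest API. Below is a minimal sketch of how such a kernel is typically built and completed; the constructor arguments (`num_datasets`, `random_state`) are assumptions, since argument names differ across miceforest versions and the toolbox constructs the kernel internally.

```python
import numpy as np
import pandas as pd
import miceforest as mf  # assumed dependency of MICE_imputation.py

# Small frame with missing values to impute
df = pd.DataFrame({"a": [1.0, np.nan, 3.0, 4.0], "b": [0.5, 0.7, np.nan, 0.9]})

# Assumed construction; parameter names vary between miceforest versions
kernel = mf.ImputationKernel(df, num_datasets=3, random_state=42)
kernel.mice(2)  # run 2 MICE iterations

# These two calls match what the diff shows
imputed_datasets = [kernel.complete_data(dataset=i) for i in range(kernel.num_datasets)]
print(len(imputed_datasets), imputed_datasets[0].shape)
```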
@@ -90,7 +90,7 @@ def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_name
     dataset_count = kernel.num_datasets

     if dataset_count != len(imputed_dataset_names):
-        raise ValueError(f"Expected {dataset_count} names in imputed_dataset_names, got {len(imputed_dataset_names)}")
+        raise ValueError(f"❌ Expected {dataset_count} names in imputed_dataset_names, got {len(imputed_dataset_names)}")

     # Check path
     root_path = make_fullpath(root_dir, make=True)
@@ -152,7 +152,7 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
     """Helper function to add labels and legends to a figure"""

     if not isinstance(fig, ggplot):
-        raise TypeError("Expected a plotnine.ggplot object")
+        raise TypeError("❌ Expected a plotnine.ggplot object")

     # Edit labels and title
     fig = fig + theme(
@@ -166,7 +166,7 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
     fig = fig.draw()

     if not hasattr(fig, 'axes') or len(fig.axes) == 0:
-        raise RuntimeError("Rendered figure has no axes to modify")
+        raise RuntimeError("❌ Rendered figure has no axes to modify")

     if filename == "Combined_Distributions":
         custom_xlabel = "Feature Values"
{dragon_ml_toolbox-3.12.0 → dragon_ml_toolbox-3.12.1}/ml_tools/PSO_optimization.py CHANGED

@@ -530,10 +530,8 @@ def plot_optimal_feature_distributions(results_dir: Union[str, Path], save_dir:
     results_path = make_fullpath(results_dir)
     output_path = make_fullpath(save_dir, make=True)

-
-
-        _LOGGER.warning("⚠️ No data found. No plots will be generated.")
-        return
+    # Check that the directory contains csv files
+    list_csv_paths(results_path, verbose=False)

     # --- Data Loading and Preparation ---
     _LOGGER.info(f"📁 Starting analysis from results in: '{results_dir}'")
{dragon_ml_toolbox-3.12.0 → dragon_ml_toolbox-3.12.1}/ml_tools/VIF_factor.py CHANGED

@@ -26,8 +26,7 @@ def compute_vif(
     save_dir: Optional[Union[str,Path]] = None,
     filename: Optional[str] = None,
     fontsize: int = 14,
-    show_plot: bool = True
-    verbose: bool = True
+    show_plot: bool = True
     ) -> pd.DataFrame:
     """
     Computes Variance Inflation Factors (VIF) for numeric columns in a DataFrame. Optionally, generates a bar plot of VIF values.
@@ -54,21 +53,20 @@ def compute_vif(
     if use_columns is None:
         sanitized_columns = df.select_dtypes(include='number').columns.tolist()
         missing_features = set(ground_truth_cols) - set(sanitized_columns)
-        if missing_features
+        if missing_features:
             _LOGGER.warning(f"⚠️ These columns are not Numeric:\n{missing_features}")
     else:
         sanitized_columns = list()
         for feature in use_columns:
             if feature not in ground_truth_cols:
-
-                _LOGGER.warning(f"⚠️ The provided column '{feature}' is not in the DataFrame.")
+                _LOGGER.warning(f"⚠️ The provided column '{feature}' is not in the DataFrame.")
             else:
                 sanitized_columns.append(feature)

     if ignore_columns is not None and use_columns is None:
         missing_ignore = set(ignore_columns) - set(ground_truth_cols)
-        if missing_ignore
-            _LOGGER.warning(f"⚠️ Warning: The following 'columns to ignore' are not in the Dataframe:\n{missing_ignore}")
+        if missing_ignore:
+            _LOGGER.warning(f"⚠️ Warning: The following 'columns to ignore' are not found in the Dataframe:\n{missing_ignore}")
         sanitized_columns = [f for f in sanitized_columns if f not in ignore_columns]

     X = df[sanitized_columns].copy()
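The hunk above only touches column sanitization; the VIF values themselves are usually computed with statsmodels. The sketch below illustrates that standard calculation on synthetic data; it is not necessarily the exact code inside `compute_vif`.

```python
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

rng = np.random.default_rng(0)
df = pd.DataFrame(rng.normal(size=(100, 3)), columns=["x1", "x2", "x3"])
df["x4"] = df["x1"] * 0.9 + rng.normal(scale=0.1, size=100)  # nearly collinear column

# Add an intercept and compute one VIF per feature column
X = sm.add_constant(df.select_dtypes(include="number"))
vif_df = pd.DataFrame({
    "feature": X.columns[1:],  # skip the constant term
    "VIF": [variance_inflation_factor(X.values, i) for i in range(1, X.shape[1])],
})
print(vif_df)  # x1 and x4 show inflated values
```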
@@ -139,7 +137,7 @@ def compute_vif(
         filename += ".svg"
         full_save_path = save_path / filename
         plt.savefig(full_save_path, format='svg', bbox_inches='tight')
-
+        _LOGGER.info(f"✅ Saved VIF plot: '{filename}'")

     if show_plot:
         plt.show()
@@ -164,11 +162,16 @@ def drop_vif_based(df: pd.DataFrame, vif_df: pd.DataFrame, threshold: float = 10
     """
     # Ensure expected structure
     if 'feature' not in vif_df.columns or 'VIF' not in vif_df.columns:
-        raise ValueError("
+        raise ValueError("'vif_df' must contain 'feature' and 'VIF' columns.")

     # Identify features to drop
     to_drop = vif_df[vif_df["VIF"] > threshold]["feature"].tolist()
-
+    if len(to_drop) > 0:
+        _LOGGER.info(f"🗑️ Dropping {len(to_drop)} column(s) with VIF > {threshold}:")
+        for dropped_column in to_drop:
+            print(f"\t{dropped_column}")
+    else:
+        _LOGGER.info(f"No columns exceed the VIF threshold of '{threshold}'.")

     result_df = df.drop(columns=to_drop)

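The new dropping branch is plain pandas. A tiny standalone sketch of the same selection and drop, using made-up data and a hypothetical threshold:

```python
import pandas as pd

vif_df = pd.DataFrame({"feature": ["x1", "x2", "x3"], "VIF": [3.2, 15.7, 42.0]})
df = pd.DataFrame({"x1": [1, 2], "x2": [3, 4], "x3": [5, 6]})

threshold = 10.0
to_drop = vif_df[vif_df["VIF"] > threshold]["feature"].tolist()  # ['x2', 'x3']
result_df = df.drop(columns=to_drop)
print(result_df.columns.tolist())  # ['x1']
```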
@@ -186,7 +189,7 @@ def compute_vif_multi(input_directory: Union[str, Path],
                       max_features_to_plot: int = 20,
                       fontsize: int = 14):
     """
-    Computes Variance Inflation Factors (VIF) for numeric columns in a directory with CSV files (loaded as pandas DataFrames). No plots
+    Computes Variance Inflation Factors (VIF) for numeric columns in a directory with CSV files (loaded as pandas DataFrames). No plots will be displayed inline.
     Generates a bar plot of VIF values. Optionally drops columns with VIF >= 10 and saves as a new CSV file.

     Args:
@@ -216,8 +219,7 @@ def compute_vif_multi(input_directory: Union[str, Path],
                                fontsize=fontsize,
                                save_dir=output_plot_directory,
                                filename=df_name,
-                               show_plot=False
-                               verbose=False)
+                               show_plot=False)

         if output_dataset_path is not None:
             new_filename = df_name + '_VIF'
{dragon_ml_toolbox-3.12.0 → dragon_ml_toolbox-3.12.1}/ml_tools/data_exploration.py CHANGED

@@ -143,7 +143,7 @@ def drop_rows_with_missing_data(df: pd.DataFrame, targets: Optional[list[str]],
     feature_na_frac = df_clean[feature_cols].isnull().mean(axis=1)
     rows_to_drop = feature_na_frac[feature_na_frac > threshold].index
     if len(rows_to_drop) > 0:
-        print(f"
+        print(f"🧹 Dropping {len(rows_to_drop)} rows with more than {threshold*100:.0f}% missing feature data.")
         df_clean = df_clean.drop(index=rows_to_drop)
     else:
         print(f"✅ No rows exceed the {threshold*100:.0f}% missing feature data threshold.")
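The surrounding row-dropping logic is standard pandas. A self-contained sketch of the same idea, with hypothetical column names and threshold:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "f1": [1.0, np.nan, np.nan, 4.0],
    "f2": [np.nan, np.nan, 3.0, 4.0],
    "target": [0, 1, 0, 1],
})
feature_cols = ["f1", "f2"]
threshold = 0.5  # drop rows with more than 50% missing feature values

feature_na_frac = df[feature_cols].isnull().mean(axis=1)
rows_to_drop = feature_na_frac[feature_na_frac > threshold].index
cleaned = df.drop(index=rows_to_drop)
print(cleaned)  # the row whose features are all NaN is removed
```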
{dragon_ml_toolbox-3.12.0 → dragon_ml_toolbox-3.12.1}/ml_tools/handle_excel.py CHANGED

@@ -36,7 +36,7 @@ def find_excel_files(
     input_path = make_fullpath(directory)

     if not input_path.is_dir():
-        raise NotADirectoryError(f"Directory not found: {input_path}")
+        raise NotADirectoryError(f"❌ Directory not found: {input_path}")

     excel_files = [
         f for f in input_path.iterdir()
@@ -46,7 +46,7 @@ def find_excel_files(
     ]

     if not excel_files:
-        raise FileNotFoundError(f"No valid Excel files found in directory: {input_path}")
+        raise FileNotFoundError(f"❌ No valid Excel files found in directory: {input_path}")

     return excel_files

@@ -198,7 +198,7 @@ def validate_excel_schema(
             invalid_files.append(file)

         except Exception as e:
-            _LOGGER.error(f"Error processing '{file}': {e}")
+            _LOGGER.error(f"❌ Error processing '{file}': {e}")
             invalid_files.append(file)

     valid_excel_number = len(excel_paths) - len(invalid_files)
@@ -251,7 +251,7 @@ def vertical_merge_transform_excel(
         if target_columns is not None:
             missing = [col for col in target_columns if col not in df.columns]
             if missing:
-                raise ValueError(f"Invalid columns in {file.name}: {missing}")
+                raise ValueError(f"❌ Invalid columns in {file.name}: {missing}")
             df = df[target_columns]

         dataframes.append(df)
@@ -261,7 +261,7 @@ def vertical_merge_transform_excel(
     if rename_columns is not None:
         expected_len = len(target_columns if target_columns is not None else merged_df.columns)
         if len(rename_columns) != expected_len:
-            raise ValueError("Length of 'rename_columns' must match the selected columns")
+            raise ValueError("❌ Length of 'rename_columns' must match the selected columns")
         merged_df.columns = rename_columns

     merged_df.to_csv(csv_path, index=False, encoding='utf-8')
@@ -324,6 +324,9 @@ def horizontal_merge_transform_excel(
     merged_df = pd.concat(padded_dataframes, axis=1)

     duplicate_columns = merged_df.columns[merged_df.columns.duplicated()].tolist()
+
+    if duplicate_columns:
+        _LOGGER.warning(f"⚠️ Duplicate columns: {duplicate_columns}")

     if skip_duplicates:
         merged_df = merged_df.loc[:, ~merged_df.columns.duplicated()]
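Detecting and optionally dropping duplicate column names after a horizontal concat is ordinary pandas. A small standalone sketch of the two expressions used above:

```python
import pandas as pd

a = pd.DataFrame({"id": [1, 2], "x": [10, 20]})
b = pd.DataFrame({"id": [1, 2], "y": [0.1, 0.2]})

merged_df = pd.concat([a, b], axis=1)
duplicate_columns = merged_df.columns[merged_df.columns.duplicated()].tolist()
print(duplicate_columns)  # ['id'], the second 'id' column is flagged

# keep only the first occurrence of each column name
deduped = merged_df.loc[:, ~merged_df.columns.duplicated()]
print(deduped.columns.tolist())  # ['id', 'x', 'y']
```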
@@ -344,9 +347,7 @@ def horizontal_merge_transform_excel(
     merged_df.to_csv(csv_path, index=False, encoding='utf-8')

     _LOGGER.info(f"✅ Merged {len(excel_files)} Excel files into '{csv_filename}'.")
-
-    _LOGGER.warning(f"⚠️ Duplicate columns: {duplicate_columns}")
-
+


 def info():
     _script_info(__all__)
{dragon_ml_toolbox-3.12.0 → dragon_ml_toolbox-3.12.1}/ml_tools/path_manager.py CHANGED

@@ -102,7 +102,7 @@ class PathManager:
         for key in new_paths:
             if key in self._paths:
                 raise KeyError(
-                    f"Path key '{key}' already exists in the manager. To replace it, call update() with overwrite=True."
+                    f"❌ Path key '{key}' already exists in the manager. To replace it, call update() with overwrite=True."
                 )

         # Resolve any string paths to Path objects before storing
{dragon_ml_toolbox-3.12.0 → dragon_ml_toolbox-3.12.1}/ml_tools/utilities.py CHANGED

@@ -32,28 +32,42 @@ __all__ = [
 def make_fullpath(
     input_path: Union[str, Path],
     make: bool = False,
-    verbose: bool = False
+    verbose: bool = False,
+    enforce: Optional[Literal["directory", "file"]] = None
     ) -> Path:
     """
-    Resolves a string or Path into an absolute Path.
+    Resolves a string or Path into an absolute Path, optionally creating it.

     - If the path exists, it is returned.
     - If the path does not exist and `make=True`, it will:
-        - Create the file if the path has a suffix
+        - Create the file if the path has a suffix
         - Create the directory if it has no suffix
     - If `make=False` and the path does not exist, an error is raised.
+    - If `enforce`, raises an error if the resolved path is not what was enforced.
     - Optionally prints whether the resolved path is a file or directory.

     Parameters:
-        input_path (str | Path):
-
-
+        input_path (str | Path):
+            Path to resolve.
+        make (bool):
+            If True, attempt to create file or directory.
+        verbose (bool):
+            Print classification after resolution.
+        enforce ("directory" | "file" | None):
+            Raises an error if the resolved path is not what was enforced.

     Returns:
         Path: Resolved absolute path.

     Raises:
         ValueError: If the path doesn't exist and can't be created.
+        TypeError: If the final path does not match the `enforce` parameter.
+
+    ## 🗒️ Note:
+
+    Directories with dots will be treated as files.
+
+    Files without extension will be treated as directories.
     """
     path = Path(input_path).expanduser()

@@ -75,6 +89,12 @@ def make_fullpath(
         resolved = path.resolve(strict=True)
     except Exception as e:
         raise ValueError(f"❌ Failed to create {'file' if is_file else 'directory'} '{path}': {e}")
+
+    if enforce == "file" and not resolved.is_file():
+        raise TypeError(f"❌ Path was enforced as a file, but it is not: '{resolved}'")
+
+    if enforce == "directory" and not resolved.is_dir():
+        raise TypeError(f"❌ Path was enforced as a directory, but it is not: '{resolved}'")

     if verbose:
         if resolved.is_file():
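A minimal standalone sketch of the same `enforce` idea with plain pathlib; the helper name and messages here are illustrative, not the toolbox's exact implementation.

```python
from pathlib import Path
from typing import Literal, Optional

def resolve_path(p: str, enforce: Optional[Literal["directory", "file"]] = None) -> Path:
    """Resolve a path and optionally require it to be a file or a directory."""
    resolved = Path(p).expanduser().resolve(strict=True)
    if enforce == "file" and not resolved.is_file():
        raise TypeError(f"Path was enforced as a file, but it is not: '{resolved}'")
    if enforce == "directory" and not resolved.is_dir():
        raise TypeError(f"Path was enforced as a directory, but it is not: '{resolved}'")
    return resolved

print(resolve_path(".", enforce="directory"))  # the current directory passes the check
```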
@@ -87,7 +107,7 @@ def make_fullpath(
     return resolved


-def list_csv_paths(directory: Union[str,Path]) -> dict[str, Path]:
+def list_csv_paths(directory: Union[str,Path], verbose: bool=True) -> dict[str, Path]:
     """
     Lists all `.csv` files in the specified directory and returns a mapping: filenames (without extensions) to their absolute paths.

@@ -101,19 +121,20 @@ def list_csv_paths(directory: Union[str,Path]) -> dict[str, Path]:

     csv_paths = list(dir_path.glob("*.csv"))
     if not csv_paths:
-        raise IOError(f"No CSV files found in directory: {dir_path.name}")
+        raise IOError(f"❌ No CSV files found in directory: {dir_path.name}")

     # make a dictionary of paths and names
     name_path_dict = {p.stem: p for p in csv_paths}

-
-
-
+    if verbose:
+        print("\n🗂️ CSV files found:")
+        for name in name_path_dict.keys():
+            print(f"\t{name}")

     return name_path_dict


-def list_files_by_extension(directory: Union[str,Path], extension: str) -> dict[str, Path]:
+def list_files_by_extension(directory: Union[str,Path], extension: str, verbose: bool=True) -> dict[str, Path]:
     """
     Lists all files with the specified extension in the given directory and returns a mapping:
     filenames (without extensions) to their absolute paths.
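The listing itself is a one-line glob plus a dict comprehension. A self-contained sketch of the pattern used by both helpers (the directory and helper name are hypothetical):

```python
from pathlib import Path

def list_paths_by_extension(directory: str, extension: str = "csv", verbose: bool = True) -> dict[str, Path]:
    """Map file stems to absolute paths for every '*.{extension}' file in a directory."""
    dir_path = Path(directory).expanduser().resolve()
    matched = list(dir_path.glob(f"*.{extension}"))
    if not matched:
        raise IOError(f"No '.{extension}' files found in directory: {dir_path}")
    name_path_dict = {p.stem: p for p in matched}
    if verbose:
        for name in name_path_dict:
            print(f"\t{name}")
    return name_path_dict

# Usage (assumes a './data' folder containing CSV files):
# files = list_paths_by_extension("./data", "csv")
```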
@@ -133,13 +154,14 @@ def list_files_by_extension(directory: Union[str,Path], extension: str) -> dict[

     matched_paths = list(dir_path.glob(pattern))
     if not matched_paths:
-        raise IOError(f"No '.{normalized_ext}' files found in directory: {dir_path}")
+        raise IOError(f"❌ No '.{normalized_ext}' files found in directory: {dir_path}")

     name_path_dict = {p.stem: p for p in matched_paths}

-
-
-
+    if verbose:
+        print(f"\n📂 '{normalized_ext.upper()}' files found:")
+        for name in name_path_dict:
+            print(f"\t{name}")

     return name_path_dict

@@ -147,7 +169,8 @@ def list_files_by_extension(directory: Union[str,Path], extension: str) -> dict[
 def load_dataframe(
     df_path: Union[str, Path],
     kind: Literal["pandas", "polars"] = "pandas",
-    all_strings: bool = False
+    all_strings: bool = False,
+    verbose: bool = True
     ) -> Tuple[Union[pd.DataFrame, pl.DataFrame], str]:
     """
     Load a CSV file into a DataFrame and extract its base name.
@@ -191,20 +214,21 @@ def load_dataframe(
         df = pl.read_csv(path, infer_schema_length=1000)

     else:
-        raise ValueError(f"Invalid kind '{kind}'. Must be one of 'pandas' or 'polars'.")
+        raise ValueError(f"❌ Invalid kind '{kind}'. Must be one of 'pandas' or 'polars'.")

     # This check works for both pandas and polars DataFrames
     if df.shape[0] == 0:
-        raise ValueError(f"DataFrame '{df_name}' loaded from '{path}' is empty.")
+        raise ValueError(f"❌ DataFrame '{df_name}' loaded from '{path}' is empty.")

-
+    if verbose:
+        print(f"\n💾 Loaded {kind.upper()} dataset: '{df_name}' with shape: {df.shape}")

     return df, df_name


-def yield_dataframes_from_dir(datasets_dir: Union[str,Path]):
+def yield_dataframes_from_dir(datasets_dir: Union[str,Path], verbose: bool=True):
     """
-    Iterates over all CSV files in a given directory, loading each into a
+    Iterates over all CSV files in a given directory, loading each into a Pandas DataFrame.

     Parameters:
         datasets_dir (str | Path):
@@ -221,9 +245,10 @@ def yield_dataframes_from_dir(datasets_dir: Union[str,Path]):
     - Output is streamed via a generator to support lazy loading of multiple datasets.
     """
     datasets_path = make_fullpath(datasets_dir)
-
+    files_dict = list_csv_paths(datasets_path, verbose=verbose)
+    for df_name, df_path in files_dict.items():
         df: pd.DataFrame
-        df, _ = load_dataframe(df_path, kind="pandas") # type: ignore
+        df, _ = load_dataframe(df_path, kind="pandas", verbose=verbose) # type: ignore
         yield df, df_name


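As a usage sketch, iterating the generator looks roughly like this; the import path `ml_tools.utilities` and the `./datasets` directory are assumptions.

```python
# Hypothetical usage; assumes the package is installed and './datasets' holds CSV files.
from ml_tools.utilities import yield_dataframes_from_dir

for df, df_name in yield_dataframes_from_dir("./datasets", verbose=False):
    print(df_name, df.shape)
```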
@@ -253,35 +278,35 @@ def merge_dataframes(
     - If column names or order differ for vertical merge.
     """
     if len(dfs) < 2:
-        raise ValueError("At least 2 DataFrames must be provided.")
+        raise ValueError("❌ At least 2 DataFrames must be provided.")

     if verbose:
         for i, df in enumerate(dfs, start=1):
-            print(f"DataFrame {i} shape: {df.shape}")
+            print(f"➡️ DataFrame {i} shape: {df.shape}")


     if direction == "horizontal":
         reference_index = dfs[0].index
         for i, df in enumerate(dfs, start=1):
             if not df.index.equals(reference_index):
-                raise ValueError(f"Indexes do not match: Dataset 1 and Dataset {i}.")
+                raise ValueError(f"❌ Indexes do not match: Dataset 1 and Dataset {i}.")
         merged_df = pd.concat(dfs, axis=1)

     elif direction == "vertical":
         reference_columns = dfs[0].columns
         for i, df in enumerate(dfs, start=1):
             if not df.columns.equals(reference_columns):
-                raise ValueError(f"Column names/order do not match: Dataset 1 and Dataset {i}.")
+                raise ValueError(f"❌ Column names/order do not match: Dataset 1 and Dataset {i}.")
         merged_df = pd.concat(dfs, axis=0)

     else:
-        raise ValueError(f"Invalid merge direction: {direction}")
+        raise ValueError(f"❌ Invalid merge direction: {direction}")

     if reset_index:
         merged_df = merged_df.reset_index(drop=True)

     if verbose:
-        print(f"Merged DataFrame shape: {merged_df.shape}")
+        print(f"\n✅ Merged DataFrame shape: {merged_df.shape}")

     return merged_df

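For reference, the vertical branch is a `pd.concat` guarded by a column check. A self-contained sketch of that behavior with toy frames:

```python
import pandas as pd

df1 = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
df2 = pd.DataFrame({"a": [5, 6], "b": [7, 8]})

# Vertical merge: column names and order must match across all frames
if not df2.columns.equals(df1.columns):
    raise ValueError("Column names/order do not match: Dataset 1 and Dataset 2.")
merged = pd.concat([df1, df2], axis=0).reset_index(drop=True)
print(merged.shape)  # (4, 2)
```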
@@ -320,9 +345,9 @@ def save_dataframe(df: Union[pd.DataFrame, pl.DataFrame], save_dir: Union[str,Pa
         df.write_csv(output_path) # Polars defaults to utf8 and no index
     else:
         # This error handles cases where an unsupported type is passed
-        raise TypeError(f"Unsupported DataFrame type: {type(df)}. Must be pandas or polars.")
+        raise TypeError(f"❌ Unsupported DataFrame type: {type(df)}. Must be pandas or polars.")

-    print(f"✅ Saved dataset: '{filename}' with shape: {df.shape}")
+    print(f"\n✅ Saved dataset: '{filename}' with shape: {df.shape}")


 def normalize_mixed_list(data: list, threshold: int = 2) -> list[float]:
@@ -356,7 +381,7 @@ def normalize_mixed_list(data: list, threshold: int = 2) -> list[float]:

     # Raise for negative values
     if any(x < 0 for x in float_list):
-        raise ValueError("Negative values are not allowed in the input list.")
+        raise ValueError("❌ Negative values are not allowed in the input list.")

     # Step 2: Compute log10 of non-zero values
     nonzero = [x for x in float_list if x > 0]
@@ -395,7 +420,7 @@ def sanitize_filename(filename: str) -> str:
     - Removing or replacing characters invalid in filenames.

     Args:
-
+        filename (str): Base filename.

     Returns:
         str: A sanitized string suitable to use as a filename.
@@ -408,6 +433,10 @@ def sanitize_filename(filename: str) -> str:

     # Conservative filter to keep filenames safe across platforms
     sanitized = re.sub(r'[^\w\-.]', '', sanitized)
+
+    # Check for empty string after sanitization
+    if not sanitized:
+        raise ValueError("The sanitized filename is empty. The original input may have contained only invalid characters.")

     return sanitized

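A minimal standalone sketch of the sanitization plus the new emptiness check; only the regex and the check appear in the diff, the whitespace handling before the regex is an assumption.

```python
import re

def sanitize_filename(filename: str) -> str:
    # Assumed pre-step: trim and replace spaces (not shown in the diff hunk)
    sanitized = filename.strip().replace(" ", "_")
    # Conservative filter to keep filenames safe across platforms
    sanitized = re.sub(r'[^\w\-.]', '', sanitized)
    # New in 3.12.1: refuse inputs that sanitize down to nothing
    if not sanitized:
        raise ValueError("The sanitized filename is empty. The original input may have contained only invalid characters.")
    return sanitized

print(sanitize_filename("my report (final)?.csv"))  # my_report_final.csv
```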
@@ -418,6 +447,8 @@ def threshold_binary_values(
     ) -> Union[np.ndarray, pd.Series, pl.Series, list[float], tuple[float]]:
     """
     Thresholds binary features in a 1D input. The number of binary features are counted starting from the end.
+
+    Binary elements are converted to 0 or 1 using a 0.5 threshold.

     Parameters:
         input_array: 1D sequence, NumPy array, pandas Series, or polars Series.
@@ -426,7 +457,8 @@ def threshold_binary_values(
         - If `int`, only this many last `binary_values` are thresholded.

     Returns:
-
+        Any:
+            Same type as input
     """
     original_type = type(input_array)

@@ -437,14 +469,14 @@ def threshold_binary_values(
     elif isinstance(input_array, (list, tuple)):
         array = np.array(input_array)
     else:
-        raise TypeError("Unsupported input type")
+        raise TypeError("❌ Unsupported input type")

     array = array.flatten()
     total = array.shape[0]

     bin_count = total if binary_values is None else binary_values
     if not (0 <= bin_count <= total):
-        raise ValueError("binary_values must be between 0 and the total number of elements")
+        raise ValueError("❌ binary_values must be between 0 and the total number of elements")

     if bin_count == 0:
         result = array
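A self-contained sketch of the 0.5 thresholding applied to the last `binary_values` elements of a 1D array, using numpy only; the toolbox version additionally round-trips pandas/polars types, and the tie behavior at exactly 0.5 is an assumption here.

```python
import numpy as np

def threshold_tail(values: np.ndarray, binary_values: int) -> np.ndarray:
    """Threshold the last `binary_values` entries at 0.5; leave the rest untouched."""
    arr = np.asarray(values, dtype=float).flatten()
    if not (0 <= binary_values <= arr.shape[0]):
        raise ValueError("binary_values must be between 0 and the total number of elements")
    out = arr.copy()
    if binary_values > 0:
        tail = out[-binary_values:]
        out[-binary_values:] = (tail > 0.5).astype(float)  # strict '>' is an assumption
    return out

print(threshold_tail(np.array([0.3, 1.7, 0.4, 0.9]), binary_values=2))  # [0.3 1.7 0.  1. ]
```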
@@ -484,9 +516,9 @@ def threshold_binary_values_batch(
     np.ndarray
         Thresholded array, same shape as input.
     """
-    assert input_array.ndim == 2, f"Expected 2D array, got {input_array.ndim}D"
+    assert input_array.ndim == 2, f"❌ Expected 2D array, got {input_array.ndim}D"
     batch_size, total_features = input_array.shape
-    assert 0 <= binary_values <= total_features, "binary_values out of valid range"
+    assert 0 <= binary_values <= total_features, "❌ binary_values out of valid range"

     if binary_values == 0:
         return input_array.copy()
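The batch variant applies the same rule column-wise over the trailing features of a 2D array. A small sketch, again a plain numpy illustration rather than the toolbox's exact code:

```python
import numpy as np

def threshold_tail_batch(batch: np.ndarray, binary_values: int) -> np.ndarray:
    """Threshold the last `binary_values` columns of a (batch, features) array at 0.5."""
    assert batch.ndim == 2, f"Expected 2D array, got {batch.ndim}D"
    out = batch.astype(float).copy()
    if binary_values > 0:
        out[:, -binary_values:] = (out[:, -binary_values:] > 0.5).astype(float)
    return out

x = np.array([[0.2, 0.7, 0.4],
              [1.5, 0.1, 0.9]])
print(threshold_tail_batch(x, binary_values=2))
# [[0.2 1.  0. ]
#  [1.5 0.  1. ]]
```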
@@ -523,7 +555,7 @@ def serialize_object(obj: Any, save_dir: Union[str,Path], filename: str, verbose
             return None
     else:
         if verbose:
-            print(f"✅ Object of type '{type(obj)}' saved to '{full_path}'")
+            print(f"\n✅ Object of type '{type(obj)}' saved to '{full_path}'")
         return None


@@ -550,7 +582,7 @@ def deserialize_object(filepath: Union[str,Path], verbose: bool=True, raise_on_e
             return None
     else:
         if verbose:
-            print(f"✅ Loaded object of type '{type(obj)}'")
+            print(f"\n✅ Loaded object of type '{type(obj)}'")
         return obj


All remaining files listed above with +0 -0 are unchanged; they were only moved from the dragon_ml_toolbox-3.12.0/ prefix to dragon_ml_toolbox-3.12.1/.