dragon-ml-toolbox 3.12.0__py3-none-any.whl → 3.12.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dragon-ml-toolbox might be problematic.

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dragon-ml-toolbox
- Version: 3.12.0
+ Version: 3.12.1
  Summary: A collection of tools for data science and machine learning projects.
  Author-email: Karl Loza <luigiloza@gmail.com>
  License-Expression: MIT
@@ -1,26 +1,26 @@
- dragon_ml_toolbox-3.12.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
- dragon_ml_toolbox-3.12.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837
+ dragon_ml_toolbox-3.12.1.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
+ dragon_ml_toolbox-3.12.1.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837
  ml_tools/ETL_engineering.py,sha256=yeZsW_7zRvEcuMZbM4E2GV1dxwBoWIeJAcFFk2AK0fY,39502
  ml_tools/GUI_tools.py,sha256=VonZEizPS0ncm8HWU-ik-SgcXKryJU8eSG7NN0QN9cc,42222
- ml_tools/MICE_imputation.py,sha256=rYqvwQDVtoAJJ0agXWoGzoZEHedWiA6QzcEKEIkiZ08,11388
+ ml_tools/MICE_imputation.py,sha256=7CDsIQxx5Jb_DwPAmWmz3FXcn85sUyH7g9UcZ1_E07s,11412
  ml_tools/ML_callbacks.py,sha256=g_9nSzoA22UJOQZCPKeDz-Ayh0ECFZLzRd6rZ8SokrE,13080
  ml_tools/ML_evaluation.py,sha256=oiDV6HItQloUUKCUpltV-2pogubWLBieGpc-VUwosAQ,10106
  ml_tools/ML_trainer.py,sha256=gGXAu65v_5yYCqKqmHpSLJ3yY0M_Scr_nJ6qHBHSK1k,14487
  ml_tools/ML_tutorial.py,sha256=m5mZPULhO4mOpfp32fM_mUNVduv-S2hoKNbsZObNI4k,12233
- ml_tools/PSO_optimization.py,sha256=64sQCavw8ecFr318-fugnax8LhjSWiR4aiH6aYiVD2k,24839
+ ml_tools/PSO_optimization.py,sha256=1wRM-goZSwCji5LQVDP1VjF0LyGN5-QWBvofbwfjQRQ,24780
  ml_tools/RNN_forecast.py,sha256=IZLcPs3by0Chei7ill_Grjxs7BBUnzau0Oavi3dWiyE,1886
- ml_tools/VIF_factor.py,sha256=BeP4ig3l7b1Igwgte9z8rEwHdSZvVT7W_9mcBHGoNJw,10299
+ ml_tools/VIF_factor.py,sha256=gD3sZ9HBdTHlf4gbvUvx6kKczO_JFxMZKTXw1h0KVCg,10365
  ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  ml_tools/_pytorch_models.py,sha256=bpWZsrSwCvHJQkR6UfoPpElsMv9AvmiNErNHC8NYB_I,10132
- ml_tools/data_exploration.py,sha256=M7bn2q5XN9zJZJGAmMMFSFFZh8LGzC2arFelrXw3N6Q,25241
+ ml_tools/data_exploration.py,sha256=ZpjK_lN5mDhjf9iQpvyYNA2SF7M5q4D5m09saln7YFI,25241
  ml_tools/datasetmaster.py,sha256=S3PKHNQZ9cyAOck8xQltVLZhaD1gFLfgHFL-aRjz4JU,30077
  ml_tools/ensemble_learning.py,sha256=D-9IbOKtCvyAB-LbPu3sdSRtdp0RZIcQEZcyMnarHmQ,45758
- ml_tools/handle_excel.py,sha256=lwds7rDLlGSCWiWGI7xNg-Z7kxAepogp0lstSFa0590,12949
+ ml_tools/handle_excel.py,sha256=2Q_MBArss4emPQ8p-Uj9x_e7wGg3OoYM2AU_HG59UCY,12978
  ml_tools/keys.py,sha256=A3mLrtLZrxL27whAs2F1GPqZ1KzJpxBp6QbhxY5ioPI,636
  ml_tools/logger.py,sha256=UkbiU9ihBhw9VKyn3rZzisdClWV94EBV6B09_D0iUU0,6026
- ml_tools/path_manager.py,sha256=OCpESgdftbi6mOxetDMIaHhazt4N-W8pJx11X3-yNOs,8305
- ml_tools/utilities.py,sha256=FW97hMTLLxjDR1so-C-_yDm_iz2z_YfirRXjG_IwSLo,22843
- dragon_ml_toolbox-3.12.0.dist-info/METADATA,sha256=JD5pg6MBVM3stGknoD2vwec1pKgykEwNVtRmanRV2sw,3274
- dragon_ml_toolbox-3.12.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- dragon_ml_toolbox-3.12.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
- dragon_ml_toolbox-3.12.0.dist-info/RECORD,,
+ ml_tools/path_manager.py,sha256=1LD9JFzqVyJQl2kTA7tK930_IV3qxfiV4cMIBzItytY,8309
+ ml_tools/utilities.py,sha256=Vh4ZdI03g8EpgQL7KDwnAw2vtBlHtx6KxCuAATxLvT4,24208
+ dragon_ml_toolbox-3.12.1.dist-info/METADATA,sha256=vwEN95BhK71LrhuuTuZbxdyfdq_X5VljuP89uXNguok,3274
+ dragon_ml_toolbox-3.12.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ dragon_ml_toolbox-3.12.1.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+ dragon_ml_toolbox-3.12.1.dist-info/RECORD,,
@@ -35,7 +35,7 @@ def apply_mice(df: pd.DataFrame, df_name: str, binary_columns: Optional[list[str
  imputed_datasets = [kernel.complete_data(dataset=i) for i in range(resulting_datasets)]

  if imputed_datasets is None or len(imputed_datasets) == 0:
- raise ValueError("No imputed datasets were generated. Check the MICE process.")
+ raise ValueError("No imputed datasets were generated. Check the MICE process.")

  # threshold binary columns
  if binary_columns is not None:
@@ -56,8 +56,8 @@ def apply_mice(df: pd.DataFrame, df_name: str, binary_columns: Optional[list[str

  # Ensure indexes match
  for imputed_df, subname in zip(imputed_datasets, imputed_dataset_names):
- assert imputed_df.shape[0] == df.shape[0], f"Row count mismatch in dataset {subname}" # type: ignore
- assert all(imputed_df.index == df.index), f"Index mismatch in dataset {subname}" # type: ignore
+ assert imputed_df.shape[0] == df.shape[0], f"Row count mismatch in dataset {subname}" # type: ignore
+ assert all(imputed_df.index == df.index), f"Index mismatch in dataset {subname}" # type: ignore
  # print("✅ All imputed datasets match the original DataFrame indexes.")

  return kernel, imputed_datasets, imputed_dataset_names
@@ -90,7 +90,7 @@ def get_convergence_diagnostic(kernel: mf.ImputationKernel, imputed_dataset_name
  dataset_count = kernel.num_datasets

  if dataset_count != len(imputed_dataset_names):
- raise ValueError(f"Expected {dataset_count} names in imputed_dataset_names, got {len(imputed_dataset_names)}")
+ raise ValueError(f"Expected {dataset_count} names in imputed_dataset_names, got {len(imputed_dataset_names)}")

  # Check path
  root_path = make_fullpath(root_dir, make=True)
@@ -152,7 +152,7 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
  """Helper function to add labels and legends to a figure"""

  if not isinstance(fig, ggplot):
- raise TypeError("Expected a plotnine.ggplot object")
+ raise TypeError("Expected a plotnine.ggplot object")

  # Edit labels and title
  fig = fig + theme(
@@ -166,7 +166,7 @@ def get_imputed_distributions(kernel: mf.ImputationKernel, df_name: str, root_di
  fig = fig.draw()

  if not hasattr(fig, 'axes') or len(fig.axes) == 0:
- raise RuntimeError("Rendered figure has no axes to modify")
+ raise RuntimeError("Rendered figure has no axes to modify")

  if filename == "Combined_Distributions":
  custom_xlabel = "Feature Values"
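The hunks above only re-indent error messages inside `apply_mice` and its helpers; judging by the function names and the RECORD entry above, they belong to ml_tools/MICE_imputation.py. For orientation, a minimal usage sketch, assuming `miceforest` is installed and using only the parameters visible in the hunk headers (the DataFrame is made up):

```python
import numpy as np
import pandas as pd
from ml_tools.MICE_imputation import apply_mice  # assumed import path

df = pd.DataFrame({
    "age": [34.0, np.nan, 51.0, 42.0, np.nan],
    "income": [52_000.0, 61_000.0, np.nan, 58_000.0, 49_000.0],
})

# Per the hunks above, apply_mice returns the fitted kernel plus the completed
# datasets and their names.
kernel, imputed_datasets, imputed_dataset_names = apply_mice(
    df=df, df_name="toy_dataset", binary_columns=None
)

for imputed_df, name in zip(imputed_datasets, imputed_dataset_names):
    print(name, imputed_df.isna().sum().sum())  # expect no remaining missing values
```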
@@ -530,10 +530,8 @@ def plot_optimal_feature_distributions(results_dir: Union[str, Path], save_dir:
  results_path = make_fullpath(results_dir)
  output_path = make_fullpath(save_dir, make=True)

- all_csvs = list_csv_paths(results_path)
- if not all_csvs:
- _LOGGER.warning("⚠️ No data found. No plots will be generated.")
- return
+ # Check that the directory contains csv files
+ list_csv_paths(results_path, verbose=False)

  # --- Data Loading and Preparation ---
  _LOGGER.info(f"📁 Starting analysis from results in: '{results_dir}'")
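This hunk (by elimination from the RECORD sizes above, most likely ml_tools/PSO_optimization.py) replaces a warn-and-return guard with a call to `list_csv_paths(..., verbose=False)`, which raises `IOError` when the directory holds no CSV files (see the utilities.py hunks below). A hedged sketch of the new failure mode, with an illustrative directory name and an assumed import path:

```python
from pathlib import Path
from ml_tools.PSO_optimization import plot_optimal_feature_distributions  # assumed import path

results_dir = Path("optimization_results")  # hypothetical directory of result CSVs

try:
    plot_optimal_feature_distributions(results_dir=results_dir, save_dir="plots")
except IOError as exc:
    # 3.12.0 logged a warning and returned on an empty directory;
    # 3.12.1 lets list_csv_paths raise instead.
    print(f"No CSV results to plot: {exc}")
```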
ml_tools/VIF_factor.py CHANGED
@@ -26,8 +26,7 @@ def compute_vif(
  save_dir: Optional[Union[str,Path]] = None,
  filename: Optional[str] = None,
  fontsize: int = 14,
- show_plot: bool = True,
- verbose: bool = True
+ show_plot: bool = True
  ) -> pd.DataFrame:
  """
  Computes Variance Inflation Factors (VIF) for numeric columns in a DataFrame. Optionally, generates a bar plot of VIF values.
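The `verbose` parameter is removed from `compute_vif`; its column warnings now always go through `_LOGGER`. A minimal call sketch under that assumption (the data is illustrative):

```python
import pandas as pd
from ml_tools.VIF_factor import compute_vif

df = pd.DataFrame({
    "x1": [1.0, 2.0, 3.0, 4.0, 5.0],
    "x2": [2.1, 3.9, 6.2, 8.1, 9.9],   # nearly collinear with x1
    "noise": [0.3, 0.1, 0.4, 0.2, 0.5],
})

# 3.12.0: compute_vif(df, show_plot=False, verbose=False)
# 3.12.1: the verbose keyword is gone; passing it now raises TypeError.
vif_df = compute_vif(df, show_plot=False)
print(vif_df)
```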
@@ -54,21 +53,20 @@ def compute_vif(
  if use_columns is None:
  sanitized_columns = df.select_dtypes(include='number').columns.tolist()
  missing_features = set(ground_truth_cols) - set(sanitized_columns)
- if missing_features and verbose:
+ if missing_features:
  _LOGGER.warning(f"⚠️ These columns are not Numeric:\n{missing_features}")
  else:
  sanitized_columns = list()
  for feature in use_columns:
  if feature not in ground_truth_cols:
- if verbose:
- _LOGGER.warning(f"⚠️ The provided column '{feature}' is not in the DataFrame.")
+ _LOGGER.warning(f"⚠️ The provided column '{feature}' is not in the DataFrame.")
  else:
  sanitized_columns.append(feature)

  if ignore_columns is not None and use_columns is None:
  missing_ignore = set(ignore_columns) - set(ground_truth_cols)
- if missing_ignore and verbose:
- _LOGGER.warning(f"⚠️ Warning: The following 'columns to ignore' are not in the Dataframe:\n{missing_ignore}")
+ if missing_ignore:
+ _LOGGER.warning(f"⚠️ Warning: The following 'columns to ignore' are not found in the Dataframe:\n{missing_ignore}")
  sanitized_columns = [f for f in sanitized_columns if f not in ignore_columns]

  X = df[sanitized_columns].copy()
@@ -139,7 +137,7 @@ def compute_vif(
  filename += ".svg"
  full_save_path = save_path / filename
  plt.savefig(full_save_path, format='svg', bbox_inches='tight')
- print(f"\tSaved VIF plot: '{filename}'")
+ _LOGGER.info(f" Saved VIF plot: '{filename}'")

  if show_plot:
  plt.show()
@@ -164,11 +162,16 @@ def drop_vif_based(df: pd.DataFrame, vif_df: pd.DataFrame, threshold: float = 10
  """
  # Ensure expected structure
  if 'feature' not in vif_df.columns or 'VIF' not in vif_df.columns:
- raise ValueError("`vif_df` must contain 'feature' and 'VIF' columns.")
+ raise ValueError("'vif_df' must contain 'feature' and 'VIF' columns.")

  # Identify features to drop
  to_drop = vif_df[vif_df["VIF"] > threshold]["feature"].tolist()
- _LOGGER.info(f"🗑️ Dropping {len(to_drop)} column(s) with VIF > {threshold}: {to_drop}")
+ if len(to_drop) > 0:
+ _LOGGER.info(f"🗑️ Dropping {len(to_drop)} column(s) with VIF > {threshold}:")
+ for dropped_column in to_drop:
+ print(f"\t{dropped_column}")
+ else:
+ _LOGGER.info(f"No columns exceed the VIF threshold of '{threshold}'.")

  result_df = df.drop(columns=to_drop)

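`drop_vif_based` now logs each dropped column on its own line and reports when nothing exceeds the threshold. A sketch of the two-step workflow these hunks imply; the exact return value of `drop_vif_based` is not shown in the diff, so treating it as the reduced DataFrame is an assumption:

```python
import pandas as pd
from ml_tools.VIF_factor import compute_vif, drop_vif_based

df = pd.DataFrame({
    "a": [1.0, 2.0, 3.0, 4.0, 5.0],
    "b": [2.0, 4.1, 5.9, 8.2, 9.8],   # highly correlated with "a"
    "c": [5.0, 3.0, 4.0, 2.0, 6.0],
})

vif_df = compute_vif(df, show_plot=False)             # expected to hold 'feature' and 'VIF' columns
reduced = drop_vif_based(df, vif_df, threshold=10.0)  # assumed to return the reduced DataFrame
print(reduced.columns.tolist())
```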
@@ -186,7 +189,7 @@ def compute_vif_multi(input_directory: Union[str, Path],
  max_features_to_plot: int = 20,
  fontsize: int = 14):
  """
- Computes Variance Inflation Factors (VIF) for numeric columns in a directory with CSV files (loaded as pandas DataFrames). No plots or warnings will be displayed inline.
+ Computes Variance Inflation Factors (VIF) for numeric columns in a directory with CSV files (loaded as pandas DataFrames). No plots will be displayed inline.
  Generates a bar plot of VIF values. Optionally drops columns with VIF >= 10 and saves as a new CSV file.

  Args:
@@ -216,8 +219,7 @@ def compute_vif_multi(input_directory: Union[str, Path],
  fontsize=fontsize,
  save_dir=output_plot_directory,
  filename=df_name,
- show_plot=False,
- verbose=False)
+ show_plot=False)

  if output_dataset_path is not None:
  new_filename = df_name + '_VIF'
@@ -143,7 +143,7 @@ def drop_rows_with_missing_data(df: pd.DataFrame, targets: Optional[list[str]],
  feature_na_frac = df_clean[feature_cols].isnull().mean(axis=1)
  rows_to_drop = feature_na_frac[feature_na_frac > threshold].index
  if len(rows_to_drop) > 0:
- print(f"📉 Dropping {len(rows_to_drop)} rows with more than {threshold*100:.0f}% missing feature data.")
+ print(f"🧹 Dropping {len(rows_to_drop)} rows with more than {threshold*100:.0f}% missing feature data.")
  df_clean = df_clean.drop(index=rows_to_drop)
  else:
  print(f"✅ No rows exceed the {threshold*100:.0f}% missing feature data threshold.")
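Only the log emoji changes in this hunk, which by elimination (unchanged size but new hash in the RECORD above) appears to come from ml_tools/data_exploration.py. A hedged usage sketch; the `threshold` keyword and the returned cleaned DataFrame are inferred from the hunk body rather than shown:

```python
import numpy as np
import pandas as pd
from ml_tools.data_exploration import drop_rows_with_missing_data  # assumed module, see note above

df = pd.DataFrame({
    "target": [1.0, 0.0, 1.0, np.nan],
    "f1": [0.2, np.nan, 0.5, np.nan],
    "f2": [np.nan, np.nan, 0.7, np.nan],
})

# Rows whose feature columns are more than 70% missing get dropped.
clean_df = drop_rows_with_missing_data(df, targets=["target"], threshold=0.7)
print(clean_df.shape)
```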
ml_tools/handle_excel.py CHANGED
@@ -36,7 +36,7 @@ def find_excel_files(
  input_path = make_fullpath(directory)

  if not input_path.is_dir():
- raise NotADirectoryError(f"Directory not found: {input_path}")
+ raise NotADirectoryError(f"Directory not found: {input_path}")

  excel_files = [
  f for f in input_path.iterdir()
@@ -46,7 +46,7 @@ def find_excel_files(
  ]

  if not excel_files:
- raise FileNotFoundError(f"No valid Excel files found in directory: {input_path}")
+ raise FileNotFoundError(f"No valid Excel files found in directory: {input_path}")

  return excel_files

@@ -198,7 +198,7 @@ def validate_excel_schema(
  invalid_files.append(file)

  except Exception as e:
- _LOGGER.error(f"Error processing '{file}': {e}")
+ _LOGGER.error(f"Error processing '{file}': {e}")
  invalid_files.append(file)

  valid_excel_number = len(excel_paths) - len(invalid_files)
@@ -251,7 +251,7 @@ def vertical_merge_transform_excel(
  if target_columns is not None:
  missing = [col for col in target_columns if col not in df.columns]
  if missing:
- raise ValueError(f"Invalid columns in {file.name}: {missing}")
+ raise ValueError(f"Invalid columns in {file.name}: {missing}")
  df = df[target_columns]

  dataframes.append(df)
@@ -261,7 +261,7 @@ def vertical_merge_transform_excel(
  if rename_columns is not None:
  expected_len = len(target_columns if target_columns is not None else merged_df.columns)
  if len(rename_columns) != expected_len:
- raise ValueError("Length of 'rename_columns' must match the selected columns")
+ raise ValueError("Length of 'rename_columns' must match the selected columns")
  merged_df.columns = rename_columns

  merged_df.to_csv(csv_path, index=False, encoding='utf-8')
@@ -324,6 +324,9 @@ def horizontal_merge_transform_excel(
  merged_df = pd.concat(padded_dataframes, axis=1)

  duplicate_columns = merged_df.columns[merged_df.columns.duplicated()].tolist()
+
+ if duplicate_columns:
+ _LOGGER.warning(f"⚠️ Duplicate columns: {duplicate_columns}")

  if skip_duplicates:
  merged_df = merged_df.loc[:, ~merged_df.columns.duplicated()]
@@ -344,9 +347,7 @@ def horizontal_merge_transform_excel(
  merged_df.to_csv(csv_path, index=False, encoding='utf-8')

  _LOGGER.info(f"✅ Merged {len(excel_files)} Excel files into '{csv_filename}'.")
- if duplicate_columns:
- _LOGGER.warning(f"⚠️ Duplicate columns: {duplicate_columns}")
-
+

  def info():
  _script_info(__all__)
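The functional change in `horizontal_merge_transform_excel` is that the duplicate-column warning now fires right after the concat instead of after the CSV is written. The underlying pandas pattern, shown standalone (plain pandas, not this package's API):

```python
import pandas as pd

left = pd.DataFrame({"id": [1, 2], "value": [10, 20]})
right = pd.DataFrame({"id": [1, 2], "score": [0.5, 0.9]})

merged = pd.concat([left, right], axis=1)

# Same duplicate check the diff moves earlier: column labels repeated after the merge.
duplicate_columns = merged.columns[merged.columns.duplicated()].tolist()
if duplicate_columns:
    print(f"Duplicate columns: {duplicate_columns}")

# skip_duplicates-style cleanup, keeping the first occurrence of each label.
deduped = merged.loc[:, ~merged.columns.duplicated()]
print(deduped.columns.tolist())
```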
ml_tools/path_manager.py CHANGED
@@ -102,7 +102,7 @@ class PathManager:
  for key in new_paths:
  if key in self._paths:
  raise KeyError(
- f"Path key '{key}' already exists in the manager. To replace it, call update() with overwrite=True."
+ f"Path key '{key}' already exists in the manager. To replace it, call update() with overwrite=True."
  )

  # Resolve any string paths to Path objects before storing
ml_tools/utilities.py CHANGED
@@ -32,28 +32,42 @@ __all__ = [
  def make_fullpath(
  input_path: Union[str, Path],
  make: bool = False,
- verbose: bool = False
+ verbose: bool = False,
+ enforce: Optional[Literal["directory", "file"]] = None
  ) -> Path:
  """
- Resolves a string or Path into an absolute Path.
+ Resolves a string or Path into an absolute Path, optionally creating it.

  - If the path exists, it is returned.
  - If the path does not exist and `make=True`, it will:
- - Create the file if the path has a suffix (i.e., is treated as a file)
+ - Create the file if the path has a suffix
  - Create the directory if it has no suffix
  - If `make=False` and the path does not exist, an error is raised.
+ - If `enforce`, raises an error if the resolved path is not what was enforced.
  - Optionally prints whether the resolved path is a file or directory.

  Parameters:
- input_path (str | Path): Path to resolve.
- make (bool): If True, attempt to create file or directory.
- verbose (bool): Print classification after resolution.
+ input_path (str | Path):
+ Path to resolve.
+ make (bool):
+ If True, attempt to create file or directory.
+ verbose (bool):
+ Print classification after resolution.
+ enforce ("directory" | "file" | None):
+ Raises an error if the resolved path is not what was enforced.

  Returns:
  Path: Resolved absolute path.

  Raises:
  ValueError: If the path doesn't exist and can't be created.
+ TypeError: If the final path does not match the `enforce` parameter.
+
+ ## 🗒️ Note:
+
+ Directories with dots will be treated as files.
+
+ Files without extension will be treated as directories.
  """
  path = Path(input_path).expanduser()

@@ -75,6 +89,12 @@ def make_fullpath(
  resolved = path.resolve(strict=True)
  except Exception as e:
  raise ValueError(f"❌ Failed to create {'file' if is_file else 'directory'} '{path}': {e}")
+
+ if enforce == "file" and not resolved.is_file():
+ raise TypeError(f"❌ Path was enforced as a file, but it is not: '{resolved}'")
+
+ if enforce == "directory" and not resolved.is_dir():
+ raise TypeError(f"❌ Path was enforced as a directory, but it is not: '{resolved}'")
  if verbose:
  if resolved.is_file():
  if resolved.is_file():
@@ -87,7 +107,7 @@ def make_fullpath(
  return resolved


- def list_csv_paths(directory: Union[str,Path]) -> dict[str, Path]:
+ def list_csv_paths(directory: Union[str,Path], verbose: bool=True) -> dict[str, Path]:
  """
  Lists all `.csv` files in the specified directory and returns a mapping: filenames (without extensions) to their absolute paths.

@@ -101,19 +121,20 @@ def list_csv_paths(directory: Union[str,Path]) -> dict[str, Path]:

  csv_paths = list(dir_path.glob("*.csv"))
  if not csv_paths:
- raise IOError(f"No CSV files found in directory: {dir_path.name}")
+ raise IOError(f"No CSV files found in directory: {dir_path.name}")

  # make a dictionary of paths and names
  name_path_dict = {p.stem: p for p in csv_paths}

- print("\n🗂️ CSV files found:")
- for name in name_path_dict.keys():
- print(f"\t{name}")
+ if verbose:
+ print("\n🗂️ CSV files found:")
+ for name in name_path_dict.keys():
+ print(f"\t{name}")

  return name_path_dict


- def list_files_by_extension(directory: Union[str,Path], extension: str) -> dict[str, Path]:
+ def list_files_by_extension(directory: Union[str,Path], extension: str, verbose: bool=True) -> dict[str, Path]:
  """
  Lists all files with the specified extension in the given directory and returns a mapping:
  filenames (without extensions) to their absolute paths.
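Two additions shown above: `make_fullpath` gains an `enforce` guard that raises `TypeError` on a mismatch, and the listing helpers gain a `verbose` switch. A small sketch combining both (the paths are hypothetical):

```python
from ml_tools.utilities import make_fullpath, list_csv_paths

# Resolve an existing data directory; enforce="directory" raises TypeError
# if the resolved path is actually a file.
data_dir = make_fullpath("datasets", enforce="directory")  # hypothetical folder

# verbose=False suppresses the "CSV files found" listing added behind the new flag.
# Note: list_csv_paths still raises IOError if the directory holds no CSV files.
csv_map = list_csv_paths(data_dir, verbose=False)  # {file stem: absolute Path}

for name, path in csv_map.items():
    print(name, path)
```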
@@ -133,13 +154,14 @@ def list_files_by_extension(directory: Union[str,Path], extension: str) -> dict[

  matched_paths = list(dir_path.glob(pattern))
  if not matched_paths:
- raise IOError(f"No '.{normalized_ext}' files found in directory: {dir_path}")
+ raise IOError(f"No '.{normalized_ext}' files found in directory: {dir_path}")

  name_path_dict = {p.stem: p for p in matched_paths}

- print(f"\n📂 '{normalized_ext.upper()}' files found:")
- for name in name_path_dict:
- print(f"\t{name}")
+ if verbose:
+ print(f"\n📂 '{normalized_ext.upper()}' files found:")
+ for name in name_path_dict:
+ print(f"\t{name}")

  return name_path_dict

@@ -147,7 +169,8 @@ def list_files_by_extension(directory: Union[str,Path], extension: str) -> dict[
  def load_dataframe(
  df_path: Union[str, Path],
  kind: Literal["pandas", "polars"] = "pandas",
- all_strings: bool = False
+ all_strings: bool = False,
+ verbose: bool = True
  ) -> Tuple[Union[pd.DataFrame, pl.DataFrame], str]:
  """
  Load a CSV file into a DataFrame and extract its base name.
@@ -191,20 +214,21 @@ def load_dataframe(
  df = pl.read_csv(path, infer_schema_length=1000)

  else:
- raise ValueError(f"Invalid kind '{kind}'. Must be one of 'pandas' or 'polars'.")
+ raise ValueError(f"Invalid kind '{kind}'. Must be one of 'pandas' or 'polars'.")

  # This check works for both pandas and polars DataFrames
  if df.shape[0] == 0:
- raise ValueError(f"DataFrame '{df_name}' loaded from '{path}' is empty.")
+ raise ValueError(f"DataFrame '{df_name}' loaded from '{path}' is empty.")

- print(f"\n💿 Loaded {kind} dataset: '{df_name}' with shape: {df.shape}")
+ if verbose:
+ print(f"\n💾 Loaded {kind.upper()} dataset: '{df_name}' with shape: {df.shape}")

  return df, df_name


- def yield_dataframes_from_dir(datasets_dir: Union[str,Path]):
+ def yield_dataframes_from_dir(datasets_dir: Union[str,Path], verbose: bool=True):
  """
- Iterates over all CSV files in a given directory, loading each into a pandas DataFrame.
+ Iterates over all CSV files in a given directory, loading each into a Pandas DataFrame.

  Parameters:
  datasets_dir (str | Path):
@@ -221,9 +245,10 @@ def yield_dataframes_from_dir(datasets_dir: Union[str,Path]):
  - Output is streamed via a generator to support lazy loading of multiple datasets.
  """
  datasets_path = make_fullpath(datasets_dir)
- for df_name, df_path in list_csv_paths(datasets_path).items():
+ files_dict = list_csv_paths(datasets_path, verbose=verbose)
+ for df_name, df_path in files_dict.items():
  df: pd.DataFrame
- df, _ = load_dataframe(df_path, kind="pandas") # type: ignore
+ df, _ = load_dataframe(df_path, kind="pandas", verbose=verbose) # type: ignore
  yield df, df_name

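`load_dataframe` and `yield_dataframes_from_dir` gain the same `verbose` pass-through, so batch loading can run silently. A sketch with hypothetical paths:

```python
from ml_tools.utilities import load_dataframe, yield_dataframes_from_dir

# Single file, silenced: no "Loaded PANDAS dataset ..." banner in 3.12.1.
df, df_name = load_dataframe("datasets/housing.csv", kind="pandas", verbose=False)

# Whole directory: verbose is forwarded to both list_csv_paths and load_dataframe.
for frame, name in yield_dataframes_from_dir("datasets", verbose=False):
    print(name, frame.shape)
```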
@@ -253,35 +278,35 @@ def merge_dataframes(
  - If column names or order differ for vertical merge.
  """
  if len(dfs) < 2:
- raise ValueError("At least 2 DataFrames must be provided.")
+ raise ValueError("At least 2 DataFrames must be provided.")

  if verbose:
  for i, df in enumerate(dfs, start=1):
- print(f"DataFrame {i} shape: {df.shape}")
+ print(f"➡️ DataFrame {i} shape: {df.shape}")


  if direction == "horizontal":
  reference_index = dfs[0].index
  for i, df in enumerate(dfs, start=1):
  if not df.index.equals(reference_index):
- raise ValueError(f"Indexes do not match: Dataset 1 and Dataset {i}.")
+ raise ValueError(f"Indexes do not match: Dataset 1 and Dataset {i}.")
  merged_df = pd.concat(dfs, axis=1)

  elif direction == "vertical":
  reference_columns = dfs[0].columns
  for i, df in enumerate(dfs, start=1):
  if not df.columns.equals(reference_columns):
- raise ValueError(f"Column names/order do not match: Dataset 1 and Dataset {i}.")
+ raise ValueError(f"Column names/order do not match: Dataset 1 and Dataset {i}.")
  merged_df = pd.concat(dfs, axis=0)

  else:
- raise ValueError(f"Invalid merge direction: {direction}")
+ raise ValueError(f"Invalid merge direction: {direction}")

  if reset_index:
  merged_df = merged_df.reset_index(drop=True)

  if verbose:
- print(f"Merged DataFrame shape: {merged_df.shape}")
+ print(f"\n✅ Merged DataFrame shape: {merged_df.shape}")

  return merged_df

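`merge_dataframes` only changes its log formatting; the validation logic is untouched. A hedged sketch — the hunk does not show the full signature, so passing the DataFrames positionally with the `direction`, `reset_index`, and `verbose` keywords is an assumption:

```python
import pandas as pd
from ml_tools.utilities import merge_dataframes

df_a = pd.DataFrame({"x": [1, 2], "y": [3, 4]})
df_b = pd.DataFrame({"x": [5, 6], "y": [7, 8]})

# Vertical merge requires identical column names/order, per the checks above.
merged = merge_dataframes(df_a, df_b, direction="vertical", reset_index=True, verbose=True)
print(merged.shape)  # expected (4, 2)
```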
@@ -320,9 +345,9 @@ def save_dataframe(df: Union[pd.DataFrame, pl.DataFrame], save_dir: Union[str,Pa
  df.write_csv(output_path) # Polars defaults to utf8 and no index
  else:
  # This error handles cases where an unsupported type is passed
- raise TypeError(f"Unsupported DataFrame type: {type(df)}. Must be pandas or polars.")
+ raise TypeError(f"Unsupported DataFrame type: {type(df)}. Must be pandas or polars.")

- print(f"✅ Saved dataset: '{filename}' with shape: {df.shape}")
+ print(f"\n✅ Saved dataset: '{filename}' with shape: {df.shape}")


  def normalize_mixed_list(data: list, threshold: int = 2) -> list[float]:
@@ -356,7 +381,7 @@ def normalize_mixed_list(data: list, threshold: int = 2) -> list[float]:

  # Raise for negative values
  if any(x < 0 for x in float_list):
- raise ValueError("Negative values are not allowed in the input list.")
+ raise ValueError("Negative values are not allowed in the input list.")

  # Step 2: Compute log10 of non-zero values
  nonzero = [x for x in float_list if x > 0]
@@ -395,7 +420,7 @@ def sanitize_filename(filename: str) -> str:
  - Removing or replacing characters invalid in filenames.

  Args:
- name (str): Base filename.
+ filename (str): Base filename.

  Returns:
  str: A sanitized string suitable to use as a filename.
@@ -408,6 +433,10 @@ def sanitize_filename(filename: str) -> str:

  # Conservative filter to keep filenames safe across platforms
  sanitized = re.sub(r'[^\w\-.]', '', sanitized)
+
+ # Check for empty string after sanitization
+ if not sanitized:
+ raise ValueError("The sanitized filename is empty. The original input may have contained only invalid characters.")

  return sanitized

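`sanitize_filename` now raises instead of silently returning an empty string. A quick sketch of both outcomes (inputs are arbitrary, and the exact sanitized output depends on replacement steps not shown in this hunk):

```python
from ml_tools.utilities import sanitize_filename

print(sanitize_filename("report 2024: draft?.csv"))  # invalid characters removed or replaced

try:
    sanitize_filename("????")  # nothing survives the conservative filter
except ValueError as exc:
    print(f"Rejected: {exc}")  # new in 3.12.1
```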
@@ -418,6 +447,8 @@ def threshold_binary_values(
  ) -> Union[np.ndarray, pd.Series, pl.Series, list[float], tuple[float]]:
  """
  Thresholds binary features in a 1D input. The number of binary features are counted starting from the end.
+
+ Binary elements are converted to 0 or 1 using a 0.5 threshold.

  Parameters:
  input_array: 1D sequence, NumPy array, pandas Series, or polars Series.
@@ -426,7 +457,8 @@ def threshold_binary_values(
  - If `int`, only this many last `binary_values` are thresholded.

  Returns:
- Same type as input, with binary elements binarized to 0 or 1 using a 0.5 threshold.
+ Any:
+ Same type as input
  """
  original_type = type(input_array)

@@ -437,14 +469,14 @@ def threshold_binary_values(
  elif isinstance(input_array, (list, tuple)):
  array = np.array(input_array)
  else:
- raise TypeError("Unsupported input type")
+ raise TypeError("Unsupported input type")

  array = array.flatten()
  total = array.shape[0]

  bin_count = total if binary_values is None else binary_values
  if not (0 <= bin_count <= total):
- raise ValueError("binary_values must be between 0 and the total number of elements")
+ raise ValueError("binary_values must be between 0 and the total number of elements")

  if bin_count == 0:
  result = array
@@ -484,9 +516,9 @@ def threshold_binary_values_batch(
  np.ndarray
  Thresholded array, same shape as input.
  """
- assert input_array.ndim == 2, f"Expected 2D array, got {input_array.ndim}D"
+ assert input_array.ndim == 2, f"Expected 2D array, got {input_array.ndim}D"
  batch_size, total_features = input_array.shape
- assert 0 <= binary_values <= total_features, "binary_values out of valid range"
+ assert 0 <= binary_values <= total_features, "binary_values out of valid range"

  if binary_values == 0:
  return input_array.copy()
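Both thresholding helpers only had docstrings and error-message indentation touched; behaviour is unchanged: the last `binary_values` entries are snapped to 0/1 at a 0.5 cutoff. Sketch:

```python
import numpy as np
from ml_tools.utilities import threshold_binary_values, threshold_binary_values_batch

# 1D: leave the leading continuous values alone, snap the trailing two to 0/1.
mixed = [3.7, 0.42, 0.81, 0.12]
print(threshold_binary_values(mixed, binary_values=2))  # last two entries become 1 and 0

# 2D batch variant: the last column of every row is treated as binary.
batch = np.array([[1.5, 0.7],
                  [2.2, 0.3]])
print(threshold_binary_values_batch(batch, binary_values=1))
```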
@@ -523,7 +555,7 @@ def serialize_object(obj: Any, save_dir: Union[str,Path], filename: str, verbose
  return None
  else:
  if verbose:
- print(f"✅ Object of type '{type(obj)}' saved to '{full_path}'")
+ print(f"\n✅ Object of type '{type(obj)}' saved to '{full_path}'")
  return None


@@ -550,7 +582,7 @@ def deserialize_object(filepath: Union[str,Path], verbose: bool=True, raise_on_e
  return None
  else:
  if verbose:
- print(f"✅ Loaded object of type '{type(obj)}'")
+ print(f"\n✅ Loaded object of type '{type(obj)}'")
  return obj

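The serialization helpers only gain a leading newline in their success messages. A round-trip sketch; the object, directory, and filename are arbitrary, and the file extension written by `serialize_object` is not visible in this diff:

```python
from ml_tools.utilities import serialize_object, deserialize_object

model_params = {"n_estimators": 200, "max_depth": 8}

# Prints "✅ Object of type ... saved to ..." (now with a leading newline) unless verbose=False.
serialize_object(model_params, save_dir="artifacts", filename="model_params")

saved_path = "artifacts/model_params.joblib"  # placeholder; match whatever extension serialize_object used
restored = deserialize_object(saved_path, verbose=True)
print(restored)
```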