dragon-ml-toolbox 1.4.8__py3-none-any.whl → 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dragon-ml-toolbox might be problematic.

ml_tools/utilities.py CHANGED
@@ -2,7 +2,6 @@ import math
 import numpy as np
 import pandas as pd
 import polars as pl
-import os
 from pathlib import Path
 import re
 from typing import Literal, Union, Sequence, Optional, Any, Iterator, Tuple
@@ -12,6 +11,7 @@ from joblib.externals.loky.process_executor import TerminatedWorkerError
 
 # Keep track of available tools
 __all__ = [
+    "make_fullpath",
     "list_csv_paths",
     "list_files_by_extension",
     "load_dataframe",
@@ -21,33 +21,90 @@ __all__ = [
     "normalize_mixed_list",
     "sanitize_filename",
     "threshold_binary_values",
+    "threshold_binary_values_batch",
     "serialize_object",
     "deserialize_object",
     "distribute_datasets_by_target"
 ]
 
 
-def list_csv_paths(directory: str) -> dict[str, str]:
+def make_fullpath(
+    input_path: Union[str, Path],
+    make: bool = False,
+    verbose: bool = False
+) -> Path:
     """
-    Lists all `.csv` files in the specified directory and returns a mapping: filenames (without extensions) to their absolute paths.
+    Resolves a string or Path into an absolute Path.
+
+    - If the path exists, it is returned.
+    - If the path does not exist and `make=True`, it will:
+        - Create the file if the path has a suffix (i.e., is treated as a file)
+        - Create the directory if it has no suffix
+    - If `make=False` and the path does not exist, an error is raised.
+    - Optionally prints whether the resolved path is a file or directory.
 
     Parameters:
-        directory (str): Path to the directory containing `.csv` files.
+        input_path (str | Path): Path to resolve.
+        make (bool): If True, attempt to create file or directory.
+        verbose (bool): Print classification after resolution.
 
     Returns:
-        (dict[str, str]): Dictionary mapping {filename: filepath}.
+        Path: Resolved absolute path.
+
+    Raises:
+        ValueError: If the path doesn't exist and can't be created.
+    """
+    path = Path(input_path).expanduser()
+
+    is_file = path.suffix != ""
+
+    try:
+        resolved = path.resolve(strict=True)
+    except FileNotFoundError:
+        if not make:
+            raise ValueError(f"❌ Path does not exist: '{path}'")
+
+        try:
+            if is_file:
+                # Create parent directories first
+                path.parent.mkdir(parents=True, exist_ok=True)
+                path.touch(exist_ok=False)
+            else:
+                path.mkdir(parents=True, exist_ok=True)
+            resolved = path.resolve(strict=True)
+        except Exception as e:
+            raise ValueError(f"❌ Failed to create {'file' if is_file else 'directory'} '{path}': {e}")
+
+    if verbose:
+        if resolved.is_file():
+            print("📄 Path is a File")
+        elif resolved.is_dir():
+            print("📁 Path is a Directory")
+        else:
+            print("❓ Path exists but is neither file nor directory")
+
+    return resolved
+
+
+
+def list_csv_paths(directory: Union[str,Path]) -> dict[str, Path]:
     """
-    dir_path = Path(directory).expanduser().resolve()
+    Lists all `.csv` files in the specified directory and returns a mapping: filenames (without extensions) to their absolute paths.
 
-    if not dir_path.is_dir():
-        raise FileNotFoundError(f"Directory not found: {dir_path}")
+    Parameters:
+        directory (str | Path): Path to the directory containing `.csv` files.
+
+    Returns:
+        (dict[str, Path]): Dictionary mapping {filename: filepath}.
+    """
+    dir_path = make_fullpath(directory)
 
     csv_paths = list(dir_path.glob("*.csv"))
     if not csv_paths:
-        raise IOError(f"No CSV files found in directory: {dir_path}")
+        raise IOError(f"No CSV files found in directory: {dir_path.name}")
 
     # make a dictionary of paths and names
-    name_path_dict = {p.stem: str(p) for p in csv_paths}
+    name_path_dict = {p.stem: p for p in csv_paths}
 
     print("\n🗂️ CSV files found:")
     for name in name_path_dict.keys():
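For context, a minimal usage sketch of the new `make_fullpath` helper, based on the docstring and implementation above; the temporary directory and file names are illustrative:

import tempfile
from ml_tools.utilities import make_fullpath

base = tempfile.mkdtemp()

# A missing path with a suffix is treated as a file and created
# (parent directories included) when make=True.
log_file = make_fullpath(f"{base}/run/results.csv", make=True, verbose=True)  # 📄 Path is a File
assert log_file.is_file() and log_file.is_absolute()

# A missing path without a suffix is created as a directory.
out_dir = make_fullpath(f"{base}/outputs", make=True, verbose=True)  # 📁 Path is a Directory

# Without make=True, a missing path raises ValueError.
try:
    make_fullpath(f"{base}/missing")
except ValueError as err:
    print(err)  # ❌ Path does not exist: ...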
@@ -56,22 +113,19 @@ def list_csv_paths(directory: str) -> dict[str, str]:
     return name_path_dict
 
 
-def list_files_by_extension(directory: str, extension: str) -> dict[str, str]:
+def list_files_by_extension(directory: Union[str,Path], extension: str) -> dict[str, Path]:
     """
     Lists all files with the specified extension in the given directory and returns a mapping:
     filenames (without extensions) to their absolute paths.
 
     Parameters:
-        directory (str): Path to the directory to search in.
+        directory (str | Path): Path to the directory to search in.
         extension (str): File extension to search for (e.g., 'json', 'txt').
 
     Returns:
-        (dict[str, str]): Dictionary mapping {filename: filepath}.
+        (dict[str, Path]): Dictionary mapping {filename: filepath}.
     """
-    dir_path = Path(directory).expanduser().resolve()
-
-    if not dir_path.is_dir():
-        raise FileNotFoundError(f"Directory not found: {dir_path}")
+    dir_path = make_fullpath(directory)
 
     # Normalize the extension (remove leading dot if present)
     normalized_ext = extension.lstrip(".").lower()
@@ -81,7 +135,7 @@ def list_files_by_extension(directory: str, extension: str) -> dict[str, str]:
     if not matched_paths:
         raise IOError(f"No '.{normalized_ext}' files found in directory: {dir_path}")
 
-    name_path_dict = {p.stem: str(p) for p in matched_paths}
+    name_path_dict = {p.stem: p for p in matched_paths}
 
     print(f"\n📂 '{normalized_ext.upper()}' files found:")
     for name in name_path_dict:
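Both listing helpers now map names to `pathlib.Path` objects instead of strings, so callers can use `Path` methods directly. A hedged sketch; the directory and extension are hypothetical:

from ml_tools.utilities import list_files_by_extension

# Hypothetical directory containing JSON config files.
configs = list_files_by_extension("data/configs", extension="json")
for name, path in configs.items():
    # Values are Path objects now, so no wrapping in Path() is needed.
    print(name, path.suffix, path.stat().st_size)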
@@ -90,18 +144,18 @@ def list_files_by_extension(directory: str, extension: str) -> dict[str, str]:
     return name_path_dict
 
 
-def load_dataframe(df_path: str) -> tuple[pd.DataFrame, str]:
+def load_dataframe(df_path: Union[str,Path]) -> tuple[pd.DataFrame, str]:
     """
     Load a CSV file into a pandas DataFrame and extract the base name (without extension) from the file path.
 
     Args:
-        df_path (str): The path to the CSV file.
+        df_path (str | Path): The path to the CSV file.
 
     Returns:
         Tuple ([pd.DataFrame, str]):
        A tuple containing the loaded pandas DataFrame and the base name of the file.
    """
-    path = Path(df_path).expanduser().resolve()
+    path = make_fullpath(df_path)
     df = pd.read_csv(path, encoding='utf-8')
     df_name = path.stem
     if df.empty:
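A short sketch of the widened `load_dataframe` signature, which now accepts a string or a `Path`; the CSV path is hypothetical:

from pathlib import Path
from ml_tools.utilities import load_dataframe

# str and Path inputs are both accepted after this change.
df, df_name = load_dataframe(Path("data/housing.csv"))  # hypothetical file
print(df_name)   # "housing" (the file stem)
print(df.shape)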
@@ -110,12 +164,12 @@ def load_dataframe(df_path: str) -> tuple[pd.DataFrame, str]:
     return df, df_name
 
 
-def yield_dataframes_from_dir(datasets_dir: str):
+def yield_dataframes_from_dir(datasets_dir: Union[str,Path]):
     """
     Iterates over all CSV files in a given directory, loading each into a pandas DataFrame.
 
     Parameters:
-        datasets_dir (str):
+        datasets_dir (str | Path):
             The path to the directory containing `.csv` dataset files.
 
     Yields:
@@ -128,7 +182,8 @@ def yield_dataframes_from_dir(datasets_dir: str):
     - CSV files are read using UTF-8 encoding.
     - Output is streamed via a generator to support lazy loading of multiple datasets.
     """
-    for df_name, df_path in list_csv_paths(datasets_dir).items():
+    datasets_path = make_fullpath(datasets_dir)
+    for df_name, df_path in list_csv_paths(datasets_path).items():
         df, _ = load_dataframe(df_path)
         yield df, df_name
 
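Iteration is unchanged apart from the wider input type. A sketch assuming a hypothetical `datasets/` directory of CSV files:

from ml_tools.utilities import yield_dataframes_from_dir

# DataFrames are yielded lazily, one per CSV file in the directory.
for df, name in yield_dataframes_from_dir("datasets"):
    print(f"{name}: {df.shape[0]} rows, {df.shape[1]} columns")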
@@ -192,27 +247,27 @@ def merge_dataframes(
     return merged_df
 
 
-def save_dataframe(df: pd.DataFrame, save_dir: str, filename: str) -> None:
+def save_dataframe(df: pd.DataFrame, save_dir: Union[str,Path], filename: str) -> None:
     """
     Save a pandas DataFrame to a CSV file.
 
     Parameters:
-        df: pandas.DataFrame to save
-        save_dir: str, directory where the CSV file will be saved.
-        filename: str, CSV filename, extension will be added if missing.
+        df (pd.DataFrame): Dataframe to save.
+        save_dir (str | Path): Directory where the CSV file will be saved.
+        filename (str): CSV filename, extension will be added if missing.
     """
     if df.empty:
         print(f"⚠️ Attempting to save an empty DataFrame: '{filename}'. Process Skipped.")
         return
 
-    os.makedirs(save_dir, exist_ok=True)
+    save_path = make_fullpath(save_dir, make=True)
 
     filename = sanitize_filename(filename)
 
     if not filename.endswith('.csv'):
         filename += '.csv'
 
-    output_path = os.path.join(save_dir, filename)
+    output_path = save_path / filename
 
     df.to_csv(output_path, index=False, encoding='utf-8')
     print(f"✅ Saved dataset: '{filename}' with shape: {df.shape}")
@@ -356,26 +411,59 @@ def threshold_binary_values(
         return tuple(result)
     else:
         return result
+
+
+def threshold_binary_values_batch(
+    input_array: np.ndarray,
+    binary_values: int
+) -> np.ndarray:
+    """
+    Threshold the last `binary_values` columns of a 2D NumPy array to binary {0,1} using 0.5 cutoff.
 
+    Parameters
+    ----------
+    input_array : np.ndarray
+        2D array with shape (batch_size, n_features).
+    binary_values : int
+        Number of binary features located at the END of each row.
+
+    Returns
+    -------
+    np.ndarray
+        Thresholded array, same shape as input.
+    """
+    assert input_array.ndim == 2, f"Expected 2D array, got {input_array.ndim}D"
+    batch_size, total_features = input_array.shape
+    assert 0 <= binary_values <= total_features, "binary_values out of valid range"
+
+    if binary_values == 0:
+        return input_array.copy()
+
+    cont_part = input_array[:, :-binary_values] if binary_values < total_features else np.empty((batch_size, 0))
+    bin_part = input_array[:, -binary_values:] > 0.5
+    bin_part = bin_part.astype(np.int32)
+
+    return np.hstack([cont_part, bin_part])
 
-def serialize_object(obj: Any, save_dir: str, filename: str, verbose: bool=True, raise_on_error: bool=False) -> Optional[str]:
+
+def serialize_object(obj: Any, save_dir: Union[str,Path], filename: str, verbose: bool=True, raise_on_error: bool=False) -> Optional[str]:
     """
     Serializes a Python object using joblib; suitable for Python built-ins, numpy, and pandas.
 
     Parameters:
         obj (Any) : The Python object to serialize.
-        save_dir (str) : Directory path where the serialized object will be saved.
+        save_dir (str | Path) : Directory path where the serialized object will be saved.
         filename (str) : Name for the output file, extension will be appended if needed.
 
     Returns:
         (str | None) : The full file path where the object was saved if successful; otherwise, None.
     """
     try:
-        os.makedirs(save_dir, exist_ok=True)
+        save_path = make_fullpath(save_dir, make=True)
         sanitized_name = sanitize_filename(filename)
         if not sanitized_name.endswith('.joblib'):
             sanitized_name = sanitized_name + ".joblib"
-        full_path = os.path.join(save_dir, sanitized_name)
+        full_path = save_path / sanitized_name
         joblib.dump(obj, full_path)
     except (IOError, OSError, TypeError, TerminatedWorkerError) as e:
         message = f"❌ Failed to serialize object of type '{type(obj)}': {e}"
@@ -390,23 +478,22 @@ def serialize_object(obj: Any, save_dir: str, filename: str, verbose: bool=True,
     return full_path
 
 
-def deserialize_object(filepath: str, verbose: bool=True, raise_on_error: bool=True) -> Optional[Any]:
+def deserialize_object(filepath: Union[str,Path], verbose: bool=True, raise_on_error: bool=True) -> Optional[Any]:
     """
     Loads a serialized object from a .joblib file.
 
     Parameters:
-        filepath (str): Full path to the serialized .joblib file.
+        filepath (str | Path): Full path to the serialized .joblib file.
 
     Returns:
         (Any | None): The deserialized Python object, or None if loading fails.
     """
-    if not os.path.exists(filepath):
-        print(f"❌ File does not exist: {filepath}")
-        return None
+    true_filepath = make_fullpath(filepath)
+
     try:
-        obj = joblib.load(filepath)
+        obj = joblib.load(true_filepath)
     except (IOError, OSError, EOFError, TypeError, ValueError) as e:
-        message = f"❌ Failed to deserialize object from '{filepath}': {e}"
+        message = f"❌ Failed to deserialize object from '{true_filepath}': {e}"
         if raise_on_error:
             raise Exception(message)
         else:
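A round-trip sketch of the two joblib helpers under the new path handling; the directory and filename are illustrative. Note that `deserialize_object` now raises `ValueError` (via `make_fullpath`) for a missing file instead of printing and returning `None`:

import numpy as np
from ml_tools.utilities import serialize_object, deserialize_object

scaler_params = {"mean": np.array([0.5, 1.2]), "std": np.array([0.1, 0.3])}

# The directory is created on demand; '.joblib' is appended if missing.
saved_path = serialize_object(scaler_params, save_dir="artifacts", filename="scaler_params")

restored = deserialize_object(saved_path)
assert np.allclose(restored["mean"], scaler_params["mean"])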
@@ -419,7 +506,7 @@ def deserialize_object(filepath: str, verbose: bool=True, raise_on_error: bool=T
 
 
 def distribute_datasets_by_target(
-    df_or_path: Union[pd.DataFrame, str],
+    df_or_path: Union[pd.DataFrame, str, Path],
     target_columns: list[str],
     verbose: bool = False
 ) -> Iterator[Tuple[str, pd.DataFrame]]:
@@ -429,7 +516,7 @@ def distribute_datasets_by_target(
 
     Parameters
     ----------
-    df_or_path : [pd.DataFrame | str]
+    df_or_path : [pd.DataFrame | str | Path]
         Dataframe or path to Dataframe with all feature and target columns ready to split and train a model.
     target_columns : List[str]
         List of target column names to generate per-target DataFrames.
@@ -442,9 +529,10 @@ def distribute_datasets_by_target(
         * Target name.
         * Pandas DataFrame.
     """
-    # Validate path
-    if isinstance(df_or_path, str):
-        df, _ = load_dataframe(df_or_path)
+    # Validate path or dataframe
+    if isinstance(df_or_path, str) or isinstance(df_or_path, Path):
+        df_path = make_fullpath(df_or_path)
+        df, _ = load_dataframe(df_path)
     else:
         df = df_or_path
 
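Finally, a sketch of `distribute_datasets_by_target`, which now accepts a DataFrame, a string, or a `Path`; the column names are hypothetical:

import pandas as pd
from ml_tools.utilities import distribute_datasets_by_target

df = pd.DataFrame({
    "feature_a": [1, 2, 3],
    "feature_b": [4, 5, 6],
    "target_1": [0, 1, 0],
    "target_2": [1.5, 2.5, 3.5],
})

# Yields (target_name, DataFrame) pairs, one per target column.
for target_name, sub_df in distribute_datasets_by_target(df, ["target_1", "target_2"]):
    print(target_name, sub_df.shape)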
dragon_ml_toolbox-1.4.8.dist-info/RECORD DELETED
@@ -1,19 +0,0 @@
-dragon_ml_toolbox-1.4.8.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
-dragon_ml_toolbox-1.4.8.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=jDnniT0tgD0uw1NpjibsPF-qK3wmOKgTykLG2iNQU7E,1840
-ml_tools/MICE_imputation.py,sha256=wIfl8I3SyHUett-0vizaCiv0y_q43-zij8VczsbEIOI,11088
-ml_tools/VIF_factor.py,sha256=HEBsLJy_qSDaPw1Btha5B7omxN4wjJXg-sqoetCjCJw,10016
-ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ml_tools/data_exploration.py,sha256=NfPuN57wL5CXBnRyvIayxaYMe_ZKieHT3ZIcmtO_XIQ,20115
-ml_tools/datasetmaster.py,sha256=EFUEX-tqq94Ak1rXXYR-XaX85olrxvF2EuytdzUK7y0,29131
-ml_tools/ensemble_learning.py,sha256=v_btCkVthuEl3Pu1WipASvU5lGAVbXxxKEMq3boF-HI,37305
-ml_tools/handle_excel.py,sha256=NrCOWSENgb1HdqId_QOdPTjBUIJPePI9a2pnmmBd3lw,12613
-ml_tools/logger.py,sha256=WI7wiGmmALCQPl0AIauw_mPzFNTbaQf0v9J8pojvHUg,4708
-ml_tools/particle_swarm_optimization.py,sha256=_RdlDJalklzJohKVr2FhLAQq1mIHxbxnPrSRvJ3vjO4,22199
-ml_tools/pytorch_models.py,sha256=bpWZsrSwCvHJQkR6UfoPpElsMv9AvmiNErNHC8NYB_I,10132
-ml_tools/trainer.py,sha256=WAZ4EdrZuTOAnGXRWV3XcLNce4s7EKGf2-qchLC08Ik,15702
-ml_tools/utilities.py,sha256=Ir3Yw4SuWMLKnbnl4Qzudn5U8CgcQ7zMtNqcllZMHeM,15682
-ml_tools/vision_helpers.py,sha256=idQ-Ugp1IdsvwXiYyhYa9G3rTRTm37YRpkQDLEpANHM,7701
-dragon_ml_toolbox-1.4.8.dist-info/METADATA,sha256=EcF9Tj-cyQbgMkM0b0gRZEksi3D_L3NVpsbSqhaKNkw,2516
-dragon_ml_toolbox-1.4.8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-dragon_ml_toolbox-1.4.8.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
-dragon_ml_toolbox-1.4.8.dist-info/RECORD,,