dragon-ml-toolbox 2.0.0__py3-none-any.whl → 2.1.0__py3-none-any.whl

This diff shows the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Potentially problematic release: this version of dragon-ml-toolbox might be problematic.

ml_tools/logger.py CHANGED
@@ -1,11 +1,11 @@
-import os
+from pathlib import Path
 from datetime import datetime
 from typing import Union, List, Dict, Any
 import pandas as pd
 from openpyxl.styles import Font, PatternFill
 import traceback
 import json
-from .utilities import sanitize_filename, _script_info
+from .utilities import sanitize_filename, _script_info, make_fullpath
 
 
 __all__ = [
@@ -21,7 +21,7 @@ def custom_logger(
         str,
         BaseException
     ],
-    save_directory: str,
+    save_directory: Union[str, Path],
     log_name: str,
 ) -> None:
     """
@@ -54,10 +54,12 @@ def custom_logger(
         ValueError: If the data type is unsupported.
     """
     try:
-        os.makedirs(save_directory, exist_ok=True)
+        save_path = make_fullpath(save_directory, make=True)
+
         timestamp = datetime.now().strftime(r"%Y%m%d_%H%M%S")
         log_name = sanitize_filename(log_name)
-        base_path = os.path.join(save_directory, f"{log_name}_{timestamp}")
+
+        base_path = save_path / f"{log_name}_{timestamp}"
 
         if isinstance(data, list):
             _log_list_to_txt(data, base_path + ".txt")
@@ -86,7 +88,7 @@ def custom_logger(
         print(f"Error in custom_logger: {e}")
 
 
-def _log_list_to_txt(data: List[Any], path: str) -> None:
+def _log_list_to_txt(data: List[Any], path: Path) -> None:
     log_lines = []
     for item in data:
         try:
@@ -98,7 +100,7 @@ def _log_list_to_txt(data: List[Any], path: str) -> None:
         f.write('\n'.join(log_lines))
 
 
-def _log_dict_to_csv(data: Dict[Any, List[Any]], path: str) -> None:
+def _log_dict_to_csv(data: Dict[Any, List[Any]], path: Path) -> None:
     sanitized_dict = {}
     max_length = max(len(v) for v in data.values()) if data else 0
 
@@ -113,7 +115,7 @@ def _log_dict_to_csv(data: Dict[Any, List[Any]], path: str) -> None:
     df.to_csv(path, index=False)
 
 
-def _log_dataframe_to_xlsx(data: pd.DataFrame, path: str) -> None:
+def _log_dataframe_to_xlsx(data: pd.DataFrame, path: Path) -> None:
     writer = pd.ExcelWriter(path, engine='openpyxl')
     data.to_excel(writer, index=True, sheet_name='Data')
 
@@ -134,18 +136,18 @@ def _log_dataframe_to_xlsx(data: pd.DataFrame, path: str) -> None:
     writer.close()
 
 
-def _log_string_to_log(data: str, path: str) -> None:
+def _log_string_to_log(data: str, path: Path) -> None:
     with open(path, 'w', encoding='utf-8') as f:
         f.write(data.strip() + '\n')
 
 
-def _log_exception_to_log(exc: BaseException, path: str) -> None:
+def _log_exception_to_log(exc: BaseException, path: Path) -> None:
     with open(path, 'w', encoding='utf-8') as f:
         f.write("Exception occurred:\n")
         traceback.print_exception(type(exc), exc, exc.__traceback__, file=f)
 
 
-def _log_dict_to_json(data: Dict[Any, Any], path: str) -> None:
+def _log_dict_to_json(data: Dict[Any, Any], path: Path) -> None:
     with open(path, 'w', encoding='utf-8') as f:
         json.dump(data, f, indent=4, ensure_ascii=False)
 
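The net effect of the logger.py changes: custom_logger now accepts either a plain string or a pathlib.Path for save_directory, creating the directory on demand via make_fullpath(..., make=True), and the private writers take Path objects. A minimal usage sketch of the new signature, assuming dragon-ml-toolbox 2.1.0 is installed (the directory and log name below are hypothetical; output file names are timestamped by the function, and a dict of lists is routed to the CSV writer while a list, str, DataFrame, or exception goes to the other private writers):

    from pathlib import Path
    from ml_tools.logger import custom_logger

    # Illustrative call only: custom_logger traps its own exceptions
    # and reports failures via print rather than raising.
    custom_logger(
        data={"epoch": [1, 2, 3], "loss": [0.9, 0.5, 0.3]},
        save_directory=Path("outputs") / "logs",  # a str works too
        log_name="training run",  # sanitized via sanitize_filename
    )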
ml_tools/utilities.py CHANGED
@@ -2,7 +2,6 @@ import math
 import numpy as np
 import pandas as pd
 import polars as pl
-import os
 from pathlib import Path
 import re
 from typing import Literal, Union, Sequence, Optional, Any, Iterator, Tuple
@@ -12,6 +11,7 @@ from joblib.externals.loky.process_executor import TerminatedWorkerError
 
 # Keep track of available tools
 __all__ = [
+    "make_fullpath",
     "list_csv_paths",
     "list_files_by_extension",
     "load_dataframe",
@@ -28,27 +28,83 @@ __all__ = [
 ]
 
 
-def list_csv_paths(directory: str) -> dict[str, str]:
+def make_fullpath(
+    input_path: Union[str, Path],
+    make: bool = False,
+    verbose: bool = False
+) -> Path:
     """
-    Lists all `.csv` files in the specified directory and returns a mapping: filenames (without extensions) to their absolute paths.
+    Resolves a string or Path into an absolute Path.
+
+    - If the path exists, it is returned.
+    - If the path does not exist and `make=True`, it will:
+        - Create the file if the path has a suffix (i.e., is treated as a file)
+        - Create the directory if it has no suffix
+    - If `make=False` and the path does not exist, an error is raised.
+    - Optionally prints whether the resolved path is a file or directory.
 
     Parameters:
-        directory (str): Path to the directory containing `.csv` files.
+        input_path (str | Path): Path to resolve.
+        make (bool): If True, attempt to create file or directory.
+        verbose (bool): Print classification after resolution.
 
     Returns:
-        (dict[str, str]): Dictionary mapping {filename: filepath}.
+        Path: Resolved absolute path.
+
+    Raises:
+        ValueError: If the path doesn't exist and can't be created.
     """
-    dir_path = Path(directory).expanduser().resolve()
+    path = Path(input_path).expanduser()
+
+    is_file = path.suffix != ""
+
+    try:
+        resolved = path.resolve(strict=True)
+    except FileNotFoundError:
+        if not make:
+            raise ValueError(f"❌ Path does not exist: '{path}'")
+
+        try:
+            if is_file:
+                # Create parent directories first
+                path.parent.mkdir(parents=True, exist_ok=True)
+                path.touch(exist_ok=False)
+            else:
+                path.mkdir(parents=True, exist_ok=True)
+            resolved = path.resolve(strict=True)
+        except Exception as e:
+            raise ValueError(f"❌ Failed to create {'file' if is_file else 'directory'} '{path}': {e}")
+
+    if verbose:
+        if resolved.is_file():
+            print("📄 Path is a File")
+        elif resolved.is_dir():
+            print("📁 Path is a Directory")
+        else:
+            print("❓ Path exists but is neither file nor directory")
+
+    return resolved
+
 
-    if not dir_path.is_dir():
-        raise FileNotFoundError(f"Directory not found: {dir_path}")
+
+def list_csv_paths(directory: Union[str,Path]) -> dict[str, Path]:
+    """
+    Lists all `.csv` files in the specified directory and returns a mapping: filenames (without extensions) to their absolute paths.
+
+    Parameters:
+        directory (str | Path): Path to the directory containing `.csv` files.
+
+    Returns:
+        (dict[str, Path]): Dictionary mapping {filename: filepath}.
+    """
+    dir_path = make_fullpath(directory)
 
     csv_paths = list(dir_path.glob("*.csv"))
     if not csv_paths:
-        raise IOError(f"No CSV files found in directory: {dir_path}")
+        raise IOError(f"No CSV files found in directory: {dir_path.name}")
 
     # make a dictionary of paths and names
-    name_path_dict = {p.stem: str(p) for p in csv_paths}
+    name_path_dict = {p.stem: p for p in csv_paths}
 
     print("\n🗂️ CSV files found:")
     for name in name_path_dict.keys():
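Since every path-accepting helper in this release funnels through make_fullpath, its suffix-based file-versus-directory rule is worth illustrating. A sketch of the documented behavior, using a temporary directory so the example is self-contained:

    import tempfile
    from pathlib import Path

    from ml_tools.utilities import make_fullpath

    root = Path(tempfile.mkdtemp())

    # An existing path is resolved and returned as an absolute Path;
    # verbose=True prints its classification ("Path is a Directory").
    resolved_root = make_fullpath(root, verbose=True)

    # A missing path WITH a suffix is treated as a file: parent
    # directories are created first, then the file itself is touched.
    metrics_file = make_fullpath(root / "runs" / "metrics.csv", make=True)

    # A missing path WITHOUT a suffix is treated as a directory.
    checkpoints_dir = make_fullpath(root / "runs" / "checkpoints", make=True)

    # With make=False (the default), a missing path raises ValueError.
    try:
        make_fullpath(root / "missing")
    except ValueError as err:
        print(err)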
@@ -57,22 +113,19 @@ def list_csv_paths(directory: str) -> dict[str, str]:
     return name_path_dict
 
 
-def list_files_by_extension(directory: str, extension: str) -> dict[str, str]:
+def list_files_by_extension(directory: Union[str,Path], extension: str) -> dict[str, Path]:
     """
     Lists all files with the specified extension in the given directory and returns a mapping:
     filenames (without extensions) to their absolute paths.
 
     Parameters:
-        directory (str): Path to the directory to search in.
+        directory (str | Path): Path to the directory to search in.
         extension (str): File extension to search for (e.g., 'json', 'txt').
 
     Returns:
-        (dict[str, str]): Dictionary mapping {filename: filepath}.
+        (dict[str, Path]): Dictionary mapping {filename: filepath}.
     """
-    dir_path = Path(directory).expanduser().resolve()
-
-    if not dir_path.is_dir():
-        raise FileNotFoundError(f"Directory not found: {dir_path}")
+    dir_path = make_fullpath(directory)
 
     # Normalize the extension (remove leading dot if present)
     normalized_ext = extension.lstrip(".").lower()
@@ -82,7 +135,7 @@ def list_files_by_extension(directory: str, extension: str) -> dict[str, str]:
     if not matched_paths:
         raise IOError(f"No '.{normalized_ext}' files found in directory: {dir_path}")
 
-    name_path_dict = {p.stem: str(p) for p in matched_paths}
+    name_path_dict = {p.stem: p for p in matched_paths}
 
     print(f"\n📂 '{normalized_ext.upper()}' files found:")
     for name in name_path_dict:
@@ -91,18 +144,18 @@ def list_files_by_extension(directory: str, extension: str) -> dict[str, str]:
     return name_path_dict
 
 
-def load_dataframe(df_path: str) -> tuple[pd.DataFrame, str]:
+def load_dataframe(df_path: Union[str,Path]) -> tuple[pd.DataFrame, str]:
     """
     Load a CSV file into a pandas DataFrame and extract the base name (without extension) from the file path.
 
     Args:
-        df_path (str): The path to the CSV file.
+        df_path (str | Path): The path to the CSV file.
 
     Returns:
         Tuple ([pd.DataFrame, str]):
             A tuple containing the loaded pandas DataFrame and the base name of the file.
     """
-    path = Path(df_path).expanduser().resolve()
+    path = make_fullpath(df_path)
     df = pd.read_csv(path, encoding='utf-8')
     df_name = path.stem
     if df.empty:
@@ -111,12 +164,12 @@ def load_dataframe(df_path: str) -> tuple[pd.DataFrame, str]:
     return df, df_name
 
 
-def yield_dataframes_from_dir(datasets_dir: str):
+def yield_dataframes_from_dir(datasets_dir: Union[str,Path]):
     """
     Iterates over all CSV files in a given directory, loading each into a pandas DataFrame.
 
     Parameters:
-        datasets_dir (str):
+        datasets_dir (str | Path):
             The path to the directory containing `.csv` dataset files.
 
     Yields:
@@ -129,7 +182,8 @@ def yield_dataframes_from_dir(datasets_dir: str):
         - CSV files are read using UTF-8 encoding.
         - Output is streamed via a generator to support lazy loading of multiple datasets.
     """
-    for df_name, df_path in list_csv_paths(datasets_dir).items():
+    datasets_path = make_fullpath(datasets_dir)
+    for df_name, df_path in list_csv_paths(datasets_path).items():
         df, _ = load_dataframe(df_path)
         yield df, df_name
 
189
 
@@ -193,27 +247,27 @@ def merge_dataframes(
193
247
  return merged_df
194
248
 
195
249
 
196
- def save_dataframe(df: pd.DataFrame, save_dir: str, filename: str) -> None:
250
+ def save_dataframe(df: pd.DataFrame, save_dir: Union[str,Path], filename: str) -> None:
197
251
  """
198
252
  Save a pandas DataFrame to a CSV file.
199
253
 
200
254
  Parameters:
201
- df: pandas.DataFrame to save
202
- save_dir: str, directory where the CSV file will be saved.
203
- filename: str, CSV filename, extension will be added if missing.
255
+ df (pd.DataFrame): Dataframe to save.
256
+ save_dir (str | Path): Directory where the CSV file will be saved.
257
+ filename (str): CSV filename, extension will be added if missing.
204
258
  """
205
259
  if df.empty:
206
260
  print(f"⚠️ Attempting to save an empty DataFrame: '{filename}'. Process Skipped.")
207
261
  return
208
262
 
209
- os.makedirs(save_dir, exist_ok=True)
263
+ save_path = make_fullpath(save_dir, make=True)
210
264
 
211
265
  filename = sanitize_filename(filename)
212
266
 
213
267
  if not filename.endswith('.csv'):
214
268
  filename += '.csv'
215
269
 
216
- output_path = os.path.join(save_dir, filename)
270
+ output_path = save_path / filename
217
271
 
218
272
  df.to_csv(output_path, index=False, encoding='utf-8')
219
273
  print(f"✅ Saved dataset: '{filename}' with shape: {df.shape}")
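Because save_dataframe now calls make_fullpath(save_dir, make=True), the target directory no longer has to exist beforehand. A sketch (the export directory is hypothetical):

    from pathlib import Path

    import pandas as pd

    from ml_tools.utilities import save_dataframe

    df = pd.DataFrame({"feature": [1.0, 2.0], "target": [0, 1]})

    # The directory is created on demand; the '.csv' suffix and
    # filename sanitization are applied by the function itself.
    save_dataframe(df, save_dir=Path("exports") / "datasets", filename="toy set")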
@@ -392,24 +446,24 @@ def threshold_binary_values_batch(
     return np.hstack([cont_part, bin_part])
 
 
-def serialize_object(obj: Any, save_dir: str, filename: str, verbose: bool=True, raise_on_error: bool=False) -> Optional[str]:
+def serialize_object(obj: Any, save_dir: Union[str,Path], filename: str, verbose: bool=True, raise_on_error: bool=False) -> Optional[str]:
     """
     Serializes a Python object using joblib; suitable for Python built-ins, numpy, and pandas.
 
     Parameters:
         obj (Any) : The Python object to serialize.
-        save_dir (str) : Directory path where the serialized object will be saved.
+        save_dir (str | Path) : Directory path where the serialized object will be saved.
         filename (str) : Name for the output file, extension will be appended if needed.
 
     Returns:
         (str | None) : The full file path where the object was saved if successful; otherwise, None.
     """
     try:
-        os.makedirs(save_dir, exist_ok=True)
+        save_path = make_fullpath(save_dir, make=True)
         sanitized_name = sanitize_filename(filename)
         if not sanitized_name.endswith('.joblib'):
             sanitized_name = sanitized_name + ".joblib"
-        full_path = os.path.join(save_dir, sanitized_name)
+        full_path = save_path / sanitized_name
         joblib.dump(obj, full_path)
     except (IOError, OSError, TypeError, TerminatedWorkerError) as e:
         message = f"❌ Failed to serialize object of type '{type(obj)}': {e}"
@@ -424,23 +478,22 @@ def serialize_object(obj: Any, save_dir: str, filename: str, verbose: bool=True,
     return full_path
 
 
-def deserialize_object(filepath: str, verbose: bool=True, raise_on_error: bool=True) -> Optional[Any]:
+def deserialize_object(filepath: Union[str,Path], verbose: bool=True, raise_on_error: bool=True) -> Optional[Any]:
     """
     Loads a serialized object from a .joblib file.
 
     Parameters:
-        filepath (str): Full path to the serialized .joblib file.
+        filepath (str | Path): Full path to the serialized .joblib file.
 
     Returns:
         (Any | None): The deserialized Python object, or None if loading fails.
     """
-    if not os.path.exists(filepath):
-        print(f"❌ File does not exist: {filepath}")
-        return None
+    true_filepath = make_fullpath(filepath)
+
     try:
-        obj = joblib.load(filepath)
+        obj = joblib.load(true_filepath)
     except (IOError, OSError, EOFError, TypeError, ValueError) as e:
-        message = f"❌ Failed to deserialize object from '{filepath}': {e}"
+        message = f"❌ Failed to deserialize object from '{true_filepath}': {e}"
         if raise_on_error:
             raise Exception(message)
         else:
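serialize_object and deserialize_object form a round trip, and with this release both ends accept str or Path; a missing input file now surfaces as make_fullpath's ValueError instead of a printed message and a silent None. A round-trip sketch (the artifact directory is hypothetical):

    from pathlib import Path
    from ml_tools.utilities import deserialize_object, serialize_object

    params = {"n_estimators": 200, "max_depth": 8}

    # The directory is created if needed and '.joblib' is appended.
    saved_path = serialize_object(params, save_dir=Path("artifacts"), filename="params")

    restored = deserialize_object(saved_path)
    assert restored == params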
@@ -453,7 +506,7 @@ def deserialize_object(filepath: str, verbose: bool=True, raise_on_error: bool=T
 
 
 def distribute_datasets_by_target(
-    df_or_path: Union[pd.DataFrame, str],
+    df_or_path: Union[pd.DataFrame, str, Path],
     target_columns: list[str],
     verbose: bool = False
 ) -> Iterator[Tuple[str, pd.DataFrame]]:
@@ -463,7 +516,7 @@
 
 
     Parameters
-    df_or_path : [pd.DataFrame | str]
+    df_or_path : [pd.DataFrame | str | Path]
         Dataframe or path to Dataframe with all feature and target columns ready to split and train a model.
     target_columns : List[str]
         List of target column names to generate per-target DataFrames.
@@ -476,9 +529,10 @@
         * Target name.
         * Pandas DataFrame.
     """
-    # Validate path
-    if isinstance(df_or_path, str):
-        df, _ = load_dataframe(df_or_path)
+    # Validate path or dataframe
+    if isinstance(df_or_path, str) or isinstance(df_or_path, Path):
+        df_path = make_fullpath(df_or_path)
+        df, _ = load_dataframe(df_path)
     else:
         df = df_or_path
 
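Finally, distribute_datasets_by_target accepts a Path alongside a DataFrame or string, resolving it before delegating to load_dataframe. Sketch (file and column names are hypothetical):

    from pathlib import Path
    from ml_tools.utilities import distribute_datasets_by_target

    dataset_path = Path("data") / "experiments.csv"

    # Yields one (target_name, DataFrame) pair per requested target column.
    for target_name, frame in distribute_datasets_by_target(
        dataset_path, target_columns=["target_a", "target_b"]
    ):
        print(target_name, frame.shape)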
dragon_ml_toolbox-2.0.0.dist-info/RECORD DELETED
@@ -1,20 +0,0 @@
-dragon_ml_toolbox-2.0.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
-dragon_ml_toolbox-2.0.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=6cfpIeQ6D4Mcs10nkogQrkVyq1T7i2qXjjNHFoUMOyE,1892
-ml_tools/MICE_imputation.py,sha256=wIfl8I3SyHUett-0vizaCiv0y_q43-zij8VczsbEIOI,11088
-ml_tools/PSO_optimization.py,sha256=bNiuKqyVoShGM4VBx4exJ8jjVVxQjlunkVpzaMb7fwY,20850
-ml_tools/VIF_factor.py,sha256=HEBsLJy_qSDaPw1Btha5B7omxN4wjJXg-sqoetCjCJw,10016
-ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ml_tools/_particle_swarm_optimization.py,sha256=b_eNNkA89Y40hj76KauivT8KLScH1B9wF2IXptOqkOw,22220
-ml_tools/data_exploration.py,sha256=NfPuN57wL5CXBnRyvIayxaYMe_ZKieHT3ZIcmtO_XIQ,20115
-ml_tools/datasetmaster.py,sha256=EFUEX-tqq94Ak1rXXYR-XaX85olrxvF2EuytdzUK7y0,29131
-ml_tools/ensemble_learning.py,sha256=v_btCkVthuEl3Pu1WipASvU5lGAVbXxxKEMq3boF-HI,37305
-ml_tools/handle_excel.py,sha256=NrCOWSENgb1HdqId_QOdPTjBUIJPePI9a2pnmmBd3lw,12613
-ml_tools/logger.py,sha256=WI7wiGmmALCQPl0AIauw_mPzFNTbaQf0v9J8pojvHUg,4708
-ml_tools/pytorch_models.py,sha256=bpWZsrSwCvHJQkR6UfoPpElsMv9AvmiNErNHC8NYB_I,10132
-ml_tools/trainer.py,sha256=WAZ4EdrZuTOAnGXRWV3XcLNce4s7EKGf2-qchLC08Ik,15702
-ml_tools/utilities.py,sha256=_7RDgk9uBxPuHJRVOOFYFUOZyJ1o9QILnxYsKdGCfLQ,16772
-ml_tools/vision_helpers.py,sha256=idQ-Ugp1IdsvwXiYyhYa9G3rTRTm37YRpkQDLEpANHM,7701
-dragon_ml_toolbox-2.0.0.dist-info/METADATA,sha256=7MHJGUXvWThm8-Rv9NZyogTQKBBMH4x0EXLsHel9Dns,2974
-dragon_ml_toolbox-2.0.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-dragon_ml_toolbox-2.0.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
-dragon_ml_toolbox-2.0.0.dist-info/RECORD,,