dragon-ml-toolbox 4.3.0__py3-none-any.whl → 4.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dragon-ml-toolbox has been flagged as potentially problematic.

dragon_ml_toolbox-4.3.0.dist-info/METADATA → dragon_ml_toolbox-4.5.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 4.3.0
+Version: 4.5.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
dragon_ml_toolbox-4.3.0.dist-info/RECORD → dragon_ml_toolbox-4.5.0.dist-info/RECORD CHANGED
@@ -1,22 +1,22 @@
-dragon_ml_toolbox-4.3.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
-dragon_ml_toolbox-4.3.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837
-ml_tools/ETL_engineering.py,sha256=P7HN_e3vfmrOqDDK-IenyRSFQPr0N3V9e2gN75QFVWs,39372
+dragon_ml_toolbox-4.5.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
+dragon_ml_toolbox-4.5.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837
+ml_tools/ETL_engineering.py,sha256=4wwZXi9_U7xfCY70jGBaKniOeZ0m75ppxWpQBd_DmLc,39369
 ml_tools/GUI_tools.py,sha256=n4ZZ5kEjwK5rkOCFJE41HeLFfjhpJVLUSzk9Kd9Kr_0,45410
 ml_tools/MICE_imputation.py,sha256=b6ZTs8RedXFifOpuMCzr68xM16mCBVh1Ua6kcGfiVtg,11462
 ml_tools/ML_callbacks.py,sha256=0a-Rbr0Xp_B1FNopOKBBmuJ4MqazS5JgDiT7wx1dHvE,13161
 ml_tools/ML_evaluation.py,sha256=4dVqe6JF1Ukmk1sAcY8E5EG1oB1_oy2HXE5OT-pZwCs,10273
 ml_tools/ML_inference.py,sha256=Fh-X2UQn3AznWBjf-7iPSxwE-EzkGQm1VEIRUAkURmE,5336
 ml_tools/ML_trainer.py,sha256=dJjMfCEEM07Txy9KEH-2srZ3CZUa4lFWTJhpNWQ4Ndk,14974
-ml_tools/PSO_optimization.py,sha256=xtnPute5pkS_w-VvqOBgRLgke09mjfacGC2m9DiipHE,27626
+ml_tools/PSO_optimization.py,sha256=9bplCNOSe2Ozcz5yQRkbih7geuDO9UJ6dJTMSJJ8zVk,27965
 ml_tools/RNN_forecast.py,sha256=2CyjBLSYYc3xLHxwLXUmP5Qv8AmV1OB_EndETNX1IBk,1956
 ml_tools/SQL.py,sha256=9zzS6AFEJM9aj6nE31hDe8S9TqLonk-J1amwZoiHNbk,10468
 ml_tools/VIF_factor.py,sha256=2nUMupfUoogf8o6ghoFZk_OwWhFXU0R3C9Gj0HOlI14,10415
-ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ml_tools/__init__.py,sha256=q0y9faQ6e17XCQ7eUiCZ1FJ4Bg5EQqLjZ9f_l5REUUY,41
 ml_tools/_logger.py,sha256=TpgYguxO-CWYqqgLW0tqFjtwZ58PE_W2OCfWNGZr0n0,1175
 ml_tools/_pytorch_models.py,sha256=ewPPsTHgmRPzMMWwObZOdH1vxm2Ij2VWZP38NC6zSH4,10135
 ml_tools/_script_info.py,sha256=21r83LV3RubsNZ_RTEUON6RbDf7Mh4_udweNcvdF_Fk,212
-ml_tools/custom_logger.py,sha256=a3ywSCQT7j5ypR-usnKh2l861d_aVJ93ZRVqxrHsBBw,4112
-ml_tools/data_exploration.py,sha256=T4nO9YSDGvrpom7JELtoQTyg7XTEmvQz-jG0KKxqTRk,23467
+ml_tools/custom_logger.py,sha256=njM_0XPbQ1S-x5LeSQAaTo2if-XVOR_pQSGg4EDeiTU,4603
+ml_tools/data_exploration.py,sha256=qc_Oolxco2x9IhlYu5zPIuVBGiBw65HnypuGm8cQOOM,23677
 ml_tools/datasetmaster.py,sha256=_tNC2v98eCQGr3nMW_EFs83TRgRme8Uc7ttg1vosmQU,30106
 ml_tools/ensemble_inference.py,sha256=0SNX3YAz5bpvtwYmqEwqyWeIJP2Pb-v-bemENRSO7qg,9426
 ml_tools/ensemble_learning.py,sha256=Zi1oy6G2FWnTI5hBwjlexwF3JKALFS2FN6F8HAlVi_s,35391
@@ -24,7 +24,7 @@ ml_tools/handle_excel.py,sha256=J9iwIqMZemoxK49J5osSwp9Ge0h9YTKyYGbOm53hcno,1300
 ml_tools/keys.py,sha256=kK9UF-hek2VcPGFILCKl5geoN6flmMOu7IzhdEA6z5Y,1068
 ml_tools/path_manager.py,sha256=Z8e7w3MPqQaN8xmTnKuXZS6CIW59BFwwqGhGc00sdp4,13692
 ml_tools/utilities.py,sha256=mz-M351DzxWxnYVcLX-7ZQ6c-RGoCV9g4VTS9Qif2Es,18348
-dragon_ml_toolbox-4.3.0.dist-info/METADATA,sha256=7aZO_5P8SDx4tPFTtb3MTAaRgf_vbcOEURaxpT3MGK8,6572
-dragon_ml_toolbox-4.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-dragon_ml_toolbox-4.3.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
-dragon_ml_toolbox-4.3.0.dist-info/RECORD,,
+dragon_ml_toolbox-4.5.0.dist-info/METADATA,sha256=PzpYHROSr85CBbNnCcS-XInzpOFhAyXbPZ5YkLaYbps,6572
+dragon_ml_toolbox-4.5.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dragon_ml_toolbox-4.5.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+dragon_ml_toolbox-4.5.0.dist-info/RECORD,,
ml_tools/ETL_engineering.py CHANGED
@@ -569,7 +569,7 @@ class NumberExtractor:
         self,
         regex_pattern: str = r"(\d+\.?\d*)",
         dtype: Literal["float", "int"] = "float",
-        round_digits: Optional[int] = None,
+        round_digits: Optional[int] = 2,
     ):
         # --- Validation ---
         if not isinstance(regex_pattern, str):
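This is a silent behavioral change rather than a cosmetic one: with the new default, NumberExtractor rounds extracted floats to two decimal places unless told otherwise (the rounding semantics of `round_digits` are implied by the parameter name, not shown in this hunk). A sketch of how a 4.3.0 caller would opt back out:

    from ml_tools.ETL_engineering import NumberExtractor

    extractor = NumberExtractor()                        # 4.5.0: round_digits=2
    extractor_full = NumberExtractor(round_digits=None)  # restores the 4.3.0 default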
ml_tools/PSO_optimization.py CHANGED
@@ -2,7 +2,7 @@ import numpy as np
 from pathlib import Path
 import xgboost as xgb
 import lightgbm as lgb
-from typing import Literal, Union, Tuple, Dict, Optional
+from typing import Literal, Union, Tuple, Dict, Optional, Any
 import pandas as pd
 from copy import deepcopy
 from .utilities import (
@@ -25,6 +25,7 @@ from contextlib import nullcontext
 __all__ = [
     "ObjectiveFunction",
     "multiple_objective_functions_from_dir",
+    "parse_lower_upper_bounds",
     "run_pso",
     "plot_optimal_feature_distributions"
 ]
@@ -169,6 +170,18 @@ def multiple_objective_functions_from_dir(directory: Union[str,Path], add_noise:
     return objective_functions, objective_function_names


+def parse_lower_upper_bounds(source: dict[str,tuple[Any,Any]]):
+    """
+    Parse lower and upper boundaries, returning 2 lists:
+
+    `lower_bounds`, `upper_bounds`
+    """
+    lower = [low[0] for low in source.values()]
+    upper = [up[1] for up in source.values()]
+
+    return lower, upper
+
+
 def _set_boundaries(lower_boundaries: list[float], upper_boundaries: list[float]):
     assert len(lower_boundaries) == len(upper_boundaries), "Lower and upper boundaries must have the same length."
     assert len(lower_boundaries) >= 1, "At least one boundary pair is required."
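The new helper unzips a {feature: (low, high)} mapping into the two parallel lists that `_set_boundaries` validates just below it. A minimal usage sketch (the bounds dict is illustrative; keeping the two lists aligned relies on dicts preserving insertion order):

    from ml_tools.PSO_optimization import parse_lower_upper_bounds

    bounds = {"temperature": (20.0, 80.0), "pressure": (1.0, 5.0)}
    lower_bounds, upper_bounds = parse_lower_upper_bounds(bounds)
    # lower_bounds -> [20.0, 1.0]
    # upper_bounds -> [80.0, 5.0]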
ml_tools/__init__.py CHANGED
@@ -0,0 +1 @@
+from .custom_logger import custom_logger
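This one line is why ml_tools/__init__.py grows from 0 to 41 bytes in the RECORD above (the old hash, 47DEQpj8HBSa…, is the SHA-256 of an empty file). The practical effect is that custom_logger is now re-exported at package level:

    from ml_tools import custom_logger   # works as of 4.5.0
    # 4.3.0 required: from ml_tools.custom_logger import custom_logger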
ml_tools/custom_logger.py CHANGED
@@ -1,9 +1,9 @@
 from pathlib import Path
 from datetime import datetime
 from typing import Union, List, Dict, Any
-import pandas as pd
 import traceback
 import json
+import csv
 from .path_manager import sanitize_filename, make_fullpath
 from ._script_info import _script_info
 from ._logger import _LOGGER
@@ -18,7 +18,6 @@ def custom_logger(
     data: Union[
         List[Any],
         Dict[Any, Any],
-        pd.DataFrame,
         str,
         BaseException
     ],
@@ -75,7 +74,7 @@ def custom_logger(
         _log_exception_to_log(data, base_path.with_suffix(".log"))

     else:
-        raise ValueError("Unsupported data type. Must be list, dict, DataFrame, str, or BaseException.")
+        raise ValueError("Unsupported data type. Must be list, dict, str, or BaseException.")

     _LOGGER.info(f"🗄️ Log saved to: '{base_path}'")
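Removing pd.DataFrame from the accepted union is a breaking change for 4.3.0 callers that logged DataFrames directly. A hedged migration sketch, based on the Dict[Any, List[Any]] shape that _log_dict_to_csv (below) expects from the dict branch:

    import pandas as pd

    df = pd.DataFrame({"epoch": [1, 2], "loss": [0.9, 0.5]})
    column_data = df.to_dict(orient="list")  # {'epoch': [1, 2], 'loss': [0.9, 0.5]}
    # column_data can now be passed as custom_logger's `data` argument.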
@@ -106,8 +105,19 @@ def _log_dict_to_csv(data: Dict[Any, List[Any]], path: Path) -> None:
         padded_value = value + [None] * (max_length - len(value))
         sanitized_dict[sanitized_key] = padded_value

-    df = pd.DataFrame(sanitized_dict)
-    df.to_csv(path, index=False)
+    # The `newline=''` argument is important to prevent extra blank rows
+    with open(path, 'w', newline='', encoding='utf-8') as csv_file:
+        writer = csv.writer(csv_file)
+
+        # 1. Write the header row from the sanitized dictionary keys
+        header = list(sanitized_dict.keys())
+        writer.writerow(header)
+
+        # 2. Transpose columns to rows and write them
+        # zip(*sanitized_dict.values()) elegantly converts the column data
+        # (lists in the dict) into row-by-row tuples.
+        rows_to_write = zip(*sanitized_dict.values())
+        writer.writerows(rows_to_write)


 def _log_string_to_log(data: str, path: Path) -> None:
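The stdlib rewrite drops pandas from the logging path entirely; the only non-obvious step is the transpose. A self-contained sketch of the idiom against an in-memory buffer:

    import csv, io

    columns = {"epoch": [1, 2, 3], "loss": [0.9, 0.5, 0.3]}

    buffer = io.StringIO()
    writer = csv.writer(buffer)
    writer.writerow(columns.keys())           # header: epoch,loss
    writer.writerows(zip(*columns.values()))  # rows: (1, 0.9), (2, 0.5), (3, 0.3)
    print(buffer.getvalue())

Note that zip stops at the shortest column, which is exactly why _log_dict_to_csv pads every list to max_length before writing.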
ml_tools/data_exploration.py CHANGED
@@ -126,7 +126,7 @@ def drop_rows_with_missing_data(df: pd.DataFrame, targets: Optional[list[str]],
     # Stage 1: Drop rows with all target columns missing
     if targets is not None:
         # validate targets
-        valid_targets = [target for target in targets if target in df_clean.columns]
+        valid_targets = _validate_columns(df_clean, targets)
         target_na = df_clean[valid_targets].isnull().all(axis=1)
         if target_na.any():
             print(f"🧹 Dropping {target_na.sum()} rows with all target columns missing.")
@@ -134,10 +134,10 @@ def drop_rows_with_missing_data(df: pd.DataFrame, targets: Optional[list[str]],
         else:
             print("✅ No rows with all targets missing.")
     else:
-        targets = []
+        valid_targets = []

     # Stage 2: Drop rows based on feature column missing values
-    feature_cols = [col for col in df_clean.columns if col not in targets]
+    feature_cols = [col for col in df_clean.columns if col not in valid_targets]
     if feature_cols:
         feature_na_frac = df_clean[feature_cols].isnull().mean(axis=1)
         rows_to_drop = feature_na_frac[feature_na_frac > threshold].index
@@ -238,8 +238,9 @@ def split_features_targets(df: pd.DataFrame, targets: list[str]):
     - Shape of the features dataframe.
     - Shape of the targets dataframe.
     """
-    df_targets = df[targets]
-    df_features = df.drop(columns=targets)
+    valid_targets = _validate_columns(df, targets)
+    df_targets = df[valid_targets]
+    df_features = df.drop(columns=valid_targets)
     print(f"Original shape: {df.shape}\nFeatures shape: {df_features.shape}\nTargets shape: {df_targets.shape}")
     return df_features, df_targets
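Behaviorally, split_features_targets no longer raises a KeyError on a misspelled target name; the invalid name is filtered out and the column stays in the features frame. An illustrative call (data invented for the example):

    import pandas as pd
    from ml_tools.data_exploration import split_features_targets

    df = pd.DataFrame({"y": [1, 2], "x": [3, 4]})
    features, targets_df = split_features_targets(df, ["y", "not_a_column"])
    # 4.3.0: KeyError on "not_a_column"
    # 4.5.0: targets_df contains only "y"; the bad name is dropped silently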
@@ -347,7 +348,7 @@ def plot_correlation_heatmap(df: pd.DataFrame,
         full_path = save_path / plot_title

         plt.savefig(full_path, bbox_inches="tight", format='svg')
-        print(f"Saved correlation heatmap: '{plot_title}.svg'")
+        print(f"Saved correlation heatmap: '{plot_title}'")

     plt.show()
     plt.close()
@@ -644,5 +645,10 @@ def standardize_percentages(
     return df_copy


+def _validate_columns(df: pd.DataFrame, columns: list[str]):
+    valid_columns = [column for column in columns if column in df.columns]
+    return valid_columns
+
+
 def info():
     _script_info(__all__)
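The new private helper centralizes the column filter that drop_rows_with_missing_data and split_features_targets now share. The trade-off is that unknown names are dropped silently rather than raising, so typos only surface as missing columns downstream:

    import pandas as pd

    df = pd.DataFrame({"a": [1], "b": [2]})
    _validate_columns(df, ["a", "missing"])   # -> ["a"]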