dragon-ml-toolbox 10.13.0__tar.gz → 10.15.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of dragon-ml-toolbox has been flagged as potentially problematic.
Files changed (41):
  1. {dragon_ml_toolbox-10.13.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-10.15.0}/PKG-INFO +1 -1
  2. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
  3. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/ETL_cleaning.py +28 -16
  4. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/ML_optimization.py +103 -3
  5. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/pyproject.toml +1 -1
  6. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/LICENSE +0 -0
  7. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/LICENSE-THIRD-PARTY.md +0 -0
  8. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/README.md +0 -0
  9. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
  10. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
  11. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
  12. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
  13. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/ETL_engineering.py +0 -0
  14. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/GUI_tools.py +0 -0
  15. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/MICE_imputation.py +0 -0
  16. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/ML_callbacks.py +0 -0
  17. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/ML_datasetmaster.py +0 -0
  18. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/ML_evaluation.py +0 -0
  19. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/ML_evaluation_multi.py +0 -0
  20. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/ML_inference.py +0 -0
  21. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/ML_models.py +0 -0
  22. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/ML_scaler.py +0 -0
  23. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/ML_trainer.py +0 -0
  24. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/PSO_optimization.py +0 -0
  25. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/RNN_forecast.py +0 -0
  26. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/SQL.py +0 -0
  27. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/VIF_factor.py +0 -0
  28. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/__init__.py +0 -0
  29. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/_logger.py +0 -0
  30. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/_script_info.py +0 -0
  31. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/custom_logger.py +0 -0
  32. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/data_exploration.py +0 -0
  33. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/ensemble_evaluation.py +0 -0
  34. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/ensemble_inference.py +0 -0
  35. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/ensemble_learning.py +0 -0
  36. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/handle_excel.py +0 -0
  37. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/keys.py +0 -0
  38. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/optimization_tools.py +0 -0
  39. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/path_manager.py +0 -0
  40. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/ml_tools/utilities.py +0 -0
  41. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.15.0}/setup.cfg +0 -0
PKG-INFO (the same one-line version bump applies to both the top-level copy and dragon_ml_toolbox.egg-info/PKG-INFO):

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 10.13.0
+Version: 10.15.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
ml_tools/ETL_cleaning.py:

@@ -96,7 +96,7 @@ def save_unique_values(csv_path: Union[str, Path], output_dir: Union[str, Path],
 
 
 ########## Basic df cleaners #############
-def _cleaner_core(df_in: pl.DataFrame) -> pl.DataFrame:
+def _cleaner_core(df_in: pl.DataFrame, all_lowercase: bool) -> pl.DataFrame:
     # Cleaning rules
     cleaning_rules = {
         # 1. Comprehensive Punctuation & Symbol Normalization
@@ -128,7 +128,7 @@ def _cleaner_core(df_in: pl.DataFrame) -> pl.DataFrame:
         # Punctuation
         '》': '>', '《': '<', '：': ':', '。': '.', '；': ';', '【': '[', '】': ']',
         '（': '(', '）': ')', '？': '?', '！': '!', '～': '~', '＠': '@', '＃': '#', '＋': '+', '－': '-',
-        '＄': '$', '％': '%', '＾': '^', '＆': '&', '＊': '*', '＼': '-', '｜': '|', '≈':'=',
+        '＄': '$', '％': '%', '＾': '^', '＆': '&', '＊': '*', '＼': '-', '｜': '|', '≈':'=', '·': '-',
 
         # Commas (avoid commas in entries)
         '，': ';',
@@ -159,6 +159,9 @@ def _cleaner_core(df_in: pl.DataFrame) -> pl.DataFrame:
         r'!{2,}': '!', # Replace two or more exclamation marks with a single one
         r';{2,}': ';',
         r'-{2,}': '-',
+        r'/{2,}': '/',
+        r'%{2,}': '%',
+        r'&{2,}': '&',
 
         # 2. Internal Whitespace Consolidation
         # Collapse any sequence of whitespace chars (including non-breaking spaces) to a single space
@@ -170,7 +173,7 @@ def _cleaner_core(df_in: pl.DataFrame) -> pl.DataFrame:
 
         # 4. Textual Null Standardization (New Step)
         # Convert common null-like text to actual nulls.
-        r'^(N/A|无|NA|NULL|NONE|NIL|-|\.|;)$': None,
+        r'^(N/A|无|NA|NULL|NONE|NIL|-|\.|;|/|%|&)$': None,
 
         # 5. Final Nullification of Empty Strings
         # After all cleaning, if a string is now empty, convert it to a null
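Taken together, the new rules first collapse runs of `/`, `%`, and `&` into a single character, and the widened null pattern then converts a cell consisting of only one such character into a true null. A minimal, self-contained Polars sketch of that behavior (this does not use the package's own cleaner class, and the column name is hypothetical):

```python
import polars as pl

df = pl.DataFrame({"col": ["a//b", "50%%", "x&&&y", "/", "%", "&", "ok"]})

cleaned = df.with_columns(
    pl.col("col")
    .str.replace_all(r"/{2,}", "/")     # collapse repeated slashes
    .str.replace_all(r"%{2,}", "%")     # collapse repeated percent signs
    .str.replace_all(r"&{2,}", "&")     # collapse repeated ampersands
    .str.replace_all(r"^(/|%|&)$", "")  # a lone /, % or & becomes an empty string
).with_columns(
    # empty strings become true nulls, mirroring the final nullification step
    pl.when(pl.col("col") == "").then(None).otherwise(pl.col("col")).alias("col")
)
print(cleaned["col"].to_list())  # ['a/b', '50%', 'x&y', None, None, None, 'ok']
```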
@@ -191,9 +194,13 @@ def _cleaner_core(df_in: pl.DataFrame) -> pl.DataFrame:
         df_cleaned = df_cleaner.clean(df_in, clone_df=False) # Use clone_df=False for efficiency
 
         # apply lowercase to all string columns
-        df_final = df_cleaned.with_columns(
-            pl.col(pl.String).str.to_lowercase()
-        )
+        if all_lowercase:
+            df_final = df_cleaned.with_columns(
+                pl.col(pl.String).str.to_lowercase()
+            )
+        else:
+            df_final = df_cleaned
+
     except Exception as e:
         _LOGGER.error(f"An error occurred during the cleaning process.")
         raise e
@@ -211,7 +218,7 @@ def _path_manager(path_in: Union[str,Path], path_out: Union[str,Path]):
     return input_path, output_path
 
 
-def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path]):
+def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path], all_lowercase: bool=True):
     """
     Performs a comprehensive, standardized cleaning on all columns of a CSV file.
 
@@ -221,13 +228,16 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
     - Stripping any leading or trailing whitespace.
     - Converting common textual representations of null (e.g., "N/A", "NULL") to true null values.
     - Converting strings that become empty after cleaning into true null values.
-    - Normalizing all text to lowercase.
+    - Normalizing all text to lowercase (Optional).
 
     Args:
-        input_filepath (Union[str, Path]):
+        input_filepath (str | Path):
             The path to the source CSV file to be cleaned.
-        output_filepath (Union[str, Path, None], optional):
+        output_filepath (str | Path):
             The path to save the cleaned CSV file.
+        all_lowercase (bool):
+            Whether to normalize all text to lowercase.
+
     """
     # Handle paths
     input_path, output_path = _path_manager(path_in=input_filepath, path_out=output_filepath)
@@ -236,7 +246,7 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
     df, _ = load_dataframe(df_path=input_path, kind="polars", all_strings=True)
 
     # CLEAN
-    df_final = _cleaner_core(df)
+    df_final = _cleaner_core(df_in=df, all_lowercase=all_lowercase)
 
     # Save cleaned dataframe
     save_dataframe(df=df_final, save_dir=output_path.parent, filename=output_path.name)
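A short usage sketch of the updated signature (file paths are hypothetical). The flag defaults to `True`, so existing callers keep the old lowercasing behavior:

```python
from ml_tools.ETL_cleaning import basic_clean

# Default: identical to the previous behavior (all text is lowercased).
basic_clean("data/raw.csv", "data/clean.csv")

# New in this release: keep the original casing, e.g. for IDs or chemical formulas.
basic_clean("data/raw.csv", "data/clean_cased.csv", all_lowercase=False)
```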
@@ -245,7 +255,7 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
 
 
 def basic_clean_drop(input_filepath: Union[str,Path], output_filepath: Union[str,Path], log_directory: Union[str,Path], targets: list[str],
-                     skip_targets: bool=False, threshold: float=0.8):
+                     skip_targets: bool=False, threshold: float=0.8, all_lowercase: bool=True):
     """
     Performs standardized cleaning followed by iterative removal of rows and
     columns with excessive missing data.
@@ -262,12 +272,12 @@ def basic_clean_drop(input_filepath: Union[str,Path], output_filepath: Union[str
     dropping process are saved to the specified log directory.
 
     Args:
-        input_filepath (str, Path):
+        input_filepath (str | Path):
             The path to the source CSV file to be cleaned.
-        output_filepath (str, Path):
+        output_filepath (str | Path):
             The path to save the fully cleaned CSV file after cleaning
             and missing-data-based pruning.
-        log_directory (str, Path):
+        log_directory (str | Path):
             Path to the directory where missing data reports will be stored.
         targets (list[str]):
             A list of column names to be treated as target variables.
@@ -279,6 +289,8 @@ def basic_clean_drop(input_filepath: Union[str,Path], output_filepath: Union[str
             The proportion of missing data required to drop a row or column.
             For example, 0.8 means a row/column will be dropped if 80% or more
             of its data is missing.
+        all_lowercase (bool):
+            Whether to normalize all text to lowercase.
     """
     # handle log path
     log_path = make_fullpath(log_directory, make=True, enforce="directory")
@@ -290,7 +302,7 @@ def basic_clean_drop(input_filepath: Union[str,Path], output_filepath: Union[str
     df, _ = load_dataframe(df_path=input_path, kind="polars", all_strings=True)
 
     # CLEAN
-    df_cleaned = _cleaner_core(df)
+    df_cleaned = _cleaner_core(df_in=df, all_lowercase=all_lowercase)
 
     # switch to pandas
     df_cleaned_pandas = df_cleaned.to_pandas()
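`basic_clean_drop` gains the same flag, forwarded to `_cleaner_core` before the pruning step. A hedged usage sketch (paths and target names are hypothetical):

```python
from ml_tools.ETL_cleaning import basic_clean_drop

# Clean, then iteratively drop rows/columns that are >= 80% missing;
# missing-data reports are written to the log directory.
basic_clean_drop(
    input_filepath="data/raw.csv",
    output_filepath="data/clean_pruned.csv",
    log_directory="logs/missing_data",
    targets=["yield", "purity"],
    threshold=0.8,
    all_lowercase=False,  # new flag in this release
)
```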
ml_tools/ML_optimization.py:

@@ -20,12 +20,112 @@ from .SQL import DatabaseManager
 from .optimization_tools import _save_result
 from .utilities import threshold_binary_values, save_dataframe
 
+
 __all__ = [
+    "MLOptimizer",
     "create_pytorch_problem",
     "run_optimization"
 ]
 
 
+class MLOptimizer:
+    """
+    A wrapper class for setting up and running EvoTorch optimization tasks.
+
+    This class combines the functionality of the `create_pytorch_problem` and
+    `run_optimization` functions into a single, streamlined workflow.
+
+    The SNES and CEM algorithms do not accept bounds; the given bounds will be used as an initial starting point.
+
+    Example:
+        >>> # 1. Initialize the optimizer with model and search parameters
+        >>> optimizer = MLOptimizer(
+        ...     inference_handler=my_handler,
+        ...     bounds=(lower_bounds, upper_bounds),
+        ...     number_binary_features=2,
+        ...     task="max",
+        ...     algorithm="Genetic"
+        ... )
+        >>> # 2. Run the optimization and save the results
+        >>> best_result = optimizer.run(
+        ...     num_generations=100,
+        ...     target_name="my_target",
+        ...     feature_names=my_feature_names,
+        ...     save_dir="/path/to/results",
+        ...     save_format="csv"
+        ... )
+    """
+    def __init__(self,
+                 inference_handler: PyTorchInferenceHandler,
+                 bounds: Tuple[List[float], List[float]],
+                 number_binary_features: int,
+                 task: Literal["min", "max"],
+                 algorithm: Literal["SNES", "CEM", "Genetic"] = "Genetic",
+                 population_size: int = 200,
+                 **searcher_kwargs):
+        """
+        Initializes the optimizer by creating the EvoTorch problem and searcher.
+
+        Args:
+            inference_handler (PyTorchInferenceHandler): An initialized inference handler containing the model and weights.
+            bounds (tuple[list[float], list[float]]): A tuple containing the lower and upper bounds for the solution features.
+            number_binary_features (int): Number of binary features located at the END of the feature vector.
+            task (str): The optimization goal, either "min" or "max".
+            algorithm (str): The search algorithm to use ("SNES", "CEM", "Genetic").
+            population_size (int): Population size for CEM and GeneticAlgorithm.
+            **searcher_kwargs: Additional keyword arguments for the selected search algorithm's constructor.
+        """
+        # Call the existing factory function to get the problem and searcher factory
+        self.problem, self.searcher_factory = create_pytorch_problem(
+            inference_handler=inference_handler,
+            bounds=bounds,
+            binary_features=number_binary_features,
+            task=task,
+            algorithm=algorithm,
+            population_size=population_size,
+            **searcher_kwargs
+        )
+        # Store binary_features count to pass it to the run function later
+        self._binary_features = number_binary_features
+
+    def run(self,
+            num_generations: int,
+            target_name: str,
+            save_dir: Union[str, Path],
+            feature_names: Optional[List[str]],
+            save_format: Literal['csv', 'sqlite', 'both'],
+            repetitions: int = 1,
+            verbose: bool = True) -> Optional[dict]:
+        """
+        Runs the evolutionary optimization process using the pre-configured settings.
+
+        Args:
+            num_generations (int): The total number of generations for each repetition.
+            target_name (str): Target name used for the CSV filename and/or SQL table.
+            save_dir (str | Path): The directory where result files will be saved.
+            feature_names (List[str] | None): Names of the solution features for labeling output. If None, generic names like 'feature_0', 'feature_1', ... will be created.
+            save_format (Literal['csv', 'sqlite', 'both']): The format for saving results.
+            repetitions (int): The number of independent times to run the optimization.
+            verbose (bool): If True, enables detailed logging.
+
+        Returns:
+            Optional[dict]: A dictionary with the best result if repetitions is 1, otherwise None.
+        """
+        # Call the existing run function with the stored problem, searcher, and binary feature count
+        return run_optimization(
+            problem=self.problem,
+            searcher_factory=self.searcher_factory,
+            num_generations=num_generations,
+            target_name=target_name,
+            binary_features=self._binary_features,
+            save_dir=save_dir,
+            save_format=save_format,
+            feature_names=feature_names,
+            repetitions=repetitions,
+            verbose=verbose
+        )
+
+
 def create_pytorch_problem(
     inference_handler: PyTorchInferenceHandler,
     bounds: Tuple[List[float], List[float]],
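For reference, `MLOptimizer` is a thin wrapper over the two existing functions, so the equivalent two-step workflow remains available. A sketch mirroring the wrapper's own internal calls (`my_handler`, `lower_bounds`, `upper_bounds`, and `my_feature_names` are assumed to exist, as in the class docstring example):

```python
from ml_tools.ML_optimization import create_pytorch_problem, run_optimization

# Step 1: build the EvoTorch problem and the searcher factory.
problem, searcher_factory = create_pytorch_problem(
    inference_handler=my_handler,
    bounds=(lower_bounds, upper_bounds),
    binary_features=2,
    task="max",
    algorithm="Genetic",
)

# Step 2: run the optimization and save the results.
best_result = run_optimization(
    problem=problem,
    searcher_factory=searcher_factory,
    num_generations=100,
    target_name="my_target",
    binary_features=2,
    save_dir="/path/to/results",
    save_format="csv",
    feature_names=my_feature_names,
    repetitions=1,
    verbose=True,
)
```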
@@ -38,7 +138,7 @@ def create_pytorch_problem(
     """
     Creates and configures an EvoTorch Problem and a Searcher factory class for a PyTorch model.
 
-    SNES and CEM do not accept bounds, the given bounds will be used as initial bounds only.
+    SNES and CEM do not accept bounds; the given bounds will be used as an initial starting point.
 
     The Genetic Algorithm works directly with the bounds, and operators such as SimulatedBinaryCrossOver and GaussianMutation.
 
@@ -62,8 +162,8 @@ def create_pytorch_problem(
 
     # add binary bounds
     if binary_features > 0:
-        lower_bounds.extend([0.45] * binary_features)
-        upper_bounds.extend([0.55] * binary_features)
+        lower_bounds.extend([0.48] * binary_features)
+        upper_bounds.extend([0.52] * binary_features)
 
     solution_length = len(lower_bounds)
     device = inference_handler.device
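The initial bounds for binary slots narrow from [0.45, 0.55] to [0.48, 0.52], so initial candidates start closer to the 0.5 decision boundary and neither 0 nor 1 is favored at initialization. Downstream, the continuous values are snapped to 0/1 (the module imports `threshold_binary_values` for this; its exact signature is not shown in this diff). A hypothetical NumPy sketch of the idea, assuming a 0.5 cutoff on the trailing binary slots:

```python
import numpy as np

def threshold_trailing_binaries(solution: np.ndarray, n_binary: int) -> np.ndarray:
    """Round the last n_binary entries of a solution vector to 0/1.

    Illustrative helper only; the package's threshold_binary_values
    may differ in name, signature, and behavior.
    """
    out = solution.astype(float).copy()
    if n_binary > 0:
        out[-n_binary:] = (out[-n_binary:] >= 0.5).astype(float)
    return out

print(threshold_trailing_binaries(np.array([3.2, 0.49, 0.51]), n_binary=2))
# -> [3.2 0.  1. ]
```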
pyproject.toml:

@@ -1,6 +1,6 @@
 [project]
 name = "dragon-ml-toolbox"
-version = "10.13.0"
+version = "10.15.0"
 description = "A collection of tools for data science and machine learning projects."
 authors = [
     { name = "Karl Loza", email = "luigiloza@gmail.com" }