dragon-ml-toolbox 10.1.0__tar.gz → 10.2.0__tar.gz

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (41)
  1. {dragon_ml_toolbox-10.1.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-10.2.0}/PKG-INFO +1 -1
  2. {dragon_ml_toolbox-10.1.0 → dragon_ml_toolbox-10.2.0/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
  3. {dragon_ml_toolbox-10.1.0 → dragon_ml_toolbox-10.2.0}/ml_tools/ETL_cleaning.py +123 -42
  4. {dragon_ml_toolbox-10.1.0 → dragon_ml_toolbox-10.2.0}/pyproject.toml +1 -1
  5. {dragon_ml_toolbox-10.1.0 → dragon_ml_toolbox-10.2.0}/LICENSE +0 -0
  6. {dragon_ml_toolbox-10.1.0 → dragon_ml_toolbox-10.2.0}/LICENSE-THIRD-PARTY.md +0 -0
  7. {dragon_ml_toolbox-10.1.0 → dragon_ml_toolbox-10.2.0}/README.md +0 -0
  8. {dragon_ml_toolbox-10.1.0 → dragon_ml_toolbox-10.2.0}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
  9. {dragon_ml_toolbox-10.1.0 → dragon_ml_toolbox-10.2.0}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
  10. {dragon_ml_toolbox-10.1.0 → dragon_ml_toolbox-10.2.0}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
  11. {dragon_ml_toolbox-10.1.0 → dragon_ml_toolbox-10.2.0}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
  12. {dragon_ml_toolbox-10.1.0 → dragon_ml_toolbox-10.2.0}/ml_tools/ETL_engineering.py +0 -0
  13. {dragon_ml_toolbox-10.1.0 → dragon_ml_toolbox-10.2.0}/ml_tools/GUI_tools.py +0 -0
  14. {dragon_ml_toolbox-10.1.0 → dragon_ml_toolbox-10.2.0}/ml_tools/MICE_imputation.py +0 -0
  15. {dragon_ml_toolbox-10.1.0 → dragon_ml_toolbox-10.2.0}/ml_tools/ML_callbacks.py +0 -0
  16. {dragon_ml_toolbox-10.1.0 → dragon_ml_toolbox-10.2.0}/ml_tools/ML_datasetmaster.py +0 -0
  17. {dragon_ml_toolbox-10.1.0 → dragon_ml_toolbox-10.2.0}/ml_tools/ML_evaluation.py +0 -0
  18. {dragon_ml_toolbox-10.1.0 → dragon_ml_toolbox-10.2.0}/ml_tools/ML_evaluation_multi.py +0 -0
  19. {dragon_ml_toolbox-10.1.0 → dragon_ml_toolbox-10.2.0}/ml_tools/ML_inference.py +0 -0
  20. {dragon_ml_toolbox-10.1.0 → dragon_ml_toolbox-10.2.0}/ml_tools/ML_models.py +0 -0
  21. {dragon_ml_toolbox-10.1.0 → dragon_ml_toolbox-10.2.0}/ml_tools/ML_optimization.py +0 -0
  22. {dragon_ml_toolbox-10.1.0 → dragon_ml_toolbox-10.2.0}/ml_tools/ML_scaler.py +0 -0
  23. {dragon_ml_toolbox-10.1.0 → dragon_ml_toolbox-10.2.0}/ml_tools/ML_trainer.py +0 -0
  24. {dragon_ml_toolbox-10.1.0 → dragon_ml_toolbox-10.2.0}/ml_tools/PSO_optimization.py +0 -0
  25. {dragon_ml_toolbox-10.1.0 → dragon_ml_toolbox-10.2.0}/ml_tools/RNN_forecast.py +0 -0
  26. {dragon_ml_toolbox-10.1.0 → dragon_ml_toolbox-10.2.0}/ml_tools/SQL.py +0 -0
  27. {dragon_ml_toolbox-10.1.0 → dragon_ml_toolbox-10.2.0}/ml_tools/VIF_factor.py +0 -0
  28. {dragon_ml_toolbox-10.1.0 → dragon_ml_toolbox-10.2.0}/ml_tools/__init__.py +0 -0
  29. {dragon_ml_toolbox-10.1.0 → dragon_ml_toolbox-10.2.0}/ml_tools/_logger.py +0 -0
  30. {dragon_ml_toolbox-10.1.0 → dragon_ml_toolbox-10.2.0}/ml_tools/_script_info.py +0 -0
  31. {dragon_ml_toolbox-10.1.0 → dragon_ml_toolbox-10.2.0}/ml_tools/custom_logger.py +0 -0
  32. {dragon_ml_toolbox-10.1.0 → dragon_ml_toolbox-10.2.0}/ml_tools/data_exploration.py +0 -0
  33. {dragon_ml_toolbox-10.1.0 → dragon_ml_toolbox-10.2.0}/ml_tools/ensemble_evaluation.py +0 -0
  34. {dragon_ml_toolbox-10.1.0 → dragon_ml_toolbox-10.2.0}/ml_tools/ensemble_inference.py +0 -0
  35. {dragon_ml_toolbox-10.1.0 → dragon_ml_toolbox-10.2.0}/ml_tools/ensemble_learning.py +0 -0
  36. {dragon_ml_toolbox-10.1.0 → dragon_ml_toolbox-10.2.0}/ml_tools/handle_excel.py +0 -0
  37. {dragon_ml_toolbox-10.1.0 → dragon_ml_toolbox-10.2.0}/ml_tools/keys.py +0 -0
  38. {dragon_ml_toolbox-10.1.0 → dragon_ml_toolbox-10.2.0}/ml_tools/optimization_tools.py +0 -0
  39. {dragon_ml_toolbox-10.1.0 → dragon_ml_toolbox-10.2.0}/ml_tools/path_manager.py +0 -0
  40. {dragon_ml_toolbox-10.1.0 → dragon_ml_toolbox-10.2.0}/ml_tools/utilities.py +0 -0
  41. {dragon_ml_toolbox-10.1.0 → dragon_ml_toolbox-10.2.0}/setup.cfg +0 -0
{dragon_ml_toolbox-10.1.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-10.2.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 10.1.0
+Version: 10.2.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
{dragon_ml_toolbox-10.1.0 → dragon_ml_toolbox-10.2.0/dragon_ml_toolbox.egg-info}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 10.1.0
+Version: 10.2.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
{dragon_ml_toolbox-10.1.0 → dragon_ml_toolbox-10.2.0}/ml_tools/ETL_cleaning.py

@@ -3,6 +3,7 @@ import pandas as pd
 from pathlib import Path
 from typing import Union, List, Dict
 from .path_manager import sanitize_filename, make_fullpath
+from .data_exploration import drop_macro
 from .utilities import save_dataframe, load_dataframe
 from ._script_info import _script_info
 from ._logger import _LOGGER
@@ -11,13 +12,14 @@ from ._logger import _LOGGER
 __all__ = [
     "save_unique_values",
     "basic_clean",
+    "basic_clean_drop",
     "ColumnCleaner",
     "DataFrameCleaner"
 ]
 
 
 ################ Unique Values per column #################
-def save_unique_values(csv_path: Union[str, Path], output_dir: Union[str, Path]) -> None:
+def save_unique_values(csv_path: Union[str, Path], output_dir: Union[str, Path], verbose: bool=False) -> None:
     """
     Loads a CSV file, then analyzes it and saves the unique non-null values
     from each column into a separate text file exactly as they appear.
@@ -50,6 +52,7 @@ def save_unique_values(csv_path: Union[str, Path], output_dir: Union[str, Path])
     _LOGGER.info(f"Data loaded from '{csv_path}'")
 
     # --- 3. Process Each Column ---
+    counter = 0
     for i, column_name in enumerate(df.columns):
         # _LOGGER.info(f"Processing column: '{column_name}'...")
 
@@ -85,44 +88,15 @@ def save_unique_values(csv_path: Union[str, Path], output_dir: Union[str, Path])
         except IOError:
             _LOGGER.exception(f"Error writing to file {file_path}.")
         else:
-            _LOGGER.info(f"Successfully saved {len(sorted_uniques)} unique values from '{column_name}'.")
-
-    _LOGGER.info("Process complete.")
+            if verbose:
+                _LOGGER.info(f"Successfully saved {len(sorted_uniques)} unique values from '{column_name}'.")
+            counter += 1
 
+    _LOGGER.info(f"{counter} files of unique values created.")
 
-########## Basic df cleaner #############
-def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path,None]=None):
-    """
-    Performs a comprehensive, standardized cleaning on all columns of a CSV file.
 
-    The cleaning process includes:
-    - Normalizing full-width and typographical punctuation to standard equivalents.
-    - Consolidating all internal whitespace (spaces, tabs, newlines) into a single space.
-    - Stripping any leading or trailing whitespace.
-    - Converting common textual representations of null (e.g., "N/A", "NULL") to true null values.
-    - Converting strings that become empty after cleaning into true null values.
-    - Normalizing all text to lowercase.
-
-    Args:
-        input_filepath (Union[str, Path]):
-            The path to the source CSV file to be cleaned.
-        output_filepath (Union[str, Path, None], optional):
-            The path to save the cleaned CSV file. If None (default),
-            the original input file will be overwritten.
-    """
-    # Handle paths
-    input_path = make_fullpath(input_filepath, enforce="file")
-
-    # Unless explicitly defined, overwrite file.
-    if output_filepath is not None:
-        parent_dir = make_fullpath(Path(output_filepath).parent, make=True, enforce="directory")
-        output_path = parent_dir / Path(output_filepath).name
-    else:
-        output_path = input_path
-
-    # load polars df
-    df, _ = load_dataframe(df_path=input_path, kind="polars", all_strings=True)
-
+########## Basic df cleaners #############
+def _cleaner_core(df_in: pl.DataFrame) -> pl.DataFrame:
     # Cleaning rules
     cleaning_rules = {
         # 1. Comprehensive Punctuation & Symbol Normalization
@@ -138,6 +112,7 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
         '⁶': '6', '⁷': '7', '⁸': '8', '⁹': '9', '⁰': '0',
         '₁': '1', '₂': '2', '₃': '3', '₄': '4', '₅': '5',
         '₆': '6', '₇': '7', '₈': '8', '₉': '9', '₀': '0',
+        '⁺': '', '⁻': '', '₊': '', '₋': '',
         # Uppercase Alphabet
         'A': 'A', 'B': 'B', 'C': 'C', 'D': 'D', 'E': 'E', 'F': 'F',
         'G': 'G', 'H': 'H', 'I': 'I', 'J': 'J', 'K': 'K', 'L': 'L',
@@ -151,7 +126,7 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
         's': 's', 't': 't', 'u': 'u', 'v': 'v', 'w': 'w', 'x': 'x',
         'y': 'y', 'z': 'z',
         # Punctuation
-        '》': '>', '《': '<', ':': ':', ',': ',', '。': '.', ';': ';', '【': '[', '】': ']',
+        '》': '>', '《': '<', ':': ':', ',': ',', '。': '.', ';': ';', '【': '[', '】': ']',
         '(': '(', ')': ')', '?': '?', '!': '!', '~': '~', '@': '@', '#': '#', '+': '+', '-': '-',
         '$': '$', '%': '%', '^': '^', '&': '&', '*': '*', '\': '\\', '|': '|', '、':',', '≈':'=',
 
@@ -168,6 +143,7 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
         # Typographical standardization
         # Unify various dashes and hyphens to a standard hyphen-minus
         r'[—–―]': '-',
+        r'−': '-',
         # Unify various quote types to standard quotes
         r'[“”]': "'",
         r'[‘’′]': "'",
@@ -181,33 +157,138 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
         r'^\s+|\s+$': '',
 
         # 4. Textual Null Standardization (New Step)
-        # Convert common null-like text to actual nulls. (?i) makes it case-insensitive.
+        # Convert common null-like text to actual nulls.
         r'^(N/A|无|NA|NULL|NONE|NIL|)$': None,
 
         # 5. Final Nullification of Empty Strings
         # After all cleaning, if a string is now empty, convert it to a null
-        r'^$': None
+        r'^\s*$': None,
+        r'^$': None,
     }
 
     # Clean data
     try:
         # Create a cleaner for every column in the dataframe
-        all_columns = df.columns
+        all_columns = df_in.columns
         column_cleaners = [
             ColumnCleaner(col, rules=cleaning_rules, case_insensitive=True) for col in all_columns
         ]
 
         # Instantiate and run the main dataframe cleaner
         df_cleaner = DataFrameCleaner(cleaners=column_cleaners)
-        df_cleaned = df_cleaner.clean(df, clone_df=False) # Use clone_df=False for efficiency
+        df_cleaned = df_cleaner.clean(df_in, clone_df=False) # Use clone_df=False for efficiency
 
         # apply lowercase to all string columns
         df_final = df_cleaned.with_columns(
             pl.col(pl.String).str.to_lowercase()
         )
     except Exception as e:
-        _LOGGER.error(f"An error occurred during the cleaning process for '{input_path.name}'.")
+        _LOGGER.error(f"An error occurred during the cleaning process.")
         raise e
+    else:
+        return df_final
+
+
+def _path_manager(path_in: Union[str,Path], path_out: Union[str,Path]):
+    # Handle paths
+    input_path = make_fullpath(path_in, enforce="file")
+
+    parent_dir = make_fullpath(Path(path_out).parent, make=True, enforce="directory")
+    output_path = parent_dir / Path(path_out).name
+
+    return input_path, output_path
+
+
+def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path]):
+    """
+    Performs a comprehensive, standardized cleaning on all columns of a CSV file.
+
+    The cleaning process includes:
+    - Normalizing full-width and typographical punctuation to standard equivalents.
+    - Consolidating all internal whitespace (spaces, tabs, newlines) into a single space.
+    - Stripping any leading or trailing whitespace.
+    - Converting common textual representations of null (e.g., "N/A", "NULL") to true null values.
+    - Converting strings that become empty after cleaning into true null values.
+    - Normalizing all text to lowercase.
+
+    Args:
+        input_filepath (Union[str, Path]):
+            The path to the source CSV file to be cleaned.
+        output_filepath (Union[str, Path, None], optional):
+            The path to save the cleaned CSV file.
+    """
+    # Handle paths
+    input_path, output_path = _path_manager(path_in=input_filepath, path_out=output_filepath)
+
+    # load polars df
+    df, _ = load_dataframe(df_path=input_path, kind="polars", all_strings=True)
+
+    # CLEAN
+    df_final = _cleaner_core(df)
+
+    # Save cleaned dataframe
+    save_dataframe(df=df_final, save_dir=output_path.parent, filename=output_path.name)
+
+    _LOGGER.info(f"Data successfully cleaned.")
+
+
+def basic_clean_drop(input_filepath: Union[str,Path], output_filepath: Union[str,Path], log_directory: Union[str,Path], targets: list[str],
+                     skip_targets: bool=False, threshold: float=0.8):
+    """
+    Performs standardized cleaning followed by iterative removal of rows and
+    columns with excessive missing data.
+
+    This function combines the functionality of `basic_clean` and `drop_macro`. It first
+    applies a comprehensive normalization process to all columns in the input CSV file,
+    ensuring consistent formatting and proper null value handling. The cleaned data is then
+    converted to a pandas DataFrame, where iterative row and column dropping is applied
+    to remove redundant or incomplete data.
+
+    The iterative dropping cycle continues until no further rows or columns meet the
+    removal criteria, ensuring that dependencies between row and column deletions are
+    fully resolved. Logs documenting the missing data profile before and after the
+    dropping process are saved to the specified log directory.
+
+    Args:
+        input_filepath (str, Path):
+            The path to the source CSV file to be cleaned.
+        output_filepath (str, Path):
+            The path to save the fully cleaned CSV file after cleaning
+            and missing-data-based pruning.
+        log_directory (str, Path):
+            Path to the directory where missing data reports will be stored.
+        targets (list[str]):
+            A list of column names to be treated as target variables.
+            This list guides the row-dropping logic.
+        skip_targets (bool):
+            If True, the columns listed in `targets` will be exempt from being dropped,
+            even if they exceed the missing data threshold.
+        threshold (float):
+            The proportion of missing data required to drop a row or column.
+            For example, 0.8 means a row/column will be dropped if 80% or more
+            of its data is missing.
+    """
+    # handle log path
+    log_path = make_fullpath(log_directory, make=True, enforce="directory")
+
+    # Handle df paths
+    input_path, output_path = _path_manager(path_in=input_filepath, path_out=output_filepath)
+
+    # load polars df
+    df, _ = load_dataframe(df_path=input_path, kind="polars", all_strings=True)
+
+    # CLEAN
+    df_cleaned = _cleaner_core(df)
+
+    # switch to pandas
+    df_cleaned_pandas = df_cleaned.to_pandas()
+
+    # Drop macro
+    df_final = drop_macro(df=df_cleaned_pandas,
+                          log_directory=log_path,
+                          targets=targets,
+                          skip_targets=skip_targets,
+                          threshold=threshold)
 
     # Save cleaned dataframe
     save_dataframe(df=df_final, save_dir=output_path.parent, filename=output_path.name)
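
Taken together, the ETL_cleaning.py changes factor the old `basic_clean` body into a shared `_cleaner_core`, make `output_filepath` a required argument (10.1.0 overwrote the input when it was omitted), quiet down `save_unique_values` behind a `verbose` flag, and add `basic_clean_drop`, which chains the normalization pass into `drop_macro` for missing-data pruning. Below is a hedged usage sketch of the new public API, based only on the signatures and docstrings in this diff; the import path follows the `ml_tools` package layout shown in the file list, and all file paths and column names are hypothetical placeholders.

```python
# Usage sketch for the 10.2.0 ETL_cleaning API (paths/columns are hypothetical).
from ml_tools.ETL_cleaning import save_unique_values, basic_clean, basic_clean_drop

# One text file of unique values per column; with verbose=False (the new default)
# only the final "{n} files of unique values created." summary is logged.
save_unique_values("data/raw.csv", "reports/unique_values", verbose=False)

# Normalization only: punctuation/width normalization, whitespace consolidation,
# textual-null conversion, and lowercasing. output_filepath is now required.
basic_clean("data/raw.csv", "data/clean.csv")

# Normalization plus iterative dropping of rows/columns with >= 80% missing data;
# missing-data reports are written to the log directory.
basic_clean_drop(
    input_filepath="data/raw.csv",
    output_filepath="data/clean_pruned.csv",
    log_directory="reports/missing_data",
    targets=["target_1", "target_2"],  # hypothetical target column names
    skip_targets=True,                 # exempt the targets from column dropping
    threshold=0.8,
)
```

Splitting the path handling and cleaning rules out of `basic_clean` lets both entry points share one rule set, so the two outputs differ only in the pruning step.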
{dragon_ml_toolbox-10.1.0 → dragon_ml_toolbox-10.2.0}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "dragon-ml-toolbox"
-version = "10.1.0"
+version = "10.2.0"
 description = "A collection of tools for data science and machine learning projects."
 authors = [
     { name = "Karl Loza", email = "luigiloza@gmail.com" }
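
Outside ETL_cleaning.py, only the version metadata changes (PKG-INFO and pyproject.toml), so after upgrading it is enough to confirm which release is installed. A minimal sketch, assuming the package was installed from PyPI with pip:

```python
# Confirm the installed release of dragon-ml-toolbox (standard library only).
from importlib.metadata import version

print(version("dragon-ml-toolbox"))  # expected output: 10.2.0
```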