dragon-ml-toolbox 10.1.1__tar.gz → 10.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dragon-ml-toolbox might be problematic. Click here for more details.

Files changed (41)
  1. {dragon_ml_toolbox-10.1.1/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-10.2.1}/PKG-INFO +1 -1
  2. {dragon_ml_toolbox-10.1.1 → dragon_ml_toolbox-10.2.1/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
  3. {dragon_ml_toolbox-10.1.1 → dragon_ml_toolbox-10.2.1}/ml_tools/ETL_cleaning.py +135 -47
  4. {dragon_ml_toolbox-10.1.1 → dragon_ml_toolbox-10.2.1}/pyproject.toml +1 -1
  5. {dragon_ml_toolbox-10.1.1 → dragon_ml_toolbox-10.2.1}/LICENSE +0 -0
  6. {dragon_ml_toolbox-10.1.1 → dragon_ml_toolbox-10.2.1}/LICENSE-THIRD-PARTY.md +0 -0
  7. {dragon_ml_toolbox-10.1.1 → dragon_ml_toolbox-10.2.1}/README.md +0 -0
  8. {dragon_ml_toolbox-10.1.1 → dragon_ml_toolbox-10.2.1}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
  9. {dragon_ml_toolbox-10.1.1 → dragon_ml_toolbox-10.2.1}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
  10. {dragon_ml_toolbox-10.1.1 → dragon_ml_toolbox-10.2.1}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
  11. {dragon_ml_toolbox-10.1.1 → dragon_ml_toolbox-10.2.1}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
  12. {dragon_ml_toolbox-10.1.1 → dragon_ml_toolbox-10.2.1}/ml_tools/ETL_engineering.py +0 -0
  13. {dragon_ml_toolbox-10.1.1 → dragon_ml_toolbox-10.2.1}/ml_tools/GUI_tools.py +0 -0
  14. {dragon_ml_toolbox-10.1.1 → dragon_ml_toolbox-10.2.1}/ml_tools/MICE_imputation.py +0 -0
  15. {dragon_ml_toolbox-10.1.1 → dragon_ml_toolbox-10.2.1}/ml_tools/ML_callbacks.py +0 -0
  16. {dragon_ml_toolbox-10.1.1 → dragon_ml_toolbox-10.2.1}/ml_tools/ML_datasetmaster.py +0 -0
  17. {dragon_ml_toolbox-10.1.1 → dragon_ml_toolbox-10.2.1}/ml_tools/ML_evaluation.py +0 -0
  18. {dragon_ml_toolbox-10.1.1 → dragon_ml_toolbox-10.2.1}/ml_tools/ML_evaluation_multi.py +0 -0
  19. {dragon_ml_toolbox-10.1.1 → dragon_ml_toolbox-10.2.1}/ml_tools/ML_inference.py +0 -0
  20. {dragon_ml_toolbox-10.1.1 → dragon_ml_toolbox-10.2.1}/ml_tools/ML_models.py +0 -0
  21. {dragon_ml_toolbox-10.1.1 → dragon_ml_toolbox-10.2.1}/ml_tools/ML_optimization.py +0 -0
  22. {dragon_ml_toolbox-10.1.1 → dragon_ml_toolbox-10.2.1}/ml_tools/ML_scaler.py +0 -0
  23. {dragon_ml_toolbox-10.1.1 → dragon_ml_toolbox-10.2.1}/ml_tools/ML_trainer.py +0 -0
  24. {dragon_ml_toolbox-10.1.1 → dragon_ml_toolbox-10.2.1}/ml_tools/PSO_optimization.py +0 -0
  25. {dragon_ml_toolbox-10.1.1 → dragon_ml_toolbox-10.2.1}/ml_tools/RNN_forecast.py +0 -0
  26. {dragon_ml_toolbox-10.1.1 → dragon_ml_toolbox-10.2.1}/ml_tools/SQL.py +0 -0
  27. {dragon_ml_toolbox-10.1.1 → dragon_ml_toolbox-10.2.1}/ml_tools/VIF_factor.py +0 -0
  28. {dragon_ml_toolbox-10.1.1 → dragon_ml_toolbox-10.2.1}/ml_tools/__init__.py +0 -0
  29. {dragon_ml_toolbox-10.1.1 → dragon_ml_toolbox-10.2.1}/ml_tools/_logger.py +0 -0
  30. {dragon_ml_toolbox-10.1.1 → dragon_ml_toolbox-10.2.1}/ml_tools/_script_info.py +0 -0
  31. {dragon_ml_toolbox-10.1.1 → dragon_ml_toolbox-10.2.1}/ml_tools/custom_logger.py +0 -0
  32. {dragon_ml_toolbox-10.1.1 → dragon_ml_toolbox-10.2.1}/ml_tools/data_exploration.py +0 -0
  33. {dragon_ml_toolbox-10.1.1 → dragon_ml_toolbox-10.2.1}/ml_tools/ensemble_evaluation.py +0 -0
  34. {dragon_ml_toolbox-10.1.1 → dragon_ml_toolbox-10.2.1}/ml_tools/ensemble_inference.py +0 -0
  35. {dragon_ml_toolbox-10.1.1 → dragon_ml_toolbox-10.2.1}/ml_tools/ensemble_learning.py +0 -0
  36. {dragon_ml_toolbox-10.1.1 → dragon_ml_toolbox-10.2.1}/ml_tools/handle_excel.py +0 -0
  37. {dragon_ml_toolbox-10.1.1 → dragon_ml_toolbox-10.2.1}/ml_tools/keys.py +0 -0
  38. {dragon_ml_toolbox-10.1.1 → dragon_ml_toolbox-10.2.1}/ml_tools/optimization_tools.py +0 -0
  39. {dragon_ml_toolbox-10.1.1 → dragon_ml_toolbox-10.2.1}/ml_tools/path_manager.py +0 -0
  40. {dragon_ml_toolbox-10.1.1 → dragon_ml_toolbox-10.2.1}/ml_tools/utilities.py +0 -0
  41. {dragon_ml_toolbox-10.1.1 → dragon_ml_toolbox-10.2.1}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dragon-ml-toolbox
3
- Version: 10.1.1
3
+ Version: 10.2.1
4
4
  Summary: A collection of tools for data science and machine learning projects.
5
5
  Author-email: Karl Loza <luigiloza@gmail.com>
6
6
  License-Expression: MIT
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dragon-ml-toolbox
3
- Version: 10.1.1
3
+ Version: 10.2.1
4
4
  Summary: A collection of tools for data science and machine learning projects.
5
5
  Author-email: Karl Loza <luigiloza@gmail.com>
6
6
  License-Expression: MIT
@@ -3,6 +3,7 @@ import pandas as pd
3
3
  from pathlib import Path
4
4
  from typing import Union, List, Dict
5
5
  from .path_manager import sanitize_filename, make_fullpath
6
+ from .data_exploration import drop_macro
6
7
  from .utilities import save_dataframe, load_dataframe
7
8
  from ._script_info import _script_info
8
9
  from ._logger import _LOGGER
@@ -11,6 +12,7 @@ from ._logger import _LOGGER
11
12
  __all__ = [
12
13
  "save_unique_values",
13
14
  "basic_clean",
15
+ "basic_clean_drop",
14
16
  "ColumnCleaner",
15
17
  "DataFrameCleaner"
16
18
  ]
@@ -93,39 +95,8 @@ def save_unique_values(csv_path: Union[str, Path], output_dir: Union[str, Path],
93
95
  _LOGGER.info(f"{counter} files of unique values created.")
94
96
 
95
97
 
96
- ########## Basic df cleaner #############
97
- def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path,None]=None):
98
- """
99
- Performs a comprehensive, standardized cleaning on all columns of a CSV file.
100
-
101
- The cleaning process includes:
102
- - Normalizing full-width and typographical punctuation to standard equivalents.
103
- - Consolidating all internal whitespace (spaces, tabs, newlines) into a single space.
104
- - Stripping any leading or trailing whitespace.
105
- - Converting common textual representations of null (e.g., "N/A", "NULL") to true null values.
106
- - Converting strings that become empty after cleaning into true null values.
107
- - Normalizing all text to lowercase.
108
-
109
- Args:
110
- input_filepath (Union[str, Path]):
111
- The path to the source CSV file to be cleaned.
112
- output_filepath (Union[str, Path, None], optional):
113
- The path to save the cleaned CSV file. If None (default),
114
- the original input file will be overwritten.
115
- """
116
- # Handle paths
117
- input_path = make_fullpath(input_filepath, enforce="file")
118
-
119
- # Unless explicitly defined, overwrite file.
120
- if output_filepath is not None:
121
- parent_dir = make_fullpath(Path(output_filepath).parent, make=True, enforce="directory")
122
- output_path = parent_dir / Path(output_filepath).name
123
- else:
124
- output_path = input_path
125
-
126
- # load polars df
127
- df, _ = load_dataframe(df_path=input_path, kind="polars", all_strings=True)
128
-
98
+ ########## Basic df cleaners #############
99
+ def _cleaner_core(df_in: pl.DataFrame) -> pl.DataFrame:
129
100
  # Cleaning rules
130
101
  cleaning_rules = {
131
102
  # 1. Comprehensive Punctuation & Symbol Normalization
@@ -141,6 +112,7 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
141
112
  '⁶': '6', '⁷': '7', '⁸': '8', '⁹': '9', '⁰': '0',
142
113
  '₁': '1', '₂': '2', '₃': '3', '₄': '4', '₅': '5',
143
114
  '₆': '6', '₇': '7', '₈': '8', '₉': '9', '₀': '0',
115
+ '⁺': '', '⁻': '', '₊': '', '₋': '',
144
116
  # Uppercase Alphabet
145
117
  'A': 'A', 'B': 'B', 'C': 'C', 'D': 'D', 'E': 'E', 'F': 'F',
146
118
  'G': 'G', 'H': 'H', 'I': 'I', 'J': 'J', 'K': 'K', 'L': 'L',
@@ -154,26 +126,37 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
154
126
  's': 's', 't': 't', 'u': 'u', 'v': 'v', 'w': 'w', 'x': 'x',
155
127
  'y': 'y', 'z': 'z',
156
128
  # Punctuation
157
- '》': '>', '《': '<', ':': ':', ',': ',', '。': '.', ';': ';', '【': '[', '】': ']',
129
+ '》': '>', '《': '<', ':': ':', '。': '.', ';': ';', '【': '[', '】': ']',
158
130
  '(': '(', ')': ')', '?': '?', '!': '!', '~': '~', '@': '@', '#': '#', '+': '+', '-': '-',
159
- '$': '$', '%': '%', '^': '^', '&': '&', '*': '*', '\': '\\', '|': '|', '、':',', '≈':'=',
131
+ '$': '$', '%': '%', '^': '^', '&': '&', '*': '*', '\': '-', '|': '|', '≈':'=',
132
+
133
+ # Commas (avoid commas in entries)
134
+ ',': ';',
135
+ ',': ';',
136
+ '、':';',
160
137
 
161
138
  # Others
162
139
  '©': '',
163
140
  '®': '',
164
141
  '™': '',
142
+ r'[°˚]': '',
143
+
144
+ # Replace special characters in entries
145
+ r'\\': '-',
146
+ '/': '-',
147
+
148
+ # Typographical standardization
149
+ # Unify various dashes and hyphens to a standard hyphen
150
+ r'[—–―]': '-',
151
+ r'−': '-',
152
+ # remove various quote types
153
+ r'[“”"]': '',
154
+ r"[‘’′']": '',
165
155
 
166
156
  # Collapse repeating punctuation
167
157
  r'\.{2,}': '.', # Replace two or more dots with a single dot
168
158
  r'\?{2,}': '?', # Replace two or more question marks with a single question mark
169
159
  r'!{2,}': '!', # Replace two or more exclamation marks with a single one
170
-
171
- # Typographical standardization
172
- # Unify various dashes and hyphens to a standard hyphen-minus
173
- r'[—–―]': '-',
174
- # Unify various quote types to standard quotes
175
- r'[“”]': "'",
176
- r'[‘’′]': "'",
177
160
 
178
161
  # 2. Internal Whitespace Consolidation
179
162
  # Collapse any sequence of whitespace chars (including non-breaking spaces) to a single space
@@ -184,33 +167,138 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
184
167
  r'^\s+|\s+$': '',
185
168
 
186
169
  # 4. Textual Null Standardization (New Step)
187
- # Convert common null-like text to actual nulls. (?i) makes it case-insensitive.
170
+ # Convert common null-like text to actual nulls.
188
171
  r'^(N/A|无|NA|NULL|NONE|NIL|)$': None,
189
172
 
190
173
  # 5. Final Nullification of Empty Strings
191
174
  # After all cleaning, if a string is now empty, convert it to a null
192
- r'^$': None
175
+ r'^\s*$': None,
176
+ r'^$': None,
193
177
  }
194
178
 
195
179
  # Clean data
196
180
  try:
197
181
  # Create a cleaner for every column in the dataframe
198
- all_columns = df.columns
182
+ all_columns = df_in.columns
199
183
  column_cleaners = [
200
184
  ColumnCleaner(col, rules=cleaning_rules, case_insensitive=True) for col in all_columns
201
185
  ]
202
186
 
203
187
  # Instantiate and run the main dataframe cleaner
204
188
  df_cleaner = DataFrameCleaner(cleaners=column_cleaners)
205
- df_cleaned = df_cleaner.clean(df, clone_df=False) # Use clone_df=False for efficiency
189
+ df_cleaned = df_cleaner.clean(df_in, clone_df=False) # Use clone_df=False for efficiency
206
190
 
207
191
  # apply lowercase to all string columns
208
192
  df_final = df_cleaned.with_columns(
209
193
  pl.col(pl.String).str.to_lowercase()
210
194
  )
211
195
  except Exception as e:
212
- _LOGGER.error(f"An error occurred during the cleaning process for '{input_path.name}'.")
196
+ _LOGGER.error(f"An error occurred during the cleaning process.")
213
197
  raise e
198
+ else:
199
+ return df_final
200
+
201
+
202
+ def _path_manager(path_in: Union[str,Path], path_out: Union[str,Path]):
203
+ # Handle paths
204
+ input_path = make_fullpath(path_in, enforce="file")
205
+
206
+ parent_dir = make_fullpath(Path(path_out).parent, make=True, enforce="directory")
207
+ output_path = parent_dir / Path(path_out).name
208
+
209
+ return input_path, output_path
210
+
211
+
212
+ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path]):
213
+ """
214
+ Performs a comprehensive, standardized cleaning on all columns of a CSV file.
215
+
216
+ The cleaning process includes:
217
+ - Normalizing full-width and typographical punctuation to standard equivalents.
218
+ - Consolidating all internal whitespace (spaces, tabs, newlines) into a single space.
219
+ - Stripping any leading or trailing whitespace.
220
+ - Converting common textual representations of null (e.g., "N/A", "NULL") to true null values.
221
+ - Converting strings that become empty after cleaning into true null values.
222
+ - Normalizing all text to lowercase.
223
+
224
+ Args:
225
+ input_filepath (Union[str, Path]):
226
+ The path to the source CSV file to be cleaned.
227
+ output_filepath (Union[str, Path, None], optional):
228
+ The path to save the cleaned CSV file.
229
+ """
230
+ # Handle paths
231
+ input_path, output_path = _path_manager(path_in=input_filepath, path_out=output_filepath)
232
+
233
+ # load polars df
234
+ df, _ = load_dataframe(df_path=input_path, kind="polars", all_strings=True)
235
+
236
+ # CLEAN
237
+ df_final = _cleaner_core(df)
238
+
239
+ # Save cleaned dataframe
240
+ save_dataframe(df=df_final, save_dir=output_path.parent, filename=output_path.name)
241
+
242
+ _LOGGER.info(f"Data successfully cleaned.")
243
+
244
+
245
+ def basic_clean_drop(input_filepath: Union[str,Path], output_filepath: Union[str,Path], log_directory: Union[str,Path], targets: list[str],
246
+ skip_targets: bool=False, threshold: float=0.8):
247
+ """
248
+ Performs standardized cleaning followed by iterative removal of rows and
249
+ columns with excessive missing data.
250
+
251
+ This function combines the functionality of `basic_clean` and `drop_macro`. It first
252
+ applies a comprehensive normalization process to all columns in the input CSV file,
253
+ ensuring consistent formatting and proper null value handling. The cleaned data is then
254
+ converted to a pandas DataFrame, where iterative row and column dropping is applied
255
+ to remove redundant or incomplete data.
256
+
257
+ The iterative dropping cycle continues until no further rows or columns meet the
258
+ removal criteria, ensuring that dependencies between row and column deletions are
259
+ fully resolved. Logs documenting the missing data profile before and after the
260
+ dropping process are saved to the specified log directory.
261
+
262
+ Args:
263
+ input_filepath (str, Path):
264
+ The path to the source CSV file to be cleaned.
265
+ output_filepath (str, Path):
266
+ The path to save the fully cleaned CSV file after cleaning
267
+ and missing-data-based pruning.
268
+ log_directory (str, Path):
269
+ Path to the directory where missing data reports will be stored.
270
+ targets (list[str]):
271
+ A list of column names to be treated as target variables.
272
+ This list guides the row-dropping logic.
273
+ skip_targets (bool):
274
+ If True, the columns listed in `targets` will be exempt from being dropped,
275
+ even if they exceed the missing data threshold.
276
+ threshold (float):
277
+ The proportion of missing data required to drop a row or column.
278
+ For example, 0.8 means a row/column will be dropped if 80% or more
279
+ of its data is missing.
280
+ """
281
+ # handle log path
282
+ log_path = make_fullpath(log_directory, make=True, enforce="directory")
283
+
284
+ # Handle df paths
285
+ input_path, output_path = _path_manager(path_in=input_filepath, path_out=output_filepath)
286
+
287
+ # load polars df
288
+ df, _ = load_dataframe(df_path=input_path, kind="polars", all_strings=True)
289
+
290
+ # CLEAN
291
+ df_cleaned = _cleaner_core(df)
292
+
293
+ # switch to pandas
294
+ df_cleaned_pandas = df_cleaned.to_pandas()
295
+
296
+ # Drop macro
297
+ df_final = drop_macro(df=df_cleaned_pandas,
298
+ log_directory=log_path,
299
+ targets=targets,
300
+ skip_targets=skip_targets,
301
+ threshold=threshold)
214
302
 
215
303
  # Save cleaned dataframe
216
304
  save_dataframe(df=df_final, save_dir=output_path.parent, filename=output_path.name)
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "dragon-ml-toolbox"
3
- version = "10.1.1"
3
+ version = "10.2.1"
4
4
  description = "A collection of tools for data science and machine learning projects."
5
5
  authors = [
6
6
  { name = "Karl Loza", email = "luigiloza@gmail.com" }