dragon-ml-toolbox 10.13.0__tar.gz → 10.14.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dragon-ml-toolbox might be problematic. Click here for more details.

Files changed (41) hide show
  1. {dragon_ml_toolbox-10.13.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-10.14.0}/PKG-INFO +1 -1
  2. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.14.0/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
  3. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.14.0}/ml_tools/ETL_cleaning.py +28 -16
  4. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.14.0}/pyproject.toml +1 -1
  5. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.14.0}/LICENSE +0 -0
  6. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.14.0}/LICENSE-THIRD-PARTY.md +0 -0
  7. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.14.0}/README.md +0 -0
  8. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.14.0}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
  9. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.14.0}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
  10. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.14.0}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
  11. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.14.0}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
  12. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.14.0}/ml_tools/ETL_engineering.py +0 -0
  13. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.14.0}/ml_tools/GUI_tools.py +0 -0
  14. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.14.0}/ml_tools/MICE_imputation.py +0 -0
  15. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.14.0}/ml_tools/ML_callbacks.py +0 -0
  16. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.14.0}/ml_tools/ML_datasetmaster.py +0 -0
  17. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.14.0}/ml_tools/ML_evaluation.py +0 -0
  18. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.14.0}/ml_tools/ML_evaluation_multi.py +0 -0
  19. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.14.0}/ml_tools/ML_inference.py +0 -0
  20. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.14.0}/ml_tools/ML_models.py +0 -0
  21. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.14.0}/ml_tools/ML_optimization.py +0 -0
  22. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.14.0}/ml_tools/ML_scaler.py +0 -0
  23. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.14.0}/ml_tools/ML_trainer.py +0 -0
  24. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.14.0}/ml_tools/PSO_optimization.py +0 -0
  25. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.14.0}/ml_tools/RNN_forecast.py +0 -0
  26. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.14.0}/ml_tools/SQL.py +0 -0
  27. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.14.0}/ml_tools/VIF_factor.py +0 -0
  28. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.14.0}/ml_tools/__init__.py +0 -0
  29. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.14.0}/ml_tools/_logger.py +0 -0
  30. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.14.0}/ml_tools/_script_info.py +0 -0
  31. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.14.0}/ml_tools/custom_logger.py +0 -0
  32. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.14.0}/ml_tools/data_exploration.py +0 -0
  33. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.14.0}/ml_tools/ensemble_evaluation.py +0 -0
  34. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.14.0}/ml_tools/ensemble_inference.py +0 -0
  35. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.14.0}/ml_tools/ensemble_learning.py +0 -0
  36. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.14.0}/ml_tools/handle_excel.py +0 -0
  37. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.14.0}/ml_tools/keys.py +0 -0
  38. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.14.0}/ml_tools/optimization_tools.py +0 -0
  39. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.14.0}/ml_tools/path_manager.py +0 -0
  40. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.14.0}/ml_tools/utilities.py +0 -0
  41. {dragon_ml_toolbox-10.13.0 → dragon_ml_toolbox-10.14.0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dragon-ml-toolbox
3
- Version: 10.13.0
3
+ Version: 10.14.0
4
4
  Summary: A collection of tools for data science and machine learning projects.
5
5
  Author-email: Karl Loza <luigiloza@gmail.com>
6
6
  License-Expression: MIT
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dragon-ml-toolbox
3
- Version: 10.13.0
3
+ Version: 10.14.0
4
4
  Summary: A collection of tools for data science and machine learning projects.
5
5
  Author-email: Karl Loza <luigiloza@gmail.com>
6
6
  License-Expression: MIT
@@ -96,7 +96,7 @@ def save_unique_values(csv_path: Union[str, Path], output_dir: Union[str, Path],
96
96
 
97
97
 
98
98
  ########## Basic df cleaners #############
99
- def _cleaner_core(df_in: pl.DataFrame) -> pl.DataFrame:
99
+ def _cleaner_core(df_in: pl.DataFrame, all_lowercase: bool) -> pl.DataFrame:
100
100
  # Cleaning rules
101
101
  cleaning_rules = {
102
102
  # 1. Comprehensive Punctuation & Symbol Normalization
@@ -128,7 +128,7 @@ def _cleaner_core(df_in: pl.DataFrame) -> pl.DataFrame:
128
128
  # Punctuation
129
129
  '》': '>', '《': '<', '：': ':', '。': '.', '；': ';', '【': '[', '】': ']',
130
130
  '（': '(', '）': ')', '？': '?', '！': '!', '～': '~', '＠': '@', '＃': '#', '＋': '+', '－': '-',
131
- '＄': '$', '％': '%', '＾': '^', '＆': '&', '＊': '*', '＼': '-', '｜': '|', '≈':'=',
131
+ '＄': '$', '％': '%', '＾': '^', '＆': '&', '＊': '*', '＼': '-', '｜': '|', '≈':'=', '·': '-',
132
132
 
133
133
  # Commas (avoid commas in entries)
134
134
  ',': ';',
@@ -159,6 +159,9 @@ def _cleaner_core(df_in: pl.DataFrame) -> pl.DataFrame:
159
159
  r'!{2,}': '!', # Replace two or more exclamation marks with a single one
160
160
  r';{2,}': ';',
161
161
  r'-{2,}': '-',
162
+ r'/{2,}': '/',
163
+ r'%{2,}': '%',
164
+ r'&{2,}': '&',
162
165
 
163
166
  # 2. Internal Whitespace Consolidation
164
167
  # Collapse any sequence of whitespace chars (including non-breaking spaces) to a single space
@@ -170,7 +173,7 @@ def _cleaner_core(df_in: pl.DataFrame) -> pl.DataFrame:
170
173
 
171
174
  # 4. Textual Null Standardization (New Step)
172
175
  # Convert common null-like text to actual nulls.
173
- r'^(N/A|无|NA|NULL|NONE|NIL|-|\.|;)$': None,
176
+ r'^(N/A|无|NA|NULL|NONE|NIL|-|\.|;|/|%|&)$': None,
174
177
 
175
178
  # 5. Final Nullification of Empty Strings
176
179
  # After all cleaning, if a string is now empty, convert it to a null
@@ -191,9 +194,13 @@ def _cleaner_core(df_in: pl.DataFrame) -> pl.DataFrame:
191
194
  df_cleaned = df_cleaner.clean(df_in, clone_df=False) # Use clone_df=False for efficiency
192
195
 
193
196
  # apply lowercase to all string columns
194
- df_final = df_cleaned.with_columns(
195
- pl.col(pl.String).str.to_lowercase()
196
- )
197
+ if all_lowercase:
198
+ df_final = df_cleaned.with_columns(
199
+ pl.col(pl.String).str.to_lowercase()
200
+ )
201
+ else:
202
+ df_final = df_cleaned
203
+
197
204
  except Exception as e:
198
205
  _LOGGER.error(f"An error occurred during the cleaning process.")
199
206
  raise e
@@ -211,7 +218,7 @@ def _path_manager(path_in: Union[str,Path], path_out: Union[str,Path]):
211
218
  return input_path, output_path
212
219
 
213
220
 
214
- def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path]):
221
+ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path], all_lowercase: bool=True):
215
222
  """
216
223
  Performs a comprehensive, standardized cleaning on all columns of a CSV file.
217
224
 
@@ -221,13 +228,16 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
221
228
  - Stripping any leading or trailing whitespace.
222
229
  - Converting common textual representations of null (e.g., "N/A", "NULL") to true null values.
223
230
  - Converting strings that become empty after cleaning into true null values.
224
- - Normalizing all text to lowercase.
231
+ - Normalizing all text to lowercase (Optional).
225
232
 
226
233
  Args:
227
- input_filepath (Union[str, Path]):
234
+ input_filepath (str | Path):
228
235
  The path to the source CSV file to be cleaned.
229
- output_filepath (Union[str, Path, None], optional):
236
+ output_filepath (str | Path):
230
237
  The path to save the cleaned CSV file.
238
+ all_lowercase (bool):
239
+ Whether to normalize all text to lowercase.
240
+
231
241
  """
232
242
  # Handle paths
233
243
  input_path, output_path = _path_manager(path_in=input_filepath, path_out=output_filepath)
@@ -236,7 +246,7 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
236
246
  df, _ = load_dataframe(df_path=input_path, kind="polars", all_strings=True)
237
247
 
238
248
  # CLEAN
239
- df_final = _cleaner_core(df)
249
+ df_final = _cleaner_core(df_in=df, all_lowercase=all_lowercase)
240
250
 
241
251
  # Save cleaned dataframe
242
252
  save_dataframe(df=df_final, save_dir=output_path.parent, filename=output_path.name)
@@ -245,7 +255,7 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
245
255
 
246
256
 
247
257
  def basic_clean_drop(input_filepath: Union[str,Path], output_filepath: Union[str,Path], log_directory: Union[str,Path], targets: list[str],
248
- skip_targets: bool=False, threshold: float=0.8):
258
+ skip_targets: bool=False, threshold: float=0.8, all_lowercase: bool=True):
249
259
  """
250
260
  Performs standardized cleaning followed by iterative removal of rows and
251
261
  columns with excessive missing data.
@@ -262,12 +272,12 @@ def basic_clean_drop(input_filepath: Union[str,Path], output_filepath: Union[str
262
272
  dropping process are saved to the specified log directory.
263
273
 
264
274
  Args:
265
- input_filepath (str, Path):
275
+ input_filepath (str | Path):
266
276
  The path to the source CSV file to be cleaned.
267
- output_filepath (str, Path):
277
+ output_filepath (str | Path):
268
278
  The path to save the fully cleaned CSV file after cleaning
269
279
  and missing-data-based pruning.
270
- log_directory (str, Path):
280
+ log_directory (str | Path):
271
281
  Path to the directory where missing data reports will be stored.
272
282
  targets (list[str]):
273
283
  A list of column names to be treated as target variables.
@@ -279,6 +289,8 @@ def basic_clean_drop(input_filepath: Union[str,Path], output_filepath: Union[str
279
289
  The proportion of missing data required to drop a row or column.
280
290
  For example, 0.8 means a row/column will be dropped if 80% or more
281
291
  of its data is missing.
292
+ all_lowercase (bool):
293
+ Whether to normalize all text to lowercase.
282
294
  """
283
295
  # handle log path
284
296
  log_path = make_fullpath(log_directory, make=True, enforce="directory")
@@ -290,7 +302,7 @@ def basic_clean_drop(input_filepath: Union[str,Path], output_filepath: Union[str
290
302
  df, _ = load_dataframe(df_path=input_path, kind="polars", all_strings=True)
291
303
 
292
304
  # CLEAN
293
- df_cleaned = _cleaner_core(df)
305
+ df_cleaned = _cleaner_core(df_in=df, all_lowercase=all_lowercase)
294
306
 
295
307
  # switch to pandas
296
308
  df_cleaned_pandas = df_cleaned.to_pandas()
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "dragon-ml-toolbox"
3
- version = "10.13.0"
3
+ version = "10.14.0"
4
4
  description = "A collection of tools for data science and machine learning projects."
5
5
  authors = [
6
6
  { name = "Karl Loza", email = "luigiloza@gmail.com" }