dragon-ml-toolbox 10.1.1__py3-none-any.whl → 10.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


dragon_ml_toolbox-10.2.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 10.1.1
+Version: 10.2.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
dragon_ml_toolbox-10.2.0.dist-info/RECORD CHANGED
@@ -1,6 +1,6 @@
-dragon_ml_toolbox-10.1.1.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
-dragon_ml_toolbox-10.1.1.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837
-ml_tools/ETL_cleaning.py,sha256=i-hrafaAivg8wprcCmwHA5MkXFsUmHNR9RRGbIyw4ZE,15981
+dragon_ml_toolbox-10.2.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
+dragon_ml_toolbox-10.2.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837
+ml_tools/ETL_cleaning.py,sha256=-hxvnJYkGcBAR2eattOcgfGqPxM3TIORC6pCNvwDsf4,19113
 ml_tools/ETL_engineering.py,sha256=sgpIhlFIeId4eSJ-a33MnVuPNXs50msxFWa8-kw2hOI,36369
 ml_tools/GUI_tools.py,sha256=kEQWg-bog3pB5tI22gMGKWaCGHnz9TB2Lvvfhf5F2CI,45412
 ml_tools/MICE_imputation.py,sha256=kVSythWfxJFR4-2mtcYCWQaQ1Oz5yyx_SJu5gjnS7H8,11670
@@ -30,7 +30,7 @@ ml_tools/keys.py,sha256=HtPG8-MWh89C32A7eIlfuuA-DLwkxGkoDfwR2TGN9CQ,1074
 ml_tools/optimization_tools.py,sha256=P3I6lIpvZ8Xf2kX5FvvBKBmrK2pB6idBpkTzfUJxTeE,5073
 ml_tools/path_manager.py,sha256=TJgoqMAryc5F0dal8W_zvJgE1TpOzlskIyYJk614WW4,13809
 ml_tools/utilities.py,sha256=SVMaSDigh6SUoAeig2_sXLLIj5w5mUs5KuVWpHvFDec,19816
-dragon_ml_toolbox-10.1.1.dist-info/METADATA,sha256=wJ2byoP5azuIBrLRpUUQ96DkDAQuxVtgf2lFPafBUUQ,6968
-dragon_ml_toolbox-10.1.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-dragon_ml_toolbox-10.1.1.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
-dragon_ml_toolbox-10.1.1.dist-info/RECORD,,
+dragon_ml_toolbox-10.2.0.dist-info/METADATA,sha256=nJ-15xA7A7FgzYDRSi6xjhBmn32Fz57TEn2Wqg5hZRg,6968
+dragon_ml_toolbox-10.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dragon_ml_toolbox-10.2.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+dragon_ml_toolbox-10.2.0.dist-info/RECORD,,
ml_tools/ETL_cleaning.py CHANGED
@@ -3,6 +3,7 @@ import pandas as pd
 from pathlib import Path
 from typing import Union, List, Dict
 from .path_manager import sanitize_filename, make_fullpath
+from .data_exploration import drop_macro
 from .utilities import save_dataframe, load_dataframe
 from ._script_info import _script_info
 from ._logger import _LOGGER
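
The only new dependency this change introduces is `drop_macro` from `ml_tools.data_exploration`, to which the new `basic_clean_drop` (added below) delegates its pruning step. As a rough mental model only, here is a minimal pandas sketch of the iterative dropping loop that the `basic_clean_drop` docstring describes; the function name, the row-scoring rule, and the omission of report writing are assumptions inferred from the call site in this diff, not the package's actual implementation:

import pandas as pd

# Hypothetical re-implementation for illustration; the real drop_macro
# also writes missing-data reports to a log directory.
def drop_macro_sketch(df: pd.DataFrame, targets: list[str],
                      skip_targets: bool = False,
                      threshold: float = 0.8) -> pd.DataFrame:
    while True:
        n_rows, n_cols = df.shape
        # Drop rows whose non-target cells are mostly missing (assumed
        # reading of "targets guide the row-dropping logic").
        features = [c for c in df.columns if c not in targets]
        df = df.loc[df[features].isna().mean(axis=1) < threshold]
        # Drop columns at or above the threshold, optionally sparing targets.
        col_missing = df.isna().mean(axis=0)
        to_drop = [c for c, frac in col_missing.items()
                   if frac >= threshold and not (skip_targets and c in targets)]
        df = df.drop(columns=to_drop)
        # Repeat until a full pass removes nothing, so row and column
        # deletions that enable each other are fully resolved.
        if df.shape == (n_rows, n_cols):
            return df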
@@ -11,6 +12,7 @@ from ._logger import _LOGGER
 __all__ = [
     "save_unique_values",
     "basic_clean",
+    "basic_clean_drop",
     "ColumnCleaner",
     "DataFrameCleaner"
 ]
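
With `basic_clean_drop` added to `__all__`, both cleaners are part of the module's public surface, so downstream code can import them directly:

from ml_tools.ETL_cleaning import basic_clean, basic_clean_drop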
@@ -93,39 +95,8 @@ def save_unique_values(csv_path: Union[str, Path], output_dir: Union[str, Path],
     _LOGGER.info(f"{counter} files of unique values created.")
 
 
-########## Basic df cleaner #############
-def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path,None]=None):
-    """
-    Performs a comprehensive, standardized cleaning on all columns of a CSV file.
-
-    The cleaning process includes:
-    - Normalizing full-width and typographical punctuation to standard equivalents.
-    - Consolidating all internal whitespace (spaces, tabs, newlines) into a single space.
-    - Stripping any leading or trailing whitespace.
-    - Converting common textual representations of null (e.g., "N/A", "NULL") to true null values.
-    - Converting strings that become empty after cleaning into true null values.
-    - Normalizing all text to lowercase.
-
-    Args:
-        input_filepath (Union[str, Path]):
-            The path to the source CSV file to be cleaned.
-        output_filepath (Union[str, Path, None], optional):
-            The path to save the cleaned CSV file. If None (default),
-            the original input file will be overwritten.
-    """
-    # Handle paths
-    input_path = make_fullpath(input_filepath, enforce="file")
-
-    # Unless explicitly defined, overwrite file.
-    if output_filepath is not None:
-        parent_dir = make_fullpath(Path(output_filepath).parent, make=True, enforce="directory")
-        output_path = parent_dir / Path(output_filepath).name
-    else:
-        output_path = input_path
-
-    # load polars df
-    df, _ = load_dataframe(df_path=input_path, kind="polars", all_strings=True)
-
+########## Basic df cleaners #############
+def _cleaner_core(df_in: pl.DataFrame) -> pl.DataFrame:
     # Cleaning rules
     cleaning_rules = {
         # 1. Comprehensive Punctuation & Symbol Normalization
@@ -141,6 +112,7 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
         '⁶': '6', '⁷': '7', '⁸': '8', '⁹': '9', '⁰': '0',
         '₁': '1', '₂': '2', '₃': '3', '₄': '4', '₅': '5',
         '₆': '6', '₇': '7', '₈': '8', '₉': '9', '₀': '0',
+        '⁺': '', '⁻': '', '₊': '', '₋': '',
         # Uppercase Alphabet
         'Ａ': 'A', 'Ｂ': 'B', 'Ｃ': 'C', 'Ｄ': 'D', 'Ｅ': 'E', 'Ｆ': 'F',
         'Ｇ': 'G', 'Ｈ': 'H', 'Ｉ': 'I', 'Ｊ': 'J', 'Ｋ': 'K', 'Ｌ': 'L',
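
The four new mappings strip superscript and subscript signs instead of transliterating them, so ion notation collapses onto the digit mappings above. A quick self-contained check of the intended effect (a sketch using plain polars replacements rather than the package's ColumnCleaner):

import polars as pl

# Subset of the cleaning rules shown in this hunk (literal mappings).
rules = {'²': '2', '₄': '4', '⁺': '', '⁻': ''}
s = pl.Series(["Ca²⁺", "SO₄²⁻"])
for pattern, replacement in rules.items():
    s = s.str.replace_all(pattern, replacement, literal=True)
print(s.str.to_lowercase().to_list())  # ['ca2', 'so42']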
@@ -154,7 +126,7 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
         'ｓ': 's', 'ｔ': 't', 'ｕ': 'u', 'ｖ': 'v', 'ｗ': 'w', 'ｘ': 'x',
         'ｙ': 'y', 'ｚ': 'z',
         # Punctuation
-        '》': '>', '《': '<', '：': ':', '，': ',', '。': '.', '；': ';', '【': '[', '】': ']',
+        '》': '>', '《': '<', '：': ':', '，': ',', '。': '.', '；': ';', '【': '[', '】': ']',
         '（': '(', '）': ')', '？': '?', '！': '!', '～': '~', '＠': '@', '＃': '#', '＋': '+', '－': '-',
         '＄': '$', '％': '%', '＾': '^', '＆': '&', '＊': '*', '＼': '\\', '｜': '|', '、': ',', '≈': '=',
 
@@ -171,6 +143,7 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
         # Typographical standardization
         # Unify various dashes and hyphens to a standard hyphen-minus
         r'[—–―]': '-',
+        r'−': '-',
         # Unify various quote types to standard quotes
         r'[“”]': "'",
         r'[‘’′]': "'",
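
The standalone r'−' rule catches the Unicode minus sign (U+2212), which the dash character class on the line above does not include. A minimal check of the regex path (a sketch; in the package these rules run through ColumnCleaner):

import polars as pl

s = pl.Series(["−12.5", "3−4", "a—b"])
s = s.str.replace_all(r'[—–―]', '-').str.replace_all('−', '-')
print(s.to_list())  # ['-12.5', '3-4', 'a-b']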
@@ -184,33 +157,138 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
         r'^\s+|\s+$': '',
 
         # 4. Textual Null Standardization (New Step)
-        # Convert common null-like text to actual nulls. (?i) makes it case-insensitive.
+        # Convert common null-like text to actual nulls.
         r'^(N/A|无|NA|NULL|NONE|NIL|)$': None,
 
         # 5. Final Nullification of Empty Strings
         # After all cleaning, if a string is now empty, convert it to a null
-        r'^$': None
+        r'^\s*$': None,
+        r'^$': None,
     }
 
     # Clean data
     try:
         # Create a cleaner for every column in the dataframe
-        all_columns = df.columns
+        all_columns = df_in.columns
         column_cleaners = [
             ColumnCleaner(col, rules=cleaning_rules, case_insensitive=True) for col in all_columns
         ]
 
         # Instantiate and run the main dataframe cleaner
         df_cleaner = DataFrameCleaner(cleaners=column_cleaners)
-        df_cleaned = df_cleaner.clean(df, clone_df=False) # Use clone_df=False for efficiency
+        df_cleaned = df_cleaner.clean(df_in, clone_df=False) # Use clone_df=False for efficiency
 
         # apply lowercase to all string columns
         df_final = df_cleaned.with_columns(
             pl.col(pl.String).str.to_lowercase()
         )
     except Exception as e:
-        _LOGGER.error(f"An error occurred during the cleaning process for '{input_path.name}'.")
+        _LOGGER.error("An error occurred during the cleaning process.")
         raise e
+    else:
+        return df_final
+
+
+def _path_manager(path_in: Union[str,Path], path_out: Union[str,Path]):
+    # Handle paths
+    input_path = make_fullpath(path_in, enforce="file")
+
+    parent_dir = make_fullpath(Path(path_out).parent, make=True, enforce="directory")
+    output_path = parent_dir / Path(path_out).name
+
+    return input_path, output_path
+
+
+def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path]):
+    """
+    Performs a comprehensive, standardized cleaning on all columns of a CSV file.
+
+    The cleaning process includes:
+    - Normalizing full-width and typographical punctuation to standard equivalents.
+    - Consolidating all internal whitespace (spaces, tabs, newlines) into a single space.
+    - Stripping any leading or trailing whitespace.
+    - Converting common textual representations of null (e.g., "N/A", "NULL") to true null values.
+    - Converting strings that become empty after cleaning into true null values.
+    - Normalizing all text to lowercase.
+
+    Args:
+        input_filepath (Union[str, Path]):
+            The path to the source CSV file to be cleaned.
+        output_filepath (Union[str, Path]):
+            The path to save the cleaned CSV file.
+    """
+    # Handle paths
+    input_path, output_path = _path_manager(path_in=input_filepath, path_out=output_filepath)
+
+    # load polars df
+    df, _ = load_dataframe(df_path=input_path, kind="polars", all_strings=True)
+
+    # CLEAN
+    df_final = _cleaner_core(df)
+
+    # Save cleaned dataframe
+    save_dataframe(df=df_final, save_dir=output_path.parent, filename=output_path.name)
+
+    _LOGGER.info("Data successfully cleaned.")
+
+
+def basic_clean_drop(input_filepath: Union[str,Path], output_filepath: Union[str,Path], log_directory: Union[str,Path], targets: list[str],
+                     skip_targets: bool=False, threshold: float=0.8):
+    """
+    Performs standardized cleaning followed by iterative removal of rows and
+    columns with excessive missing data.
+
+    This function combines the functionality of `basic_clean` and `drop_macro`. It first
+    applies a comprehensive normalization process to all columns of the input CSV file,
+    ensuring consistent formatting and proper null-value handling. The cleaned data is then
+    converted to a pandas DataFrame, and iterative row and column dropping is applied
+    to remove redundant or incomplete data.
+
+    The iterative dropping cycle continues until no further rows or columns meet the
+    removal criteria, ensuring that dependencies between row and column deletions are
+    fully resolved. Logs documenting the missing-data profile before and after the
+    dropping process are saved to the specified log directory.
+
+    Args:
+        input_filepath (str, Path):
+            The path to the source CSV file to be cleaned.
+        output_filepath (str, Path):
+            The path to save the fully cleaned CSV file after cleaning
+            and missing-data-based pruning.
+        log_directory (str, Path):
+            Path to the directory where missing-data reports will be stored.
+        targets (list[str]):
+            A list of column names to be treated as target variables.
+            This list guides the row-dropping logic.
+        skip_targets (bool):
+            If True, the columns listed in `targets` are exempt from being dropped,
+            even if they exceed the missing-data threshold.
+        threshold (float):
+            The proportion of missing data required to drop a row or column.
+            For example, 0.8 means a row/column is dropped if 80% or more
+            of its data is missing.
+    """
+    # handle log path
+    log_path = make_fullpath(log_directory, make=True, enforce="directory")
+
+    # Handle df paths
+    input_path, output_path = _path_manager(path_in=input_filepath, path_out=output_filepath)
+
+    # load polars df
+    df, _ = load_dataframe(df_path=input_path, kind="polars", all_strings=True)
+
+    # CLEAN
+    df_cleaned = _cleaner_core(df)
+
+    # switch to pandas
+    df_cleaned_pandas = df_cleaned.to_pandas()
+
+    # Drop macro
+    df_final = drop_macro(df=df_cleaned_pandas,
+                          log_directory=log_path,
+                          targets=targets,
+                          skip_targets=skip_targets,
+                          threshold=threshold)
 
     # Save cleaned dataframe
     save_dataframe(df=df_final, save_dir=output_path.parent, filename=output_path.name)
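
Taken together, 10.2.0 keeps `basic_clean` as the pure normalization pass (its `output_filepath` is now required instead of defaulting to overwriting the input) and adds `basic_clean_drop` for normalization plus missing-data pruning. A hypothetical end-to-end usage, with made-up paths and column names:

from ml_tools.ETL_cleaning import basic_clean, basic_clean_drop

# Normalize punctuation, whitespace, nulls, and case:
basic_clean(input_filepath="data/raw.csv", output_filepath="data/clean.csv")

# Same normalization, then iteratively drop rows/columns that are
# 80% or more missing, writing missing-data reports to logs/:
basic_clean_drop(
    input_filepath="data/raw.csv",
    output_filepath="data/clean_pruned.csv",
    log_directory="logs",
    targets=["target"],  # hypothetical target column
    skip_targets=True,
    threshold=0.8,
)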