dragon-ml-toolbox 10.12.1__py3-none-any.whl → 10.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dragon_ml_toolbox-10.12.1.dist-info → dragon_ml_toolbox-10.14.0.dist-info}/METADATA +3 -28
- {dragon_ml_toolbox-10.12.1.dist-info → dragon_ml_toolbox-10.14.0.dist-info}/RECORD +8 -8
- {dragon_ml_toolbox-10.12.1.dist-info → dragon_ml_toolbox-10.14.0.dist-info}/licenses/LICENSE +1 -1
- {dragon_ml_toolbox-10.12.1.dist-info → dragon_ml_toolbox-10.14.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +1 -0
- ml_tools/ETL_cleaning.py +28 -16
- ml_tools/ensemble_inference.py +1 -1
- {dragon_ml_toolbox-10.12.1.dist-info → dragon_ml_toolbox-10.14.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-10.12.1.dist-info → dragon_ml_toolbox-10.14.0.dist-info}/top_level.txt +0 -0
{dragon_ml_toolbox-10.12.1.dist-info → dragon_ml_toolbox-10.14.0.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 10.12.1
+Version: 10.14.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: Karl Loza <luigiloza@gmail.com>
 License-Expression: MIT
@@ -12,12 +12,6 @@ Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
 License-File: LICENSE-THIRD-PARTY.md
-Provides-Extra: base
-Requires-Dist: pandas; extra == "base"
-Requires-Dist: numpy; extra == "base"
-Requires-Dist: polars; extra == "base"
-Requires-Dist: joblib; extra == "base"
-Requires-Dist: colorlog; extra == "base"
 Provides-Extra: ml
 Requires-Dist: numpy>=2.0; extra == "ml"
 Requires-Dist: pandas; extra == "ml"
@@ -38,6 +32,7 @@ Requires-Dist: shap; extra == "ml"
 Requires-Dist: tqdm; extra == "ml"
 Requires-Dist: Pillow; extra == "ml"
 Requires-Dist: evotorch; extra == "ml"
+Requires-Dist: pyarrow; extra == "ml"
 Requires-Dist: colorlog; extra == "ml"
 Provides-Extra: mice
 Requires-Dist: numpy<2.0; extra == "mice"
@@ -51,6 +46,7 @@ Requires-Dist: statsmodels; extra == "mice"
 Requires-Dist: lightgbm<=4.5.0; extra == "mice"
 Requires-Dist: shap; extra == "mice"
 Requires-Dist: colorlog; extra == "mice"
+Requires-Dist: pyarrow; extra == "mice"
 Provides-Extra: pytorch
 Requires-Dist: torch; extra == "pytorch"
 Requires-Dist: torchvision; extra == "pytorch"
@@ -255,27 +251,6 @@ path_manager
 
 ---
 
-### 🎫 Base Tools [base]
-
-General purpose functions and classes.
-
-```Bash
-pip install "dragon-ml-toolbox[base]"
-```
-
-#### Modules:
-
-```Bash
-ETL_cleaning
-ETL_engineering
-custom_logger
-SQL
-utilities
-path_manager
-```
-
----
-
 ### ⚒️ APP bundlers
 
 Choose one if needed.
{dragon_ml_toolbox-10.12.1.dist-info → dragon_ml_toolbox-10.14.0.dist-info}/RECORD
CHANGED

@@ -1,6 +1,6 @@
-dragon_ml_toolbox-10.
-dragon_ml_toolbox-10.
-ml_tools/ETL_cleaning.py,sha256=
+dragon_ml_toolbox-10.14.0.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
+dragon_ml_toolbox-10.14.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=iy2r_R7wjzsCbz_Q_jMsp_jfZ6oP8XW9QhwzRBH0mGY,1904
+ml_tools/ETL_cleaning.py,sha256=ECR3UwRMovifvDkVCyqmGDGlVhWst2eJS821NsRWny8,19851
 ml_tools/ETL_engineering.py,sha256=a6KCWH6kRatZtjaFEF_o917ApPMK5_vRD-BjfCDAl-E,49400
 ml_tools/GUI_tools.py,sha256=kEQWg-bog3pB5tI22gMGKWaCGHnz9TB2Lvvfhf5F2CI,45412
 ml_tools/MICE_imputation.py,sha256=kVSythWfxJFR4-2mtcYCWQaQ1Oz5yyx_SJu5gjnS7H8,11670
@@ -23,14 +23,14 @@ ml_tools/_script_info.py,sha256=21r83LV3RubsNZ_RTEUON6RbDf7Mh4_udweNcvdF_Fk,212
 ml_tools/custom_logger.py,sha256=ry43hk54K6xKo8jRAgq1sFxUpOA9T0LIJ7sw0so2BW0,5880
 ml_tools/data_exploration.py,sha256=-aTi5jmv4AepPgi2k_85qEJsSLx5zPOtTbhorqzUvGQ,38542
 ml_tools/ensemble_evaluation.py,sha256=FGHSe8LBI8_w8LjNeJWOcYQ1UK_mc6fVah8gmSvNVGg,26853
-ml_tools/ensemble_inference.py,sha256=
+ml_tools/ensemble_inference.py,sha256=Hun_ipIZaaLrHxSo63J6NKS_O1fMWi_6HkuSHs4RywI,9349
 ml_tools/ensemble_learning.py,sha256=3s0kH4i_naj0IVl_T4knst-Hwg4TScWjEdsXX5KAi7I,21929
 ml_tools/handle_excel.py,sha256=He4UT15sCGhaG-JKfs7uYVAubxWjrqgJ6U7OhMR2fuE,14005
 ml_tools/keys.py,sha256=FDpbS3Jb0pjrVvvp2_8nZi919mbob_-xwuy5OOtKM_A,1848
 ml_tools/optimization_tools.py,sha256=P3I6lIpvZ8Xf2kX5FvvBKBmrK2pB6idBpkTzfUJxTeE,5073
 ml_tools/path_manager.py,sha256=ke0MYOhYheRPX599GUbrvRsYHn2JKUmMDldS5LP6LQA,18431
 ml_tools/utilities.py,sha256=uheMUjQJ1zI69gASsE-mCq4KlRPVGgrgqson02rGNYM,30755
-dragon_ml_toolbox-10.
-dragon_ml_toolbox-10.
-dragon_ml_toolbox-10.
-dragon_ml_toolbox-10.
+dragon_ml_toolbox-10.14.0.dist-info/METADATA,sha256=s9xXp4uPjlB65ieGCO25tkUH7D8l81dAYGO_WYiooY0,6608
+dragon_ml_toolbox-10.14.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dragon_ml_toolbox-10.14.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+dragon_ml_toolbox-10.14.0.dist-info/RECORD,,
{dragon_ml_toolbox-10.12.1.dist-info → dragon_ml_toolbox-10.14.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md
CHANGED

@@ -26,3 +26,4 @@ This project depends on the following third-party packages. Each is governed by
 - [polars](https://github.com/pola-rs/polars/blob/main/LICENSE)
 - [plotnine](https://github.com/has2k1/plotnine/blob/main/LICENSE)
 - [tqdm](https://github.com/tqdm/tqdm/blob/master/LICENSE)
+- [pyarrow](https://github.com/apache/arrow/blob/main/LICENSE.txt)
ml_tools/ETL_cleaning.py
CHANGED

@@ -96,7 +96,7 @@ def save_unique_values(csv_path: Union[str, Path], output_dir: Union[str, Path],
 
 
 ########## Basic df cleaners #############
-def _cleaner_core(df_in: pl.DataFrame) -> pl.DataFrame:
+def _cleaner_core(df_in: pl.DataFrame, all_lowercase: bool) -> pl.DataFrame:
     # Cleaning rules
     cleaning_rules = {
         # 1. Comprehensive Punctuation & Symbol Normalization
@@ -128,7 +128,7 @@ def _cleaner_core(df_in: pl.DataFrame) -> pl.DataFrame:
         # Punctuation
         '》': '>', '《': '<', ':': ':', '。': '.', ';': ';', '【': '[', '】': ']',
         '(': '(', ')': ')', '?': '?', '!': '!', '~': '~', '@': '@', '#': '#', '+': '+', '-': '-',
-        '$': '$', '%': '%', '^': '^', '&': '&', '*': '*', '\': '-', '|': '|', '≈':'=',
+        '$': '$', '%': '%', '^': '^', '&': '&', '*': '*', '\': '-', '|': '|', '≈':'=', '·': '-',
 
         # Commas (avoid commas in entries)
         ',': ';',
@@ -159,6 +159,9 @@ def _cleaner_core(df_in: pl.DataFrame) -> pl.DataFrame:
         r'!{2,}': '!', # Replace two or more exclamation marks with a single one
         r';{2,}': ';',
         r'-{2,}': '-',
+        r'/{2,}': '/',
+        r'%{2,}': '%',
+        r'&{2,}': '&',
 
         # 2. Internal Whitespace Consolidation
         # Collapse any sequence of whitespace chars (including non-breaking spaces) to a single space
@@ -170,7 +173,7 @@ def _cleaner_core(df_in: pl.DataFrame) -> pl.DataFrame:
 
         # 4. Textual Null Standardization (New Step)
         # Convert common null-like text to actual nulls.
-        r'^(N/A|无|NA|NULL|NONE|NIL
+        r'^(N/A|无|NA|NULL|NONE|NIL|-|\.|;|/|%|&)$': None,
 
         # 5. Final Nullification of Empty Strings
         # After all cleaning, if a string is now empty, convert it to a null
@@ -191,9 +194,13 @@ def _cleaner_core(df_in: pl.DataFrame) -> pl.DataFrame:
         df_cleaned = df_cleaner.clean(df_in, clone_df=False) # Use clone_df=False for efficiency
 
         # apply lowercase to all string columns
-        df_final = df_cleaned.with_columns(
-            pl.col(pl.String).str.to_lowercase()
-        )
+        if all_lowercase:
+            df_final = df_cleaned.with_columns(
+                pl.col(pl.String).str.to_lowercase()
+            )
+        else:
+            df_final = df_cleaned
+
     except Exception as e:
         _LOGGER.error(f"An error occurred during the cleaning process.")
         raise e
@@ -211,7 +218,7 @@ def _path_manager(path_in: Union[str,Path], path_out: Union[str,Path]):
     return input_path, output_path
 
 
-def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path]):
+def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path], all_lowercase: bool=True):
     """
     Performs a comprehensive, standardized cleaning on all columns of a CSV file.
 
@@ -221,13 +228,16 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
     - Stripping any leading or trailing whitespace.
     - Converting common textual representations of null (e.g., "N/A", "NULL") to true null values.
     - Converting strings that become empty after cleaning into true null values.
-    - Normalizing all text to lowercase.
+    - Normalizing all text to lowercase (Optional).
 
     Args:
-        input_filepath (
+        input_filepath (str | Path):
             The path to the source CSV file to be cleaned.
-        output_filepath (
+        output_filepath (str | Path):
            The path to save the cleaned CSV file.
+        all_lowercase (bool):
+            Whether to normalize all text to lowercase.
+
     """
     # Handle paths
    input_path, output_path = _path_manager(path_in=input_filepath, path_out=output_filepath)
@@ -236,7 +246,7 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
     df, _ = load_dataframe(df_path=input_path, kind="polars", all_strings=True)
 
     # CLEAN
-    df_final = _cleaner_core(df)
+    df_final = _cleaner_core(df_in=df, all_lowercase=all_lowercase)
 
     # Save cleaned dataframe
     save_dataframe(df=df_final, save_dir=output_path.parent, filename=output_path.name)
@@ -245,7 +255,7 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
 
 
 def basic_clean_drop(input_filepath: Union[str,Path], output_filepath: Union[str,Path], log_directory: Union[str,Path], targets: list[str],
-                     skip_targets: bool=False, threshold: float=0.8):
+                     skip_targets: bool=False, threshold: float=0.8, all_lowercase: bool=True):
     """
     Performs standardized cleaning followed by iterative removal of rows and
     columns with excessive missing data.
@@ -262,12 +272,12 @@ def basic_clean_drop(input_filepath: Union[str,Path], output_filepath: Union[str
     dropping process are saved to the specified log directory.
 
     Args:
-        input_filepath (str
+        input_filepath (str | Path):
             The path to the source CSV file to be cleaned.
-        output_filepath (str
+        output_filepath (str | Path):
             The path to save the fully cleaned CSV file after cleaning
             and missing-data-based pruning.
-        log_directory (str
+        log_directory (str | Path):
             Path to the directory where missing data reports will be stored.
         targets (list[str]):
             A list of column names to be treated as target variables.
@@ -279,6 +289,8 @@ def basic_clean_drop(input_filepath: Union[str,Path], output_filepath: Union[str
             The proportion of missing data required to drop a row or column.
             For example, 0.8 means a row/column will be dropped if 80% or more
             of its data is missing.
+        all_lowercase (bool):
+            Whether to normalize all text to lowercase.
     """
     # handle log path
     log_path = make_fullpath(log_directory, make=True, enforce="directory")
@@ -290,7 +302,7 @@ def basic_clean_drop(input_filepath: Union[str,Path], output_filepath: Union[str
     df, _ = load_dataframe(df_path=input_path, kind="polars", all_strings=True)
 
     # CLEAN
-    df_cleaned = _cleaner_core(df)
+    df_cleaned = _cleaner_core(df_in=df, all_lowercase=all_lowercase)
 
     # switch to pandas
     df_cleaned_pandas = df_cleaned.to_pandas()
ml_tools/ensemble_inference.py
CHANGED

@@ -219,7 +219,7 @@ def model_report(
     return report_data
 
 
-# Local implementation to avoid calling utilities
+# Local implementation to avoid calling utilities dependencies
 def _deserialize_object(filepath: Union[str,Path], verbose: bool=True, raise_on_error: bool=True) -> Optional[Any]:
     """
     Loads a serialized object from a .joblib file.
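
The comment change above refers to a private joblib loader whose body is not part of this diff. Purely as an illustrative sketch (not the package's actual implementation), a helper matching the signature and docstring shown could look like the following; the plain `print` calls stand in for whatever logger the module really uses:

```python
from pathlib import Path
from typing import Any, Optional, Union

import joblib


def _deserialize_object(filepath: Union[str, Path], verbose: bool = True,
                        raise_on_error: bool = True) -> Optional[Any]:
    """Loads a serialized object from a .joblib file (illustrative sketch only)."""
    path = Path(filepath)
    try:
        obj = joblib.load(path)  # joblib performs the actual deserialization
    except Exception as exc:
        if raise_on_error:
            raise
        if verbose:
            print(f"Failed to deserialize '{path}': {exc}")
        return None
    if verbose:
        print(f"Loaded object of type {type(obj).__name__} from '{path}'.")
    return obj
```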
{dragon_ml_toolbox-10.12.1.dist-info → dragon_ml_toolbox-10.14.0.dist-info}/WHEEL
File without changes

{dragon_ml_toolbox-10.12.1.dist-info → dragon_ml_toolbox-10.14.0.dist-info}/top_level.txt
File without changes