dragon-ml-toolbox 10.1.1__py3-none-any.whl → 14.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dragon-ml-toolbox might be problematic.

Files changed (48)
  1. {dragon_ml_toolbox-10.1.1.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/METADATA +38 -63
  2. dragon_ml_toolbox-14.2.0.dist-info/RECORD +48 -0
  3. {dragon_ml_toolbox-10.1.1.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/licenses/LICENSE +1 -1
  4. {dragon_ml_toolbox-10.1.1.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +11 -0
  5. ml_tools/ETL_cleaning.py +175 -59
  6. ml_tools/ETL_engineering.py +506 -70
  7. ml_tools/GUI_tools.py +2 -1
  8. ml_tools/MICE_imputation.py +212 -7
  9. ml_tools/ML_callbacks.py +73 -40
  10. ml_tools/ML_datasetmaster.py +267 -284
  11. ml_tools/ML_evaluation.py +119 -58
  12. ml_tools/ML_evaluation_multi.py +107 -32
  13. ml_tools/ML_inference.py +15 -5
  14. ml_tools/ML_models.py +234 -170
  15. ml_tools/ML_models_advanced.py +323 -0
  16. ml_tools/ML_optimization.py +321 -97
  17. ml_tools/ML_scaler.py +10 -5
  18. ml_tools/ML_trainer.py +585 -40
  19. ml_tools/ML_utilities.py +528 -0
  20. ml_tools/ML_vision_datasetmaster.py +1315 -0
  21. ml_tools/ML_vision_evaluation.py +260 -0
  22. ml_tools/ML_vision_inference.py +428 -0
  23. ml_tools/ML_vision_models.py +627 -0
  24. ml_tools/ML_vision_transformers.py +58 -0
  25. ml_tools/PSO_optimization.py +10 -7
  26. ml_tools/RNN_forecast.py +2 -0
  27. ml_tools/SQL.py +22 -9
  28. ml_tools/VIF_factor.py +4 -3
  29. ml_tools/_ML_vision_recipe.py +88 -0
  30. ml_tools/__init__.py +1 -0
  31. ml_tools/_logger.py +0 -2
  32. ml_tools/_schema.py +96 -0
  33. ml_tools/constants.py +79 -0
  34. ml_tools/custom_logger.py +164 -16
  35. ml_tools/data_exploration.py +1092 -109
  36. ml_tools/ensemble_evaluation.py +48 -1
  37. ml_tools/ensemble_inference.py +6 -7
  38. ml_tools/ensemble_learning.py +4 -3
  39. ml_tools/handle_excel.py +1 -0
  40. ml_tools/keys.py +80 -0
  41. ml_tools/math_utilities.py +259 -0
  42. ml_tools/optimization_tools.py +198 -24
  43. ml_tools/path_manager.py +144 -45
  44. ml_tools/serde.py +192 -0
  45. ml_tools/utilities.py +287 -227
  46. dragon_ml_toolbox-10.1.1.dist-info/RECORD +0 -36
  47. {dragon_ml_toolbox-10.1.1.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/WHEEL +0 -0
  48. {dragon_ml_toolbox-10.1.1.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/top_level.txt +0 -0
{dragon_ml_toolbox-10.1.1.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/METADATA CHANGED
@@ -1,23 +1,17 @@
  Metadata-Version: 2.4
  Name: dragon-ml-toolbox
- Version: 10.1.1
+ Version: 14.2.0
  Summary: A collection of tools for data science and machine learning projects.
- Author-email: Karl Loza <luigiloza@gmail.com>
+ Author-email: "Karl L. Loza Vidaurre" <luigiloza@gmail.com>
  License-Expression: MIT
  Project-URL: Homepage, https://github.com/DrAg0n-BoRn/ML_tools
  Project-URL: Changelog, https://github.com/DrAg0n-BoRn/ML_tools/blob/master/CHANGELOG.md
  Classifier: Programming Language :: Python :: 3
  Classifier: Operating System :: OS Independent
- Requires-Python: >=3.10
+ Requires-Python: >=3.12
  Description-Content-Type: text/markdown
  License-File: LICENSE
  License-File: LICENSE-THIRD-PARTY.md
- Provides-Extra: base
- Requires-Dist: pandas; extra == "base"
- Requires-Dist: numpy; extra == "base"
- Requires-Dist: polars; extra == "base"
- Requires-Dist: joblib; extra == "base"
- Requires-Dist: colorlog; extra == "base"
  Provides-Extra: ml
  Requires-Dist: numpy>=2.0; extra == "ml"
  Requires-Dist: pandas; extra == "ml"
@@ -38,7 +32,12 @@ Requires-Dist: shap; extra == "ml"
  Requires-Dist: tqdm; extra == "ml"
  Requires-Dist: Pillow; extra == "ml"
  Requires-Dist: evotorch; extra == "ml"
+ Requires-Dist: pyarrow; extra == "ml"
  Requires-Dist: colorlog; extra == "ml"
+ Requires-Dist: torchmetrics; extra == "ml"
+ Provides-Extra: py-tab
+ Requires-Dist: pytorch_tabular; extra == "py-tab"
+ Requires-Dist: omegaconf; extra == "py-tab"
  Provides-Extra: mice
  Requires-Dist: numpy<2.0; extra == "mice"
  Requires-Dist: pandas; extra == "mice"
@@ -51,9 +50,7 @@ Requires-Dist: statsmodels; extra == "mice"
  Requires-Dist: lightgbm<=4.5.0; extra == "mice"
  Requires-Dist: shap; extra == "mice"
  Requires-Dist: colorlog; extra == "mice"
- Provides-Extra: pytorch
- Requires-Dist: torch; extra == "pytorch"
- Requires-Dist: torchvision; extra == "pytorch"
+ Requires-Dist: pyarrow; extra == "mice"
  Provides-Extra: excel
  Requires-Dist: pandas; extra == "excel"
  Requires-Dist: openpyxl; extra == "excel"
@@ -72,9 +69,6 @@ Requires-Dist: lightgbm; extra == "gui-boost"
  Provides-Extra: gui-torch
  Requires-Dist: numpy; extra == "gui-torch"
  Requires-Dist: FreeSimpleGUI>=5.2; extra == "gui-torch"
- Provides-Extra: plot
- Requires-Dist: matplotlib; extra == "plot"
- Requires-Dist: seaborn; extra == "plot"
  Provides-Extra: pyinstaller
  Requires-Dist: pyinstaller; extra == "pyinstaller"
  Provides-Extra: nuitka
@@ -94,7 +88,7 @@ A collection of Python utilities for data science and machine learning, structur

  ## Installation

- **Python 3.10+**
+ **Python 3.12**

  ### Via PyPI

@@ -104,22 +98,22 @@ Install the latest stable release from PyPI:
  pip install dragon-ml-toolbox
  ```

- ### Via GitHub (Editable)
+ ### Via conda-forge

- Clone the repository and install in editable mode with optional dependencies:
+ Install from the conda-forge channel:

  ```bash
- git clone https://github.com/DrAg0n-BoRn/ML_tools.git
- cd ML_tools
- pip install -e .
+ conda install -c conda-forge dragon-ml-toolbox
  ```

- ### Via conda-forge
+ ### Via GitHub (Editable)

- Install from the conda-forge channel:
+ Clone the repository and install in editable mode:

  ```bash
- conda install -c conda-forge dragon-ml-toolbox
+ git clone https://github.com/DrAg0n-BoRn/ML_tools.git
+ cd ML_tools
+ pip install -e .
  ```

  ## Modular Installation
@@ -132,17 +126,12 @@ Installs a comprehensive set of tools for typical data science workflows, includ
  pip install "dragon-ml-toolbox[ML]"
  ```

- To install the standard CPU-only versions of Torch and Torchvision:
-
- ```Bash
- pip install "dragon-ml-toolbox[pytorch]"
- ```
-
- ⚠️ To make use of GPU acceleration (highly recommended), follow the official instructions: [PyTorch website](https://pytorch.org/get-started/locally/)
+ ⚠️ PyTorch required, follow the official instructions: [PyTorch website](https://pytorch.org/get-started/locally/)

  #### Modules:

  ```bash
+ constants
  custom_logger
  data_exploration
  ensemble_evaluation
@@ -150,19 +139,28 @@ ensemble_inference
  ensemble_learning
  ETL_cleaning
  ETL_engineering
+ math_utilities
  ML_callbacks
  ML_datasetmaster
  ML_evaluation_multi
  ML_evaluation
  ML_inference
  ML_models
+ ML_models_advanced # Requires the extra flag [py-tab]
  ML_optimization
  ML_scaler
  ML_trainer
+ ML_utilities
+ ML_vision_datasetmaster
+ ML_vision_evaluation
+ ML_vision_inference
+ ML_vision_models
+ ML_vision_transformers
  optimization_tools
  path_manager
  PSO_optimization
  RNN_forecast
+ serde
  SQL
  utilities
  ```
@@ -180,8 +178,11 @@ pip install "dragon-ml-toolbox[mice]"
  #### Modules:

  ```Bash
+ constants
  custom_logger
+ math_utilities
  MICE_imputation
+ serde
  VIF_factor
  path_manager
  utilities
@@ -209,42 +210,37 @@ path_manager

  ### 🎰 GUI for Boosting Algorithms (XGBoost, LightGBM) [gui-boost]

- For GUIs that include plotting functionality, you must also install the [plot] extra.
+ GUI tools compatible with XGBoost and LightGBM models used for inference.

  ```Bash
  pip install "dragon-ml-toolbox[gui-boost]"
  ```

- ```Bash
- pip install "dragon-ml-toolbox[gui-boost,plot]"
- ```
-
  #### Modules:

  ```Bash
+ constants
  custom_logger
  GUI_tools
  ensemble_inference
  path_manager
+ serde
  ```

  ---

  ### 🤖 GUI for PyTorch Models [gui-torch]

- For GUIs that include plotting functionality, you must also install the [plot] extra.
+ GUI tools compatible with PyTorch models used for inference.

  ```Bash
  pip install "dragon-ml-toolbox[gui-torch]"
  ```

- ```Bash
- pip install "dragon-ml-toolbox[gui-torch,plot]"
- ```
-
  #### Modules:

  ```Bash
+ constants
  custom_logger
  GUI_tools
  ML_models
@@ -255,27 +251,6 @@ path_manager

  ---

- ### 🎫 Base Tools [base]
-
- General purpose functions and classes.
-
- ```Bash
- pip install "dragon-ml-toolbox[base]"
- ```
-
- #### Modules:
-
- ```Bash
- ETL_cleaning
- ETL_engineering
- custom_logger
- SQL
- utilities
- path_manager
- ```
-
- ---
-
  ### ⚒️ APP bundlers

  Choose one if needed.
@@ -293,6 +268,6 @@ pip install "dragon-ml-toolbox[nuitka]"
  After installation, import modules like this:

  ```python
- from ml_tools.utilities import serialize_object, deserialize_object
+ from ml_tools.serde import serialize_object, deserialize_object
  from ml_tools import custom_logger
  ```
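
For downstream code, the most visible breaking change in this metadata is the relocation of the serialization helpers out of `ml_tools.utilities` into the new `serde` module. A minimal migration sketch; only the import lines themselves are taken from this diff:

```python
# dragon-ml-toolbox 10.1.1 (removed in 14.x):
# from ml_tools.utilities import serialize_object, deserialize_object

# dragon-ml-toolbox 14.2.0:
from ml_tools.serde import serialize_object, deserialize_object
from ml_tools import custom_logger
```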
dragon_ml_toolbox-14.2.0.dist-info/RECORD ADDED
@@ -0,0 +1,48 @@
+ dragon_ml_toolbox-14.2.0.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
+ dragon_ml_toolbox-14.2.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=gkOdNDbKYpIJezwSo2CEnISkLeYfYHv9t8b5K2-P69A,2687
+ ml_tools/ETL_cleaning.py,sha256=2VBRllV8F-ZiPylPp8Az2gwn5ztgazN0BH5OKnRUhV0,20402
+ ml_tools/ETL_engineering.py,sha256=KfYqgsxupAx6e_TxwO1LZXeu5mFkIhVXJrNjP3CzIZc,54927
+ ml_tools/GUI_tools.py,sha256=Va6ig-dHULPVRwQYYtH3fvY5XPIoqRcJpRW8oXC55Hw,45413
+ ml_tools/MICE_imputation.py,sha256=KLJXGQLKJ6AuWWttAG-LCCaxpS-ygM4dXPiguHDaL6Y,20815
+ ml_tools/ML_callbacks.py,sha256=elD2Yr030sv_6gX_m9GVd6HTyrbmt34nFS8lrgS4HtM,15808
+ ml_tools/ML_datasetmaster.py,sha256=rsJgZEGBJmfeKF6cR8CQZzfEx4T7Y-p1wUnR15_nNw0,28400
+ ml_tools/ML_evaluation.py,sha256=4GU86rUWMIGbkXrvN6PyjfGwKtWvXKE7pMlWpWeBq14,18988
+ ml_tools/ML_evaluation_multi.py,sha256=rJKdgtq-9I7oaI7PRzq7aIZ84XdNV0xzlVePZW4nj0k,16095
+ ml_tools/ML_inference.py,sha256=YJ953bhNWsdlPRtJQh3h2ACfMIgp8dQ9KtL9Azar-5s,23489
+ ml_tools/ML_models.py,sha256=PqOcNlws7vCJMbiVCKqlPuktxvskZVUHG3VfU-Yshf8,31415
+ ml_tools/ML_models_advanced.py,sha256=vk3PZBSu3DVso2S1rKTxxdS43XG8Q5FnasIL3-rMajc,12410
+ ml_tools/ML_optimization.py,sha256=P0zkhKAwTpkorIBtR0AOIDcyexo5ngmvFUzo3DfNO-E,22692
+ ml_tools/ML_scaler.py,sha256=tw6onj9o8_kk3FQYb930HUzvv1zsFZe2YZJdF3LtHkU,7538
+ ml_tools/ML_trainer.py,sha256=ZWI4MbUcLeBxyfoUTL96l5tjHHMp9I64h4SdXnjYmBE,49795
+ ml_tools/ML_utilities.py,sha256=z6LbpbZwhn8F__fWlKi-g-cAJQXSxwg1NHfC5FBoAyc,21139
+ ml_tools/ML_vision_datasetmaster.py,sha256=tOrdatuq_AP8-GDiTrtARvSJdpc8h7dT-OhDJtRQnsk,54433
+ ml_tools/ML_vision_evaluation.py,sha256=t12R7i1RkOCt9zu1_lxSBr8OH6A6Get0k8ftDLctn6I,10486
+ ml_tools/ML_vision_inference.py,sha256=He3KV3VJAm8PwO-fOq4b9VO8UXFr-GmpuCnoHXf4VZI,20588
+ ml_tools/ML_vision_models.py,sha256=G3S4jB9AE9wMpU9ZygOgOx9q1K6t6LAXBYcJ-U2XQ1M,25600
+ ml_tools/ML_vision_transformers.py,sha256=95e0aBkHY5VDGE8i5xy57COU7NvSNIgFknnhBubwE40,1832
+ ml_tools/PSO_optimization.py,sha256=T-HWHMRJUnPvPwixdU5jif3_rnnI36TzcL8u3oSCwuA,22960
+ ml_tools/RNN_forecast.py,sha256=Qa2KoZfdAvSjZ4yE78N4BFXtr3tTr0Gx7tQJZPotsh0,1967
+ ml_tools/SQL.py,sha256=vXLPGfVVg8bfkbBE3HVfyEclVbdJy0TBhuQONtMwSCQ,11234
+ ml_tools/VIF_factor.py,sha256=at5IVqPvicja2-DNSTSIIy3SkzDWCmLzo3qTG_qr5n8,10422
+ ml_tools/_ML_vision_recipe.py,sha256=zrgxFUvTJqQVuwR7jWlbIC2FD29u6eNFPkTRoJ7yEZI,3178
+ ml_tools/__init__.py,sha256=kJiankjz9_qXu7gU92mYqYg_anLvt-B6RtW0mMH8uGo,76
+ ml_tools/_logger.py,sha256=dlp5cGbzooK9YSNSZYB4yjZrOaQUGW8PTrM411AOvL8,4717
+ ml_tools/_schema.py,sha256=yu6aWmn_2Z4_AxAtJGDDCIa96y6JcUp-vgnCS013Qmw,3908
+ ml_tools/_script_info.py,sha256=21r83LV3RubsNZ_RTEUON6RbDf7Mh4_udweNcvdF_Fk,212
+ ml_tools/constants.py,sha256=3br5Rk9cL2IUo638eJuMOGdbGQaWssaUecYEvSeRBLM,3322
+ ml_tools/custom_logger.py,sha256=TGc0Ww2Xlqj2XE3q4bP43hV7T3qnb5ci9f0pYHXF5TY,11226
+ ml_tools/data_exploration.py,sha256=bwHzFJ-IAo5GN3T53F-1J_pXUg8VHS91sG_90utAsfg,69911
+ ml_tools/ensemble_evaluation.py,sha256=FGHSe8LBI8_w8LjNeJWOcYQ1UK_mc6fVah8gmSvNVGg,26853
+ ml_tools/ensemble_inference.py,sha256=0yLmLNj45RVVoSCLH1ZYJG9IoAhTkWUqEZmLOQTFGTY,9348
+ ml_tools/ensemble_learning.py,sha256=vsIED7nlheYI4w2SBzP6SC1AnNeMfn-2A1Gqw5EfxsM,21964
+ ml_tools/handle_excel.py,sha256=pfdAPb9ywegFkM9T54bRssDOsX-K7rSeV0RaMz7lEAo,14006
+ ml_tools/keys.py,sha256=wZOBuEnnHc54vlOZiimnrxfk-sZh6f6suPppJW8rbPQ,3326
+ ml_tools/math_utilities.py,sha256=xeKq1quR_3DYLgowcp4Uam_4s3JltUyOnqMOGuAiYWU,8802
+ ml_tools/optimization_tools.py,sha256=TYFQ2nSnp7xxs-VyoZISWgnGJghFbsWasHjruegyJRs,12763
+ ml_tools/path_manager.py,sha256=CyDU16pOKmC82jPubqJPT6EBt-u-3rGVbxyPIZCvDDY,18432
+ ml_tools/serde.py,sha256=c8uDYjYry_VrLvoG4ixqDj5pij88lVn6Tu4NHcPkwDU,6943
+ ml_tools/utilities.py,sha256=aWqvYzmxlD74PD5Yqu1VuTekDJeYLQrmPIU_VeVyRp0,22526
+ dragon_ml_toolbox-14.2.0.dist-info/METADATA,sha256=T0eIxD-eO3cbAIzJ1HskJbog6RUYgXwXQQ2OU8Z-GQM,6475
+ dragon_ml_toolbox-14.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ dragon_ml_toolbox-14.2.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+ dragon_ml_toolbox-14.2.0.dist-info/RECORD,,
{dragon_ml_toolbox-10.1.1.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/licenses/LICENSE CHANGED
@@ -1,6 +1,6 @@
  MIT License

- Copyright (c) 2025 Karl Loza
+ Copyright (c) 2025 Karl Luigi Loza Vidaurre

  Permission is hereby granted, free of charge, to any person obtaining a copy
  of this software and associated documentation files (the "Software"), to deal
{dragon_ml_toolbox-10.1.1.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md CHANGED
@@ -26,3 +26,14 @@ This project depends on the following third-party packages. Each is governed by
  - [polars](https://github.com/pola-rs/polars/blob/main/LICENSE)
  - [plotnine](https://github.com/has2k1/plotnine/blob/main/LICENSE)
  - [tqdm](https://github.com/tqdm/tqdm/blob/master/LICENSE)
+ - [pyarrow](https://github.com/apache/arrow/blob/main/LICENSE.txt)
+ - [colorlog](https://github.com/borntyping/python-colorlog/blob/main/LICENSE)
+ - [evotorch](https://github.com/nnaisense/evotorch/blob/master/LICENSE)
+ - [FreeSimpleGUI](https://github.com/spyoungtech/FreeSimpleGUI/blob/main/license.txt)
+ - [nuitka](https://github.com/Nuitka/Nuitka/blob/main/LICENSE.txt)
+ - [omegaconf](https://github.com/omry/omegaconf/blob/master/LICENSE)
+ - [ordered-set](https://github.com/rspeer/ordered-set/blob/master/MIT-LICENSE)
+ - [pyinstaller](https://github.com/pyinstaller/pyinstaller/blob/develop/COPYING.txt)
+ - [pytorch_tabular](https://github.com/manujosephv/pytorch_tabular/blob/main/LICENSE)
+ - [torchmetrics](https://github.com/Lightning-AI/torchmetrics/blob/master/LICENSE)
+ - [zstandard](https://github.com/indygreg/python-zstandard/blob/main/LICENSE)
ml_tools/ETL_cleaning.py CHANGED
@@ -2,8 +2,10 @@ import polars as pl
  import pandas as pd
  from pathlib import Path
  from typing import Union, List, Dict
+
  from .path_manager import sanitize_filename, make_fullpath
- from .utilities import save_dataframe, load_dataframe
+ from .data_exploration import drop_macro
+ from .utilities import save_dataframe_filename, load_dataframe
  from ._script_info import _script_info
  from ._logger import _LOGGER

@@ -11,26 +13,33 @@ from ._logger import _LOGGER
  __all__ = [
      "save_unique_values",
      "basic_clean",
+     "basic_clean_drop",
      "ColumnCleaner",
      "DataFrameCleaner"
  ]


  ################ Unique Values per column #################
- def save_unique_values(csv_path: Union[str, Path], output_dir: Union[str, Path], verbose: bool=False) -> None:
+ def save_unique_values(csv_path: Union[str, Path],
+                        output_dir: Union[str, Path],
+                        verbose: bool=False,
+                        keep_column_order: bool = True) -> None:
      """
      Loads a CSV file, then analyzes it and saves the unique non-null values
      from each column into a separate text file exactly as they appear.

      This is useful for understanding the raw categories or range of values
-     within a dataset before cleaning.
+     within a dataset before and after cleaning.

      Args:
-         csv_path (Union[str, Path]):
+         csv_path (str | Path):
              The file path to the input CSV file.
-         output_dir (Union[str, Path]):
+         output_dir (str | Path):
              The path to the directory where the .txt files will be saved.
              The directory will be created if it does not exist.
+         keep_column_order (bool):
+             If True, prepends a numeric prefix (e.g., '1_', '2_') to each
+             output filename to maintain the original column order.
      """
      # --- 1. Input Validation ---
      csv_path = make_fullpath(input_path=csv_path, enforce="file")
@@ -72,7 +81,12 @@ def save_unique_values(csv_path: Union[str, Path], output_dir: Union[str, Path],
          sanitized_name = sanitize_filename(column_name)
          if not sanitized_name.strip('_'):
              sanitized_name = f'column_{i}'
-         file_path = output_dir / f"{sanitized_name}_unique_values.txt"
+
+         # --- create filename prefix ---
+         # If keep_column_order is True, create a prefix like "1_", "2_", etc.
+         prefix = f"{i + 1}_" if keep_column_order else ''
+
+         file_path = output_dir / f"{prefix}{sanitized_name}_unique_values.txt"

          # --- Write to file ---
          try:
@@ -93,39 +107,8 @@ def save_unique_values(csv_path: Union[str, Path], output_dir: Union[str, Path],
      _LOGGER.info(f"{counter} files of unique values created.")


- ########## Basic df cleaner #############
- def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path,None]=None):
-     """
-     Performs a comprehensive, standardized cleaning on all columns of a CSV file.
-
-     The cleaning process includes:
-     - Normalizing full-width and typographical punctuation to standard equivalents.
-     - Consolidating all internal whitespace (spaces, tabs, newlines) into a single space.
-     - Stripping any leading or trailing whitespace.
-     - Converting common textual representations of null (e.g., "N/A", "NULL") to true null values.
-     - Converting strings that become empty after cleaning into true null values.
-     - Normalizing all text to lowercase.
-
-     Args:
-         input_filepath (Union[str, Path]):
-             The path to the source CSV file to be cleaned.
-         output_filepath (Union[str, Path, None], optional):
-             The path to save the cleaned CSV file. If None (default),
-             the original input file will be overwritten.
-     """
-     # Handle paths
-     input_path = make_fullpath(input_filepath, enforce="file")
-
-     # Unless explicitly defined, overwrite file.
-     if output_filepath is not None:
-         parent_dir = make_fullpath(Path(output_filepath).parent, make=True, enforce="directory")
-         output_path = parent_dir / Path(output_filepath).name
-     else:
-         output_path = input_path
-
-     # load polars df
-     df, _ = load_dataframe(df_path=input_path, kind="polars", all_strings=True)
-
+ ########## Basic df cleaners #############
+ def _cleaner_core(df_in: pl.DataFrame, all_lowercase: bool) -> pl.DataFrame:
      # Cleaning rules
      cleaning_rules = {
          # 1. Comprehensive Punctuation & Symbol Normalization
@@ -141,6 +124,7 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
          '⁶': '6', '⁷': '7', '⁸': '8', '⁹': '9', '⁰': '0',
          '₁': '1', '₂': '2', '₃': '3', '₄': '4', '₅': '5',
          '₆': '6', '₇': '7', '₈': '8', '₉': '9', '₀': '0',
+         '⁺': '', '⁻': '', '₊': '', '₋': '',
          # Uppercase Alphabet
          'A': 'A', 'B': 'B', 'C': 'C', 'D': 'D', 'E': 'E', 'F': 'F',
          'G': 'G', 'H': 'H', 'I': 'I', 'J': 'J', 'K': 'K', 'L': 'L',
@@ -154,26 +138,44 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
          's': 's', 't': 't', 'u': 'u', 'v': 'v', 'w': 'w', 'x': 'x',
          'y': 'y', 'z': 'z',
          # Punctuation
-         '》': '>', '《': '<', ':': ':', ',': ',', '。': '.', ';': ';', '【': '[', '】': ']',
+         '》': '>', '《': '<', ':': ':', '。': '.', ';': ';', '【': '[', '】': ']', '∼': '~',
          '(': '(', ')': ')', '?': '?', '!': '!', '~': '~', '@': '@', '#': '#', '+': '+', '-': '-',
-         '$': '$', '%': '%', '^': '^', '&': '&', '*': '*', '\': '\\', '|': '|', '':',', '':'=',
+         '$': '$', '%': '%', '^': '^', '&': '&', '*': '*', '\': '-', '|': '|', '':'=', '·': '', '': '',
+         '¯': '-',
+
+         # Commas (avoid commas in entries)
+         ',': ';',
+         ',': ';',
+         '、': ';',

          # Others
+         'σ': '',
+         '□': '',
          '©': '',
          '®': '',
          '™': '',
+         r'[°˚]': '',
+
+         # Replace special characters in entries
+         r'\\': '_',
+
+         # Typographical standardization
+         # Unify various dashes and hyphens to a standard hyphen
+         r'[—–―]': '-',
+         r'−': '-',
+         # remove various quote types
+         r'[“”"]': '',
+         r"[‘’′']": '',

          # Collapse repeating punctuation
          r'\.{2,}': '.', # Replace two or more dots with a single dot
          r'\?{2,}': '?', # Replace two or more question marks with a single question mark
          r'!{2,}': '!', # Replace two or more exclamation marks with a single one
-
-         # Typographical standardization
-         # Unify various dashes and hyphens to a standard hyphen-minus
-         r'[—–―]': '-',
-         # Unify various quote types to standard quotes
-         r'[“”]': "'",
-         r'[‘’′]': "'",
+         r';{2,}': ';',
+         r'-{2,}': '-',
+         r'/{2,}': '/',
+         r'%{2,}': '%',
+         r'&{2,}': '&',

          # 2. Internal Whitespace Consolidation
          # Collapse any sequence of whitespace chars (including non-breaking spaces) to a single space
@@ -184,36 +186,150 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
          r'^\s+|\s+$': '',

          # 4. Textual Null Standardization (New Step)
-         # Convert common null-like text to actual nulls. (?i) makes it case-insensitive.
-         r'^(N/A|无|NA|NULL|NONE|NIL|)$': None,
+         # Convert common null-like text to actual nulls.
+         r'^(N/A|无|NA|NULL|NONE|NIL|-|\.|;|/|%|&)$': None,

          # 5. Final Nullification of Empty Strings
          # After all cleaning, if a string is now empty, convert it to a null
-         r'^$': None
+         r'^\s*$': None,
+         r'^$': None,
      }

      # Clean data
      try:
          # Create a cleaner for every column in the dataframe
-         all_columns = df.columns
+         all_columns = df_in.columns
          column_cleaners = [
              ColumnCleaner(col, rules=cleaning_rules, case_insensitive=True) for col in all_columns
          ]

          # Instantiate and run the main dataframe cleaner
          df_cleaner = DataFrameCleaner(cleaners=column_cleaners)
-         df_cleaned = df_cleaner.clean(df, clone_df=False) # Use clone_df=False for efficiency
+         df_cleaned = df_cleaner.clean(df_in, clone_df=False) # Use clone_df=False for efficiency

          # apply lowercase to all string columns
-         df_final = df_cleaned.with_columns(
-             pl.col(pl.String).str.to_lowercase()
-         )
+         if all_lowercase:
+             df_final = df_cleaned.with_columns(
+                 pl.col(pl.String).str.to_lowercase()
+             )
+         else:
+             df_final = df_cleaned
+
      except Exception as e:
-         _LOGGER.error(f"An error occurred during the cleaning process for '{input_path.name}'.")
+         _LOGGER.error(f"An error occurred during the cleaning process.")
          raise e
+     else:
+         return df_final
+
+
+ def _path_manager(path_in: Union[str,Path], path_out: Union[str,Path]):
+     # Handle paths
+     input_path = make_fullpath(path_in, enforce="file")
+
+     parent_dir = make_fullpath(Path(path_out).parent, make=True, enforce="directory")
+     output_path = parent_dir / Path(path_out).name
+
+     return input_path, output_path
+
+
+ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path], all_lowercase: bool=True):
+     """
+     Performs a comprehensive, standardized cleaning on all columns of a CSV file.
+
+     The cleaning process includes:
+     - Normalizing full-width and typographical punctuation to standard equivalents.
+     - Consolidating all internal whitespace (spaces, tabs, newlines) into a single space.
+     - Stripping any leading or trailing whitespace.
+     - Converting common textual representations of null (e.g., "N/A", "NULL") to true null values.
+     - Converting strings that become empty after cleaning into true null values.
+     - Normalizing all text to lowercase (Optional).
+
+     Args:
+         input_filepath (str | Path):
+             The path to the source CSV file to be cleaned.
+         output_filepath (str | Path):
+             The path to save the cleaned CSV file.
+         all_lowercase (bool):
+             Whether to normalize all text to lowercase.
+
+     """
+     # Handle paths
+     input_path, output_path = _path_manager(path_in=input_filepath, path_out=output_filepath)
+
+     # load polars df
+     df, _ = load_dataframe(df_path=input_path, kind="polars", all_strings=True)
+
+     # CLEAN
+     df_final = _cleaner_core(df_in=df, all_lowercase=all_lowercase)
+
+     # Save cleaned dataframe
+     save_dataframe_filename(df=df_final, save_dir=output_path.parent, filename=output_path.name)
+
+     _LOGGER.info(f"Data successfully cleaned.")
+
+
+ def basic_clean_drop(input_filepath: Union[str,Path], output_filepath: Union[str,Path], log_directory: Union[str,Path], targets: list[str],
+                      skip_targets: bool=False, threshold: float=0.8, all_lowercase: bool=True):
+     """
+     Performs standardized cleaning followed by iterative removal of rows and
+     columns with excessive missing data.
+
+     This function combines the functionality of `basic_clean` and `drop_macro`. It first
+     applies a comprehensive normalization process to all columns in the input CSV file,
+     ensuring consistent formatting and proper null value handling. The cleaned data is then
+     converted to a pandas DataFrame, where iterative row and column dropping is applied
+     to remove redundant or incomplete data.
+
+     The iterative dropping cycle continues until no further rows or columns meet the
+     removal criteria, ensuring that dependencies between row and column deletions are
+     fully resolved. Logs documenting the missing data profile before and after the
+     dropping process are saved to the specified log directory.
+
+     Args:
+         input_filepath (str | Path):
+             The path to the source CSV file to be cleaned.
+         output_filepath (str | Path):
+             The path to save the fully cleaned CSV file after cleaning
+             and missing-data-based pruning.
+         log_directory (str | Path):
+             Path to the directory where missing data reports will be stored.
+         targets (list[str]):
+             A list of column names to be treated as target variables.
+             This list guides the row-dropping logic.
+         skip_targets (bool):
+             If True, the columns listed in `targets` will be exempt from being dropped,
+             even if they exceed the missing data threshold.
+         threshold (float):
+             The proportion of missing data required to drop a row or column.
+             For example, 0.8 means a row/column will be dropped if 80% or more
+             of its data is missing.
+         all_lowercase (bool):
+             Whether to normalize all text to lowercase.
+     """
+     # handle log path
+     log_path = make_fullpath(log_directory, make=True, enforce="directory")
+
+     # Handle df paths
+     input_path, output_path = _path_manager(path_in=input_filepath, path_out=output_filepath)
+
+     # load polars df
+     df, _ = load_dataframe(df_path=input_path, kind="polars", all_strings=True)
+
+     # CLEAN
+     df_cleaned = _cleaner_core(df_in=df, all_lowercase=all_lowercase)
+
+     # switch to pandas
+     df_cleaned_pandas = df_cleaned.to_pandas()
+
+     # Drop macro
+     df_final = drop_macro(df=df_cleaned_pandas,
+                           log_directory=log_path,
+                           targets=targets,
+                           skip_targets=skip_targets,
+                           threshold=threshold)

      # Save cleaned dataframe
-     save_dataframe(df=df_final, save_dir=output_path.parent, filename=output_path.name)
+     save_dataframe_filename(df=df_final, save_dir=output_path.parent, filename=output_path.name)

      _LOGGER.info(f"Data successfully cleaned.")

@@ -378,7 +494,7 @@ class DataFrameCleaner:
          if isinstance(output_filepath, str):
              output_filepath = make_fullpath(input_path=output_filepath, enforce="file")

-         save_dataframe(df=df_clean, save_dir=output_filepath.parent, filename=output_filepath.name)
+         save_dataframe_filename(df=df_clean, save_dir=output_filepath.parent, filename=output_filepath.name)

          return None
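
Taken together, the `ETL_cleaning` changes replace the overwrite-in-place `basic_clean` with an explicit output path and add `basic_clean_drop` for missing-data pruning. A minimal usage sketch based only on the signatures visible in this diff; the file paths and the `"target"` column name are illustrative, not from the package:

```python
from ml_tools.ETL_cleaning import save_unique_values, basic_clean, basic_clean_drop

# Inspect raw categories first. With keep_column_order=True (the default),
# output files get a numeric prefix: 1_<col>_unique_values.txt, 2_..., etc.
save_unique_values(csv_path="data/raw.csv", output_dir="reports/unique_values")

# Normalize punctuation, whitespace, and textual nulls. Note that in 14.x
# output_filepath is required; the 10.x default of overwriting the input is gone.
basic_clean(
    input_filepath="data/raw.csv",
    output_filepath="data/clean.csv",
    all_lowercase=True,
)

# Clean, then iteratively drop rows/columns that are >= 80% missing,
# writing before/after missing-data reports to the log directory.
basic_clean_drop(
    input_filepath="data/raw.csv",
    output_filepath="data/clean_pruned.csv",
    log_directory="reports/cleaning_logs",
    targets=["target"],   # illustrative target column name
    skip_targets=False,
    threshold=0.8,
    all_lowercase=True,
)
```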