dragon-ml-toolbox 10.2.0__py3-none-any.whl → 14.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dragon-ml-toolbox might be problematic.

Files changed (48)
  1. {dragon_ml_toolbox-10.2.0.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/METADATA +38 -63
  2. dragon_ml_toolbox-14.2.0.dist-info/RECORD +48 -0
  3. {dragon_ml_toolbox-10.2.0.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/licenses/LICENSE +1 -1
  4. {dragon_ml_toolbox-10.2.0.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +11 -0
  5. ml_tools/ETL_cleaning.py +72 -34
  6. ml_tools/ETL_engineering.py +506 -70
  7. ml_tools/GUI_tools.py +2 -1
  8. ml_tools/MICE_imputation.py +212 -7
  9. ml_tools/ML_callbacks.py +73 -40
  10. ml_tools/ML_datasetmaster.py +267 -284
  11. ml_tools/ML_evaluation.py +119 -58
  12. ml_tools/ML_evaluation_multi.py +107 -32
  13. ml_tools/ML_inference.py +15 -5
  14. ml_tools/ML_models.py +234 -170
  15. ml_tools/ML_models_advanced.py +323 -0
  16. ml_tools/ML_optimization.py +321 -97
  17. ml_tools/ML_scaler.py +10 -5
  18. ml_tools/ML_trainer.py +585 -40
  19. ml_tools/ML_utilities.py +528 -0
  20. ml_tools/ML_vision_datasetmaster.py +1315 -0
  21. ml_tools/ML_vision_evaluation.py +260 -0
  22. ml_tools/ML_vision_inference.py +428 -0
  23. ml_tools/ML_vision_models.py +627 -0
  24. ml_tools/ML_vision_transformers.py +58 -0
  25. ml_tools/PSO_optimization.py +10 -7
  26. ml_tools/RNN_forecast.py +2 -0
  27. ml_tools/SQL.py +22 -9
  28. ml_tools/VIF_factor.py +4 -3
  29. ml_tools/_ML_vision_recipe.py +88 -0
  30. ml_tools/__init__.py +1 -0
  31. ml_tools/_logger.py +0 -2
  32. ml_tools/_schema.py +96 -0
  33. ml_tools/constants.py +79 -0
  34. ml_tools/custom_logger.py +164 -16
  35. ml_tools/data_exploration.py +1092 -109
  36. ml_tools/ensemble_evaluation.py +48 -1
  37. ml_tools/ensemble_inference.py +6 -7
  38. ml_tools/ensemble_learning.py +4 -3
  39. ml_tools/handle_excel.py +1 -0
  40. ml_tools/keys.py +80 -0
  41. ml_tools/math_utilities.py +259 -0
  42. ml_tools/optimization_tools.py +198 -24
  43. ml_tools/path_manager.py +144 -45
  44. ml_tools/serde.py +192 -0
  45. ml_tools/utilities.py +287 -227
  46. dragon_ml_toolbox-10.2.0.dist-info/RECORD +0 -36
  47. {dragon_ml_toolbox-10.2.0.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/WHEEL +0 -0
  48. {dragon_ml_toolbox-10.2.0.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/top_level.txt +0 -0
@@ -1,23 +1,17 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dragon-ml-toolbox
3
- Version: 10.2.0
3
+ Version: 14.2.0
4
4
  Summary: A collection of tools for data science and machine learning projects.
5
- Author-email: Karl Loza <luigiloza@gmail.com>
5
+ Author-email: "Karl L. Loza Vidaurre" <luigiloza@gmail.com>
6
6
  License-Expression: MIT
7
7
  Project-URL: Homepage, https://github.com/DrAg0n-BoRn/ML_tools
8
8
  Project-URL: Changelog, https://github.com/DrAg0n-BoRn/ML_tools/blob/master/CHANGELOG.md
9
9
  Classifier: Programming Language :: Python :: 3
10
10
  Classifier: Operating System :: OS Independent
11
- Requires-Python: >=3.10
11
+ Requires-Python: >=3.12
12
12
  Description-Content-Type: text/markdown
13
13
  License-File: LICENSE
14
14
  License-File: LICENSE-THIRD-PARTY.md
15
- Provides-Extra: base
16
- Requires-Dist: pandas; extra == "base"
17
- Requires-Dist: numpy; extra == "base"
18
- Requires-Dist: polars; extra == "base"
19
- Requires-Dist: joblib; extra == "base"
20
- Requires-Dist: colorlog; extra == "base"
21
15
  Provides-Extra: ml
22
16
  Requires-Dist: numpy>=2.0; extra == "ml"
23
17
  Requires-Dist: pandas; extra == "ml"
@@ -38,7 +32,12 @@ Requires-Dist: shap; extra == "ml"
38
32
  Requires-Dist: tqdm; extra == "ml"
39
33
  Requires-Dist: Pillow; extra == "ml"
40
34
  Requires-Dist: evotorch; extra == "ml"
35
+ Requires-Dist: pyarrow; extra == "ml"
41
36
  Requires-Dist: colorlog; extra == "ml"
37
+ Requires-Dist: torchmetrics; extra == "ml"
38
+ Provides-Extra: py-tab
39
+ Requires-Dist: pytorch_tabular; extra == "py-tab"
40
+ Requires-Dist: omegaconf; extra == "py-tab"
42
41
  Provides-Extra: mice
43
42
  Requires-Dist: numpy<2.0; extra == "mice"
44
43
  Requires-Dist: pandas; extra == "mice"
@@ -51,9 +50,7 @@ Requires-Dist: statsmodels; extra == "mice"
51
50
  Requires-Dist: lightgbm<=4.5.0; extra == "mice"
52
51
  Requires-Dist: shap; extra == "mice"
53
52
  Requires-Dist: colorlog; extra == "mice"
54
- Provides-Extra: pytorch
55
- Requires-Dist: torch; extra == "pytorch"
56
- Requires-Dist: torchvision; extra == "pytorch"
53
+ Requires-Dist: pyarrow; extra == "mice"
57
54
  Provides-Extra: excel
58
55
  Requires-Dist: pandas; extra == "excel"
59
56
  Requires-Dist: openpyxl; extra == "excel"
@@ -72,9 +69,6 @@ Requires-Dist: lightgbm; extra == "gui-boost"
72
69
  Provides-Extra: gui-torch
73
70
  Requires-Dist: numpy; extra == "gui-torch"
74
71
  Requires-Dist: FreeSimpleGUI>=5.2; extra == "gui-torch"
75
- Provides-Extra: plot
76
- Requires-Dist: matplotlib; extra == "plot"
77
- Requires-Dist: seaborn; extra == "plot"
78
72
  Provides-Extra: pyinstaller
79
73
  Requires-Dist: pyinstaller; extra == "pyinstaller"
80
74
  Provides-Extra: nuitka
@@ -94,7 +88,7 @@ A collection of Python utilities for data science and machine learning, structur
94
88
 
95
89
  ## Installation
96
90
 
97
- **Python 3.10+**
91
+ **Python 3.12**
98
92
 
99
93
  ### Via PyPI
100
94
 
@@ -104,22 +98,22 @@ Install the latest stable release from PyPI:
104
98
  pip install dragon-ml-toolbox
105
99
  ```
106
100
 
107
- ### Via GitHub (Editable)
101
+ ### Via conda-forge
108
102
 
109
- Clone the repository and install in editable mode with optional dependencies:
103
+ Install from the conda-forge channel:
110
104
 
111
105
  ```bash
112
- git clone https://github.com/DrAg0n-BoRn/ML_tools.git
113
- cd ML_tools
114
- pip install -e .
106
+ conda install -c conda-forge dragon-ml-toolbox
115
107
  ```
116
108
 
117
- ### Via conda-forge
109
+ ### Via GitHub (Editable)
118
110
 
119
- Install from the conda-forge channel:
111
+ Clone the repository and install in editable mode:
120
112
 
121
113
  ```bash
122
- conda install -c conda-forge dragon-ml-toolbox
114
+ git clone https://github.com/DrAg0n-BoRn/ML_tools.git
115
+ cd ML_tools
116
+ pip install -e .
123
117
  ```
124
118
 
125
119
  ## Modular Installation
@@ -132,17 +126,12 @@ Installs a comprehensive set of tools for typical data science workflows, includ
132
126
  pip install "dragon-ml-toolbox[ML]"
133
127
  ```
134
128
 
135
- To install the standard CPU-only versions of Torch and Torchvision:
136
-
137
- ```Bash
138
- pip install "dragon-ml-toolbox[pytorch]"
139
- ```
140
-
141
- ⚠️ To make use of GPU acceleration (highly recommended), follow the official instructions: [PyTorch website](https://pytorch.org/get-started/locally/)
129
+ ⚠️ PyTorch required, follow the official instructions: [PyTorch website](https://pytorch.org/get-started/locally/)
142
130
 
143
131
  #### Modules:
144
132
 
145
133
  ```bash
134
+ constants
146
135
  custom_logger
147
136
  data_exploration
148
137
  ensemble_evaluation
@@ -150,19 +139,28 @@ ensemble_inference
150
139
  ensemble_learning
151
140
  ETL_cleaning
152
141
  ETL_engineering
142
+ math_utilities
153
143
  ML_callbacks
154
144
  ML_datasetmaster
155
145
  ML_evaluation_multi
156
146
  ML_evaluation
157
147
  ML_inference
158
148
  ML_models
149
+ ML_models_advanced # Requires the extra flag [py-tab]
159
150
  ML_optimization
160
151
  ML_scaler
161
152
  ML_trainer
153
+ ML_utilities
154
+ ML_vision_datasetmaster
155
+ ML_vision_evaluation
156
+ ML_vision_inference
157
+ ML_vision_models
158
+ ML_vision_transformers
162
159
  optimization_tools
163
160
  path_manager
164
161
  PSO_optimization
165
162
  RNN_forecast
163
+ serde
166
164
  SQL
167
165
  utilities
168
166
  ```
@@ -180,8 +178,11 @@ pip install "dragon-ml-toolbox[mice]"
180
178
  #### Modules:
181
179
 
182
180
  ```Bash
181
+ constants
183
182
  custom_logger
183
+ math_utilities
184
184
  MICE_imputation
185
+ serde
185
186
  VIF_factor
186
187
  path_manager
187
188
  utilities
@@ -209,42 +210,37 @@ path_manager
209
210
 
210
211
  ### 🎰 GUI for Boosting Algorithms (XGBoost, LightGBM) [gui-boost]
211
212
 
212
- For GUIs that include plotting functionality, you must also install the [plot] extra.
213
+ GUI tools compatible with XGBoost and LightGBM models used for inference.
213
214
 
214
215
  ```Bash
215
216
  pip install "dragon-ml-toolbox[gui-boost]"
216
217
  ```
217
218
 
218
- ```Bash
219
- pip install "dragon-ml-toolbox[gui-boost,plot]"
220
- ```
221
-
222
219
  #### Modules:
223
220
 
224
221
  ```Bash
222
+ constants
225
223
  custom_logger
226
224
  GUI_tools
227
225
  ensemble_inference
228
226
  path_manager
227
+ serde
229
228
  ```
230
229
 
231
230
  ---
232
231
 
233
232
  ### 🤖 GUI for PyTorch Models [gui-torch]
234
233
 
235
- For GUIs that include plotting functionality, you must also install the [plot] extra.
234
+ GUI tools compatible with PyTorch models used for inference.
236
235
 
237
236
  ```Bash
238
237
  pip install "dragon-ml-toolbox[gui-torch]"
239
238
  ```
240
239
 
241
- ```Bash
242
- pip install "dragon-ml-toolbox[gui-torch,plot]"
243
- ```
244
-
245
240
  #### Modules:
246
241
 
247
242
  ```Bash
243
+ constants
248
244
  custom_logger
249
245
  GUI_tools
250
246
  ML_models
@@ -255,27 +251,6 @@ path_manager
255
251
 
256
252
  ---
257
253
 
258
- ### 🎫 Base Tools [base]
259
-
260
- General purpose functions and classes.
261
-
262
- ```Bash
263
- pip install "dragon-ml-toolbox[base]"
264
- ```
265
-
266
- #### Modules:
267
-
268
- ```Bash
269
- ETL_cleaning
270
- ETL_engineering
271
- custom_logger
272
- SQL
273
- utilities
274
- path_manager
275
- ```
276
-
277
- ---
278
-
279
254
  ### ⚒️ APP bundlers
280
255
 
281
256
  Choose one if needed.
@@ -293,6 +268,6 @@ pip install "dragon-ml-toolbox[nuitka]"
293
268
  After installation, import modules like this:
294
269
 
295
270
  ```python
296
- from ml_tools.utilities import serialize_object, deserialize_object
271
+ from ml_tools.serde import serialize_object, deserialize_object
297
272
  from ml_tools import custom_logger
298
273
  ```
@@ -0,0 +1,48 @@
1
+ dragon_ml_toolbox-14.2.0.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
2
+ dragon_ml_toolbox-14.2.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=gkOdNDbKYpIJezwSo2CEnISkLeYfYHv9t8b5K2-P69A,2687
3
+ ml_tools/ETL_cleaning.py,sha256=2VBRllV8F-ZiPylPp8Az2gwn5ztgazN0BH5OKnRUhV0,20402
4
+ ml_tools/ETL_engineering.py,sha256=KfYqgsxupAx6e_TxwO1LZXeu5mFkIhVXJrNjP3CzIZc,54927
5
+ ml_tools/GUI_tools.py,sha256=Va6ig-dHULPVRwQYYtH3fvY5XPIoqRcJpRW8oXC55Hw,45413
6
+ ml_tools/MICE_imputation.py,sha256=KLJXGQLKJ6AuWWttAG-LCCaxpS-ygM4dXPiguHDaL6Y,20815
7
+ ml_tools/ML_callbacks.py,sha256=elD2Yr030sv_6gX_m9GVd6HTyrbmt34nFS8lrgS4HtM,15808
8
+ ml_tools/ML_datasetmaster.py,sha256=rsJgZEGBJmfeKF6cR8CQZzfEx4T7Y-p1wUnR15_nNw0,28400
9
+ ml_tools/ML_evaluation.py,sha256=4GU86rUWMIGbkXrvN6PyjfGwKtWvXKE7pMlWpWeBq14,18988
10
+ ml_tools/ML_evaluation_multi.py,sha256=rJKdgtq-9I7oaI7PRzq7aIZ84XdNV0xzlVePZW4nj0k,16095
11
+ ml_tools/ML_inference.py,sha256=YJ953bhNWsdlPRtJQh3h2ACfMIgp8dQ9KtL9Azar-5s,23489
12
+ ml_tools/ML_models.py,sha256=PqOcNlws7vCJMbiVCKqlPuktxvskZVUHG3VfU-Yshf8,31415
13
+ ml_tools/ML_models_advanced.py,sha256=vk3PZBSu3DVso2S1rKTxxdS43XG8Q5FnasIL3-rMajc,12410
14
+ ml_tools/ML_optimization.py,sha256=P0zkhKAwTpkorIBtR0AOIDcyexo5ngmvFUzo3DfNO-E,22692
15
+ ml_tools/ML_scaler.py,sha256=tw6onj9o8_kk3FQYb930HUzvv1zsFZe2YZJdF3LtHkU,7538
16
+ ml_tools/ML_trainer.py,sha256=ZWI4MbUcLeBxyfoUTL96l5tjHHMp9I64h4SdXnjYmBE,49795
17
+ ml_tools/ML_utilities.py,sha256=z6LbpbZwhn8F__fWlKi-g-cAJQXSxwg1NHfC5FBoAyc,21139
18
+ ml_tools/ML_vision_datasetmaster.py,sha256=tOrdatuq_AP8-GDiTrtARvSJdpc8h7dT-OhDJtRQnsk,54433
19
+ ml_tools/ML_vision_evaluation.py,sha256=t12R7i1RkOCt9zu1_lxSBr8OH6A6Get0k8ftDLctn6I,10486
20
+ ml_tools/ML_vision_inference.py,sha256=He3KV3VJAm8PwO-fOq4b9VO8UXFr-GmpuCnoHXf4VZI,20588
21
+ ml_tools/ML_vision_models.py,sha256=G3S4jB9AE9wMpU9ZygOgOx9q1K6t6LAXBYcJ-U2XQ1M,25600
22
+ ml_tools/ML_vision_transformers.py,sha256=95e0aBkHY5VDGE8i5xy57COU7NvSNIgFknnhBubwE40,1832
23
+ ml_tools/PSO_optimization.py,sha256=T-HWHMRJUnPvPwixdU5jif3_rnnI36TzcL8u3oSCwuA,22960
24
+ ml_tools/RNN_forecast.py,sha256=Qa2KoZfdAvSjZ4yE78N4BFXtr3tTr0Gx7tQJZPotsh0,1967
25
+ ml_tools/SQL.py,sha256=vXLPGfVVg8bfkbBE3HVfyEclVbdJy0TBhuQONtMwSCQ,11234
26
+ ml_tools/VIF_factor.py,sha256=at5IVqPvicja2-DNSTSIIy3SkzDWCmLzo3qTG_qr5n8,10422
27
+ ml_tools/_ML_vision_recipe.py,sha256=zrgxFUvTJqQVuwR7jWlbIC2FD29u6eNFPkTRoJ7yEZI,3178
28
+ ml_tools/__init__.py,sha256=kJiankjz9_qXu7gU92mYqYg_anLvt-B6RtW0mMH8uGo,76
29
+ ml_tools/_logger.py,sha256=dlp5cGbzooK9YSNSZYB4yjZrOaQUGW8PTrM411AOvL8,4717
30
+ ml_tools/_schema.py,sha256=yu6aWmn_2Z4_AxAtJGDDCIa96y6JcUp-vgnCS013Qmw,3908
31
+ ml_tools/_script_info.py,sha256=21r83LV3RubsNZ_RTEUON6RbDf7Mh4_udweNcvdF_Fk,212
32
+ ml_tools/constants.py,sha256=3br5Rk9cL2IUo638eJuMOGdbGQaWssaUecYEvSeRBLM,3322
33
+ ml_tools/custom_logger.py,sha256=TGc0Ww2Xlqj2XE3q4bP43hV7T3qnb5ci9f0pYHXF5TY,11226
34
+ ml_tools/data_exploration.py,sha256=bwHzFJ-IAo5GN3T53F-1J_pXUg8VHS91sG_90utAsfg,69911
35
+ ml_tools/ensemble_evaluation.py,sha256=FGHSe8LBI8_w8LjNeJWOcYQ1UK_mc6fVah8gmSvNVGg,26853
36
+ ml_tools/ensemble_inference.py,sha256=0yLmLNj45RVVoSCLH1ZYJG9IoAhTkWUqEZmLOQTFGTY,9348
37
+ ml_tools/ensemble_learning.py,sha256=vsIED7nlheYI4w2SBzP6SC1AnNeMfn-2A1Gqw5EfxsM,21964
38
+ ml_tools/handle_excel.py,sha256=pfdAPb9ywegFkM9T54bRssDOsX-K7rSeV0RaMz7lEAo,14006
39
+ ml_tools/keys.py,sha256=wZOBuEnnHc54vlOZiimnrxfk-sZh6f6suPppJW8rbPQ,3326
40
+ ml_tools/math_utilities.py,sha256=xeKq1quR_3DYLgowcp4Uam_4s3JltUyOnqMOGuAiYWU,8802
41
+ ml_tools/optimization_tools.py,sha256=TYFQ2nSnp7xxs-VyoZISWgnGJghFbsWasHjruegyJRs,12763
42
+ ml_tools/path_manager.py,sha256=CyDU16pOKmC82jPubqJPT6EBt-u-3rGVbxyPIZCvDDY,18432
43
+ ml_tools/serde.py,sha256=c8uDYjYry_VrLvoG4ixqDj5pij88lVn6Tu4NHcPkwDU,6943
44
+ ml_tools/utilities.py,sha256=aWqvYzmxlD74PD5Yqu1VuTekDJeYLQrmPIU_VeVyRp0,22526
45
+ dragon_ml_toolbox-14.2.0.dist-info/METADATA,sha256=T0eIxD-eO3cbAIzJ1HskJbog6RUYgXwXQQ2OU8Z-GQM,6475
46
+ dragon_ml_toolbox-14.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
47
+ dragon_ml_toolbox-14.2.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
48
+ dragon_ml_toolbox-14.2.0.dist-info/RECORD,,
@@ -1,6 +1,6 @@
1
1
  MIT License
2
2
 
3
- Copyright (c) 2025 Karl Loza
3
+ Copyright (c) 2025 Karl Luigi Loza Vidaurre
4
4
 
5
5
  Permission is hereby granted, free of charge, to any person obtaining a copy
6
6
  of this software and associated documentation files (the "Software"), to deal
@@ -26,3 +26,14 @@ This project depends on the following third-party packages. Each is governed by
26
26
  - [polars](https://github.com/pola-rs/polars/blob/main/LICENSE)
27
27
  - [plotnine](https://github.com/has2k1/plotnine/blob/main/LICENSE)
28
28
  - [tqdm](https://github.com/tqdm/tqdm/blob/master/LICENSE)
29
+ - [pyarrow](https://github.com/apache/arrow/blob/main/LICENSE.txt)
30
+ - [colorlog](https://github.com/borntyping/python-colorlog/blob/main/LICENSE)
31
+ - [evotorch](https://github.com/nnaisense/evotorch/blob/master/LICENSE)
32
+ - [FreeSimpleGUI](https://github.com/spyoungtech/FreeSimpleGUI/blob/main/license.txt)
33
+ - [nuitka](https://github.com/Nuitka/Nuitka/blob/main/LICENSE.txt)
34
+ - [omegaconf](https://github.com/omry/omegaconf/blob/master/LICENSE)
35
+ - [ordered-set](https://github.com/rspeer/ordered-set/blob/master/MIT-LICENSE)
36
+ - [pyinstaller](https://github.com/pyinstaller/pyinstaller/blob/develop/COPYING.txt)
37
+ - [pytorch_tabular](https://github.com/manujosephv/pytorch_tabular/blob/main/LICENSE)
38
+ - [torchmetrics](https://github.com/Lightning-AI/torchmetrics/blob/master/LICENSE)
39
+ - [zstandard](https://github.com/indygreg/python-zstandard/blob/main/LICENSE)
ml_tools/ETL_cleaning.py CHANGED
@@ -2,9 +2,10 @@ import polars as pl
2
2
  import pandas as pd
3
3
  from pathlib import Path
4
4
  from typing import Union, List, Dict
5
+
5
6
  from .path_manager import sanitize_filename, make_fullpath
6
7
  from .data_exploration import drop_macro
7
- from .utilities import save_dataframe, load_dataframe
8
+ from .utilities import save_dataframe_filename, load_dataframe
8
9
  from ._script_info import _script_info
9
10
  from ._logger import _LOGGER
10
11
 
@@ -19,20 +20,26 @@ __all__ = [
19
20
 
20
21
 
21
22
  ################ Unique Values per column #################
22
- def save_unique_values(csv_path: Union[str, Path], output_dir: Union[str, Path], verbose: bool=False) -> None:
23
+ def save_unique_values(csv_path: Union[str, Path],
24
+ output_dir: Union[str, Path],
25
+ verbose: bool=False,
26
+ keep_column_order: bool = True) -> None:
23
27
  """
24
28
  Loads a CSV file, then analyzes it and saves the unique non-null values
25
29
  from each column into a separate text file exactly as they appear.
26
30
 
27
31
  This is useful for understanding the raw categories or range of values
28
- within a dataset before cleaning.
32
+ within a dataset before and after cleaning.
29
33
 
30
34
  Args:
31
- csv_path (Union[str, Path]):
35
+ csv_path (str | Path):
32
36
  The file path to the input CSV file.
33
- output_dir (Union[str, Path]):
37
+ output_dir (str | Path):
34
38
  The path to the directory where the .txt files will be saved.
35
39
  The directory will be created if it does not exist.
40
+ keep_column_order (bool):
41
+ If True, prepends a numeric prefix (e.g., '1_', '2_') to each
42
+ output filename to maintain the original column order.
36
43
  """
37
44
  # --- 1. Input Validation ---
38
45
  csv_path = make_fullpath(input_path=csv_path, enforce="file")
@@ -74,7 +81,12 @@ def save_unique_values(csv_path: Union[str, Path], output_dir: Union[str, Path],
74
81
  sanitized_name = sanitize_filename(column_name)
75
82
  if not sanitized_name.strip('_'):
76
83
  sanitized_name = f'column_{i}'
77
- file_path = output_dir / f"{sanitized_name}_unique_values.txt"
84
+
85
+ # --- create filename prefix ---
86
+ # If keep_column_order is True, create a prefix like "1_", "2_", etc.
87
+ prefix = f"{i + 1}_" if keep_column_order else ''
88
+
89
+ file_path = output_dir / f"{prefix}{sanitized_name}_unique_values.txt"
78
90
 
79
91
  # --- Write to file ---
80
92
  try:
@@ -96,7 +108,7 @@ def save_unique_values(csv_path: Union[str, Path], output_dir: Union[str, Path],
96
108
 
97
109
 
98
110
  ########## Basic df cleaners #############
99
- def _cleaner_core(df_in: pl.DataFrame) -> pl.DataFrame:
111
+ def _cleaner_core(df_in: pl.DataFrame, all_lowercase: bool) -> pl.DataFrame:
100
112
  # Cleaning rules
101
113
  cleaning_rules = {
102
114
  # 1. Comprehensive Punctuation & Symbol Normalization
@@ -126,27 +138,44 @@ def _cleaner_core(df_in: pl.DataFrame) -> pl.DataFrame:
126
138
  's': 's', 't': 't', 'u': 'u', 'v': 'v', 'w': 'w', 'x': 'x',
127
139
  'y': 'y', 'z': 'z',
128
140
  # Punctuation
129
- '》': '>', '《': '<', ':': ':', ',': ',', '。': '.', ';': ';', '【': '[', '】': ']',
141
+ '》': '>', '《': '<', ':': ':', '。': '.', ';': ';', '【': '[', '】': ']', '∼': '~',
130
142
  '(': '(', ')': ')', '?': '?', '!': '!', '~': '~', '@': '@', '#': '#', '+': '+', '-': '-',
131
- '$': '$', '%': '%', '^': '^', '&': '&', '*': '*', '\': '\\', '|': '|', '':',', '':'=',
143
+ '$': '$', '%': '%', '^': '^', '&': '&', '*': '*', '\': '-', '|': '|', '':'=', '·': '', '': '',
144
+ '¯': '-',
145
+
146
+ # Commas (avoid commas in entries)
147
+ ',': ';',
148
+ ',': ';',
149
+ '、':';',
132
150
 
133
151
  # Others
152
+ 'σ': '',
153
+ '□': '',
134
154
  '©': '',
135
155
  '®': '',
136
156
  '™': '',
157
+ r'[°˚]': '',
137
158
 
138
- # Collapse repeating punctuation
139
- r'\.{2,}': '.', # Replace two or more dots with a single dot
140
- r'\?{2,}': '?', # Replace two or more question marks with a single question mark
141
- r'!{2,}': '!', # Replace two or more exclamation marks with a single one
159
+ # Replace special characters in entries
160
+ r'\\': '_',
142
161
 
143
162
  # Typographical standardization
144
- # Unify various dashes and hyphens to a standard hyphen-minus
163
+ # Unify various dashes and hyphens to a standard hyphen
145
164
  r'[—–―]': '-',
146
165
  r'−': '-',
147
- # Unify various quote types to standard quotes
148
- r'[“”]': "'",
149
- r'[‘’′]': "'",
166
+ # remove various quote types
167
+ r'[“”"]': '',
168
+ r"[‘’′']": '',
169
+
170
+ # Collapse repeating punctuation
171
+ r'\.{2,}': '.', # Replace two or more dots with a single dot
172
+ r'\?{2,}': '?', # Replace two or more question marks with a single question mark
173
+ r'!{2,}': '!', # Replace two or more exclamation marks with a single one
174
+ r';{2,}': ';',
175
+ r'-{2,}': '-',
176
+ r'/{2,}': '/',
177
+ r'%{2,}': '%',
178
+ r'&{2,}': '&',
150
179
 
151
180
  # 2. Internal Whitespace Consolidation
152
181
  # Collapse any sequence of whitespace chars (including non-breaking spaces) to a single space
@@ -158,7 +187,7 @@ def _cleaner_core(df_in: pl.DataFrame) -> pl.DataFrame:
158
187
 
159
188
  # 4. Textual Null Standardization (New Step)
160
189
  # Convert common null-like text to actual nulls.
161
- r'^(N/A|无|NA|NULL|NONE|NIL|)$': None,
190
+ r'^(N/A|无|NA|NULL|NONE|NIL|-|\.|;|/|%|&)$': None,
162
191
 
163
192
  # 5. Final Nullification of Empty Strings
164
193
  # After all cleaning, if a string is now empty, convert it to a null
@@ -179,9 +208,13 @@ def _cleaner_core(df_in: pl.DataFrame) -> pl.DataFrame:
179
208
  df_cleaned = df_cleaner.clean(df_in, clone_df=False) # Use clone_df=False for efficiency
180
209
 
181
210
  # apply lowercase to all string columns
182
- df_final = df_cleaned.with_columns(
183
- pl.col(pl.String).str.to_lowercase()
184
- )
211
+ if all_lowercase:
212
+ df_final = df_cleaned.with_columns(
213
+ pl.col(pl.String).str.to_lowercase()
214
+ )
215
+ else:
216
+ df_final = df_cleaned
217
+
185
218
  except Exception as e:
186
219
  _LOGGER.error(f"An error occurred during the cleaning process.")
187
220
  raise e
@@ -199,7 +232,7 @@ def _path_manager(path_in: Union[str,Path], path_out: Union[str,Path]):
199
232
  return input_path, output_path
200
233
 
201
234
 
202
- def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path]):
235
+ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path], all_lowercase: bool=True):
203
236
  """
204
237
  Performs a comprehensive, standardized cleaning on all columns of a CSV file.
205
238
 
@@ -209,13 +242,16 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
209
242
  - Stripping any leading or trailing whitespace.
210
243
  - Converting common textual representations of null (e.g., "N/A", "NULL") to true null values.
211
244
  - Converting strings that become empty after cleaning into true null values.
212
- - Normalizing all text to lowercase.
245
+ - Normalizing all text to lowercase (Optional).
213
246
 
214
247
  Args:
215
- input_filepath (Union[str, Path]):
248
+ input_filepath (str | Path):
216
249
  The path to the source CSV file to be cleaned.
217
- output_filepath (Union[str, Path, None], optional):
250
+ output_filepath (str | Path):
218
251
  The path to save the cleaned CSV file.
252
+ all_lowercase (bool):
253
+ Whether to normalize all text to lowercase.
254
+
219
255
  """
220
256
  # Handle paths
221
257
  input_path, output_path = _path_manager(path_in=input_filepath, path_out=output_filepath)
@@ -224,16 +260,16 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
224
260
  df, _ = load_dataframe(df_path=input_path, kind="polars", all_strings=True)
225
261
 
226
262
  # CLEAN
227
- df_final = _cleaner_core(df)
263
+ df_final = _cleaner_core(df_in=df, all_lowercase=all_lowercase)
228
264
 
229
265
  # Save cleaned dataframe
230
- save_dataframe(df=df_final, save_dir=output_path.parent, filename=output_path.name)
266
+ save_dataframe_filename(df=df_final, save_dir=output_path.parent, filename=output_path.name)
231
267
 
232
268
  _LOGGER.info(f"Data successfully cleaned.")
233
269
 
234
270
 
235
271
  def basic_clean_drop(input_filepath: Union[str,Path], output_filepath: Union[str,Path], log_directory: Union[str,Path], targets: list[str],
236
- skip_targets: bool=False, threshold: float=0.8):
272
+ skip_targets: bool=False, threshold: float=0.8, all_lowercase: bool=True):
237
273
  """
238
274
  Performs standardized cleaning followed by iterative removal of rows and
239
275
  columns with excessive missing data.
@@ -250,12 +286,12 @@ def basic_clean_drop(input_filepath: Union[str,Path], output_filepath: Union[str
250
286
  dropping process are saved to the specified log directory.
251
287
 
252
288
  Args:
253
- input_filepath (str, Path):
289
+ input_filepath (str | Path):
254
290
  The path to the source CSV file to be cleaned.
255
- output_filepath (str, Path):
291
+ output_filepath (str | Path):
256
292
  The path to save the fully cleaned CSV file after cleaning
257
293
  and missing-data-based pruning.
258
- log_directory (str, Path):
294
+ log_directory (str | Path):
259
295
  Path to the directory where missing data reports will be stored.
260
296
  targets (list[str]):
261
297
  A list of column names to be treated as target variables.
@@ -267,6 +303,8 @@ def basic_clean_drop(input_filepath: Union[str,Path], output_filepath: Union[str
267
303
  The proportion of missing data required to drop a row or column.
268
304
  For example, 0.8 means a row/column will be dropped if 80% or more
269
305
  of its data is missing.
306
+ all_lowercase (bool):
307
+ Whether to normalize all text to lowercase.
270
308
  """
271
309
  # handle log path
272
310
  log_path = make_fullpath(log_directory, make=True, enforce="directory")
@@ -278,7 +316,7 @@ def basic_clean_drop(input_filepath: Union[str,Path], output_filepath: Union[str
278
316
  df, _ = load_dataframe(df_path=input_path, kind="polars", all_strings=True)
279
317
 
280
318
  # CLEAN
281
- df_cleaned = _cleaner_core(df)
319
+ df_cleaned = _cleaner_core(df_in=df, all_lowercase=all_lowercase)
282
320
 
283
321
  # switch to pandas
284
322
  df_cleaned_pandas = df_cleaned.to_pandas()
@@ -291,7 +329,7 @@ def basic_clean_drop(input_filepath: Union[str,Path], output_filepath: Union[str
291
329
  threshold=threshold)
292
330
 
293
331
  # Save cleaned dataframe
294
- save_dataframe(df=df_final, save_dir=output_path.parent, filename=output_path.name)
332
+ save_dataframe_filename(df=df_final, save_dir=output_path.parent, filename=output_path.name)
295
333
 
296
334
  _LOGGER.info(f"Data successfully cleaned.")
297
335
 
@@ -456,7 +494,7 @@ class DataFrameCleaner:
456
494
  if isinstance(output_filepath, str):
457
495
  output_filepath = make_fullpath(input_path=output_filepath, enforce="file")
458
496
 
459
- save_dataframe(df=df_clean, save_dir=output_filepath.parent, filename=output_filepath.name)
497
+ save_dataframe_filename(df=df_clean, save_dir=output_filepath.parent, filename=output_filepath.name)
460
498
 
461
499
  return None
462
500