dragon-ml-toolbox 10.12.1__tar.gz → 10.14.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dragon-ml-toolbox might be problematic. Click here for more details.

Files changed (41)
  1. {dragon_ml_toolbox-10.12.1 → dragon_ml_toolbox-10.14.0}/LICENSE +1 -1
  2. {dragon_ml_toolbox-10.12.1 → dragon_ml_toolbox-10.14.0}/LICENSE-THIRD-PARTY.md +1 -0
  3. {dragon_ml_toolbox-10.12.1/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-10.14.0}/PKG-INFO +3 -28
  4. {dragon_ml_toolbox-10.12.1 → dragon_ml_toolbox-10.14.0}/README.md +0 -21
  5. {dragon_ml_toolbox-10.12.1 → dragon_ml_toolbox-10.14.0/dragon_ml_toolbox.egg-info}/PKG-INFO +3 -28
  6. {dragon_ml_toolbox-10.12.1 → dragon_ml_toolbox-10.14.0}/dragon_ml_toolbox.egg-info/requires.txt +2 -7
  7. {dragon_ml_toolbox-10.12.1 → dragon_ml_toolbox-10.14.0}/ml_tools/ETL_cleaning.py +28 -16
  8. {dragon_ml_toolbox-10.12.1 → dragon_ml_toolbox-10.14.0}/ml_tools/ensemble_inference.py +1 -1
  9. {dragon_ml_toolbox-10.12.1 → dragon_ml_toolbox-10.14.0}/pyproject.toml +5 -12
  10. {dragon_ml_toolbox-10.12.1 → dragon_ml_toolbox-10.14.0}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
  11. {dragon_ml_toolbox-10.12.1 → dragon_ml_toolbox-10.14.0}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
  12. {dragon_ml_toolbox-10.12.1 → dragon_ml_toolbox-10.14.0}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
  13. {dragon_ml_toolbox-10.12.1 → dragon_ml_toolbox-10.14.0}/ml_tools/ETL_engineering.py +0 -0
  14. {dragon_ml_toolbox-10.12.1 → dragon_ml_toolbox-10.14.0}/ml_tools/GUI_tools.py +0 -0
  15. {dragon_ml_toolbox-10.12.1 → dragon_ml_toolbox-10.14.0}/ml_tools/MICE_imputation.py +0 -0
  16. {dragon_ml_toolbox-10.12.1 → dragon_ml_toolbox-10.14.0}/ml_tools/ML_callbacks.py +0 -0
  17. {dragon_ml_toolbox-10.12.1 → dragon_ml_toolbox-10.14.0}/ml_tools/ML_datasetmaster.py +0 -0
  18. {dragon_ml_toolbox-10.12.1 → dragon_ml_toolbox-10.14.0}/ml_tools/ML_evaluation.py +0 -0
  19. {dragon_ml_toolbox-10.12.1 → dragon_ml_toolbox-10.14.0}/ml_tools/ML_evaluation_multi.py +0 -0
  20. {dragon_ml_toolbox-10.12.1 → dragon_ml_toolbox-10.14.0}/ml_tools/ML_inference.py +0 -0
  21. {dragon_ml_toolbox-10.12.1 → dragon_ml_toolbox-10.14.0}/ml_tools/ML_models.py +0 -0
  22. {dragon_ml_toolbox-10.12.1 → dragon_ml_toolbox-10.14.0}/ml_tools/ML_optimization.py +0 -0
  23. {dragon_ml_toolbox-10.12.1 → dragon_ml_toolbox-10.14.0}/ml_tools/ML_scaler.py +0 -0
  24. {dragon_ml_toolbox-10.12.1 → dragon_ml_toolbox-10.14.0}/ml_tools/ML_trainer.py +0 -0
  25. {dragon_ml_toolbox-10.12.1 → dragon_ml_toolbox-10.14.0}/ml_tools/PSO_optimization.py +0 -0
  26. {dragon_ml_toolbox-10.12.1 → dragon_ml_toolbox-10.14.0}/ml_tools/RNN_forecast.py +0 -0
  27. {dragon_ml_toolbox-10.12.1 → dragon_ml_toolbox-10.14.0}/ml_tools/SQL.py +0 -0
  28. {dragon_ml_toolbox-10.12.1 → dragon_ml_toolbox-10.14.0}/ml_tools/VIF_factor.py +0 -0
  29. {dragon_ml_toolbox-10.12.1 → dragon_ml_toolbox-10.14.0}/ml_tools/__init__.py +0 -0
  30. {dragon_ml_toolbox-10.12.1 → dragon_ml_toolbox-10.14.0}/ml_tools/_logger.py +0 -0
  31. {dragon_ml_toolbox-10.12.1 → dragon_ml_toolbox-10.14.0}/ml_tools/_script_info.py +0 -0
  32. {dragon_ml_toolbox-10.12.1 → dragon_ml_toolbox-10.14.0}/ml_tools/custom_logger.py +0 -0
  33. {dragon_ml_toolbox-10.12.1 → dragon_ml_toolbox-10.14.0}/ml_tools/data_exploration.py +0 -0
  34. {dragon_ml_toolbox-10.12.1 → dragon_ml_toolbox-10.14.0}/ml_tools/ensemble_evaluation.py +0 -0
  35. {dragon_ml_toolbox-10.12.1 → dragon_ml_toolbox-10.14.0}/ml_tools/ensemble_learning.py +0 -0
  36. {dragon_ml_toolbox-10.12.1 → dragon_ml_toolbox-10.14.0}/ml_tools/handle_excel.py +0 -0
  37. {dragon_ml_toolbox-10.12.1 → dragon_ml_toolbox-10.14.0}/ml_tools/keys.py +0 -0
  38. {dragon_ml_toolbox-10.12.1 → dragon_ml_toolbox-10.14.0}/ml_tools/optimization_tools.py +0 -0
  39. {dragon_ml_toolbox-10.12.1 → dragon_ml_toolbox-10.14.0}/ml_tools/path_manager.py +0 -0
  40. {dragon_ml_toolbox-10.12.1 → dragon_ml_toolbox-10.14.0}/ml_tools/utilities.py +0 -0
  41. {dragon_ml_toolbox-10.12.1 → dragon_ml_toolbox-10.14.0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  MIT License
2
2
 
3
- Copyright (c) 2025 Karl Loza
3
+ Copyright (c) 2025 Karl Luigi Loza Vidaurre
4
4
 
5
5
  Permission is hereby granted, free of charge, to any person obtaining a copy
6
6
  of this software and associated documentation files (the "Software"), to deal
@@ -26,3 +26,4 @@ This project depends on the following third-party packages. Each is governed by
26
26
  - [polars](https://github.com/pola-rs/polars/blob/main/LICENSE)
27
27
  - [plotnine](https://github.com/has2k1/plotnine/blob/main/LICENSE)
28
28
  - [tqdm](https://github.com/tqdm/tqdm/blob/master/LICENSE)
29
+ - [pyarrow](https://github.com/apache/arrow/blob/main/LICENSE.txt)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dragon-ml-toolbox
3
- Version: 10.12.1
3
+ Version: 10.14.0
4
4
  Summary: A collection of tools for data science and machine learning projects.
5
5
  Author-email: Karl Loza <luigiloza@gmail.com>
6
6
  License-Expression: MIT
@@ -12,12 +12,6 @@ Requires-Python: >=3.10
12
12
  Description-Content-Type: text/markdown
13
13
  License-File: LICENSE
14
14
  License-File: LICENSE-THIRD-PARTY.md
15
- Provides-Extra: base
16
- Requires-Dist: pandas; extra == "base"
17
- Requires-Dist: numpy; extra == "base"
18
- Requires-Dist: polars; extra == "base"
19
- Requires-Dist: joblib; extra == "base"
20
- Requires-Dist: colorlog; extra == "base"
21
15
  Provides-Extra: ml
22
16
  Requires-Dist: numpy>=2.0; extra == "ml"
23
17
  Requires-Dist: pandas; extra == "ml"
@@ -38,6 +32,7 @@ Requires-Dist: shap; extra == "ml"
38
32
  Requires-Dist: tqdm; extra == "ml"
39
33
  Requires-Dist: Pillow; extra == "ml"
40
34
  Requires-Dist: evotorch; extra == "ml"
35
+ Requires-Dist: pyarrow; extra == "ml"
41
36
  Requires-Dist: colorlog; extra == "ml"
42
37
  Provides-Extra: mice
43
38
  Requires-Dist: numpy<2.0; extra == "mice"
@@ -51,6 +46,7 @@ Requires-Dist: statsmodels; extra == "mice"
51
46
  Requires-Dist: lightgbm<=4.5.0; extra == "mice"
52
47
  Requires-Dist: shap; extra == "mice"
53
48
  Requires-Dist: colorlog; extra == "mice"
49
+ Requires-Dist: pyarrow; extra == "mice"
54
50
  Provides-Extra: pytorch
55
51
  Requires-Dist: torch; extra == "pytorch"
56
52
  Requires-Dist: torchvision; extra == "pytorch"
@@ -255,27 +251,6 @@ path_manager
255
251
 
256
252
  ---
257
253
 
258
- ### 🎫 Base Tools [base]
259
-
260
- General purpose functions and classes.
261
-
262
- ```Bash
263
- pip install "dragon-ml-toolbox[base]"
264
- ```
265
-
266
- #### Modules:
267
-
268
- ```Bash
269
- ETL_cleaning
270
- ETL_engineering
271
- custom_logger
272
- SQL
273
- utilities
274
- path_manager
275
- ```
276
-
277
- ---
278
-
279
254
  ### ⚒️ APP bundlers
280
255
 
281
256
  Choose one if needed.
@@ -170,27 +170,6 @@ path_manager
170
170
 
171
171
  ---
172
172
 
173
- ### 🎫 Base Tools [base]
174
-
175
- General purpose functions and classes.
176
-
177
- ```Bash
178
- pip install "dragon-ml-toolbox[base]"
179
- ```
180
-
181
- #### Modules:
182
-
183
- ```Bash
184
- ETL_cleaning
185
- ETL_engineering
186
- custom_logger
187
- SQL
188
- utilities
189
- path_manager
190
- ```
191
-
192
- ---
193
-
194
173
  ### ⚒️ APP bundlers
195
174
 
196
175
  Choose one if needed.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dragon-ml-toolbox
3
- Version: 10.12.1
3
+ Version: 10.14.0
4
4
  Summary: A collection of tools for data science and machine learning projects.
5
5
  Author-email: Karl Loza <luigiloza@gmail.com>
6
6
  License-Expression: MIT
@@ -12,12 +12,6 @@ Requires-Python: >=3.10
12
12
  Description-Content-Type: text/markdown
13
13
  License-File: LICENSE
14
14
  License-File: LICENSE-THIRD-PARTY.md
15
- Provides-Extra: base
16
- Requires-Dist: pandas; extra == "base"
17
- Requires-Dist: numpy; extra == "base"
18
- Requires-Dist: polars; extra == "base"
19
- Requires-Dist: joblib; extra == "base"
20
- Requires-Dist: colorlog; extra == "base"
21
15
  Provides-Extra: ml
22
16
  Requires-Dist: numpy>=2.0; extra == "ml"
23
17
  Requires-Dist: pandas; extra == "ml"
@@ -38,6 +32,7 @@ Requires-Dist: shap; extra == "ml"
38
32
  Requires-Dist: tqdm; extra == "ml"
39
33
  Requires-Dist: Pillow; extra == "ml"
40
34
  Requires-Dist: evotorch; extra == "ml"
35
+ Requires-Dist: pyarrow; extra == "ml"
41
36
  Requires-Dist: colorlog; extra == "ml"
42
37
  Provides-Extra: mice
43
38
  Requires-Dist: numpy<2.0; extra == "mice"
@@ -51,6 +46,7 @@ Requires-Dist: statsmodels; extra == "mice"
51
46
  Requires-Dist: lightgbm<=4.5.0; extra == "mice"
52
47
  Requires-Dist: shap; extra == "mice"
53
48
  Requires-Dist: colorlog; extra == "mice"
49
+ Requires-Dist: pyarrow; extra == "mice"
54
50
  Provides-Extra: pytorch
55
51
  Requires-Dist: torch; extra == "pytorch"
56
52
  Requires-Dist: torchvision; extra == "pytorch"
@@ -255,27 +251,6 @@ path_manager
255
251
 
256
252
  ---
257
253
 
258
- ### 🎫 Base Tools [base]
259
-
260
- General purpose functions and classes.
261
-
262
- ```Bash
263
- pip install "dragon-ml-toolbox[base]"
264
- ```
265
-
266
- #### Modules:
267
-
268
- ```Bash
269
- ETL_cleaning
270
- ETL_engineering
271
- custom_logger
272
- SQL
273
- utilities
274
- path_manager
275
- ```
276
-
277
- ---
278
-
279
254
  ### ⚒️ APP bundlers
280
255
 
281
256
  Choose one if needed.
@@ -19,13 +19,7 @@ shap
19
19
  tqdm
20
20
  Pillow
21
21
  evotorch
22
- colorlog
23
-
24
- [base]
25
- pandas
26
- numpy
27
- polars
28
- joblib
22
+ pyarrow
29
23
  colorlog
30
24
 
31
25
  [excel]
@@ -61,6 +55,7 @@ statsmodels
61
55
  lightgbm<=4.5.0
62
56
  shap
63
57
  colorlog
58
+ pyarrow
64
59
 
65
60
  [nuitka]
66
61
  nuitka
@@ -96,7 +96,7 @@ def save_unique_values(csv_path: Union[str, Path], output_dir: Union[str, Path],
96
96
 
97
97
 
98
98
  ########## Basic df cleaners #############
99
- def _cleaner_core(df_in: pl.DataFrame) -> pl.DataFrame:
99
+ def _cleaner_core(df_in: pl.DataFrame, all_lowercase: bool) -> pl.DataFrame:
100
100
  # Cleaning rules
101
101
  cleaning_rules = {
102
102
  # 1. Comprehensive Punctuation & Symbol Normalization
@@ -128,7 +128,7 @@ def _cleaner_core(df_in: pl.DataFrame) -> pl.DataFrame:
128
128
  # Punctuation
129
129
  '》': '>', '《': '<', ':': ':', '。': '.', ';': ';', '【': '[', '】': ']',
130
130
  '(': '(', ')': ')', '?': '?', '!': '!', '~': '~', '@': '@', '#': '#', '+': '+', '-': '-',
131
- '$': '$', '%': '%', '^': '^', '&': '&', '*': '*', '\': '-', '|': '|', '≈':'=',
131
+ '$': '$', '%': '%', '^': '^', '&': '&', '*': '*', '\': '-', '|': '|', '≈':'=', '·': '-',
132
132
 
133
133
  # Commas (avoid commas in entries)
134
134
  ',': ';',
@@ -159,6 +159,9 @@ def _cleaner_core(df_in: pl.DataFrame) -> pl.DataFrame:
159
159
  r'!{2,}': '!', # Replace two or more exclamation marks with a single one
160
160
  r';{2,}': ';',
161
161
  r'-{2,}': '-',
162
+ r'/{2,}': '/',
163
+ r'%{2,}': '%',
164
+ r'&{2,}': '&',
162
165
 
163
166
  # 2. Internal Whitespace Consolidation
164
167
  # Collapse any sequence of whitespace chars (including non-breaking spaces) to a single space
@@ -170,7 +173,7 @@ def _cleaner_core(df_in: pl.DataFrame) -> pl.DataFrame:
170
173
 
171
174
  # 4. Textual Null Standardization (New Step)
172
175
  # Convert common null-like text to actual nulls.
173
- r'^(N/A|无|NA|NULL|NONE|NIL|-|\.|;)$': None,
176
+ r'^(N/A|无|NA|NULL|NONE|NIL|-|\.|;|/|%|&)$': None,
174
177
 
175
178
  # 5. Final Nullification of Empty Strings
176
179
  # After all cleaning, if a string is now empty, convert it to a null
@@ -191,9 +194,13 @@ def _cleaner_core(df_in: pl.DataFrame) -> pl.DataFrame:
191
194
  df_cleaned = df_cleaner.clean(df_in, clone_df=False) # Use clone_df=False for efficiency
192
195
 
193
196
  # apply lowercase to all string columns
194
- df_final = df_cleaned.with_columns(
195
- pl.col(pl.String).str.to_lowercase()
196
- )
197
+ if all_lowercase:
198
+ df_final = df_cleaned.with_columns(
199
+ pl.col(pl.String).str.to_lowercase()
200
+ )
201
+ else:
202
+ df_final = df_cleaned
203
+
197
204
  except Exception as e:
198
205
  _LOGGER.error(f"An error occurred during the cleaning process.")
199
206
  raise e
@@ -211,7 +218,7 @@ def _path_manager(path_in: Union[str,Path], path_out: Union[str,Path]):
211
218
  return input_path, output_path
212
219
 
213
220
 
214
- def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path]):
221
+ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path], all_lowercase: bool=True):
215
222
  """
216
223
  Performs a comprehensive, standardized cleaning on all columns of a CSV file.
217
224
 
@@ -221,13 +228,16 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
221
228
  - Stripping any leading or trailing whitespace.
222
229
  - Converting common textual representations of null (e.g., "N/A", "NULL") to true null values.
223
230
  - Converting strings that become empty after cleaning into true null values.
224
- - Normalizing all text to lowercase.
231
+ - Normalizing all text to lowercase (Optional).
225
232
 
226
233
  Args:
227
- input_filepath (Union[str, Path]):
234
+ input_filepath (str | Path):
228
235
  The path to the source CSV file to be cleaned.
229
- output_filepath (Union[str, Path, None], optional):
236
+ output_filepath (str | Path):
230
237
  The path to save the cleaned CSV file.
238
+ all_lowercase (bool):
239
+ Whether to normalize all text to lowercase.
240
+
231
241
  """
232
242
  # Handle paths
233
243
  input_path, output_path = _path_manager(path_in=input_filepath, path_out=output_filepath)
@@ -236,7 +246,7 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
236
246
  df, _ = load_dataframe(df_path=input_path, kind="polars", all_strings=True)
237
247
 
238
248
  # CLEAN
239
- df_final = _cleaner_core(df)
249
+ df_final = _cleaner_core(df_in=df, all_lowercase=all_lowercase)
240
250
 
241
251
  # Save cleaned dataframe
242
252
  save_dataframe(df=df_final, save_dir=output_path.parent, filename=output_path.name)
@@ -245,7 +255,7 @@ def basic_clean(input_filepath: Union[str,Path], output_filepath: Union[str,Path
245
255
 
246
256
 
247
257
  def basic_clean_drop(input_filepath: Union[str,Path], output_filepath: Union[str,Path], log_directory: Union[str,Path], targets: list[str],
248
- skip_targets: bool=False, threshold: float=0.8):
258
+ skip_targets: bool=False, threshold: float=0.8, all_lowercase: bool=True):
249
259
  """
250
260
  Performs standardized cleaning followed by iterative removal of rows and
251
261
  columns with excessive missing data.
@@ -262,12 +272,12 @@ def basic_clean_drop(input_filepath: Union[str,Path], output_filepath: Union[str
262
272
  dropping process are saved to the specified log directory.
263
273
 
264
274
  Args:
265
- input_filepath (str, Path):
275
+ input_filepath (str | Path):
266
276
  The path to the source CSV file to be cleaned.
267
- output_filepath (str, Path):
277
+ output_filepath (str | Path):
268
278
  The path to save the fully cleaned CSV file after cleaning
269
279
  and missing-data-based pruning.
270
- log_directory (str, Path):
280
+ log_directory (str | Path):
271
281
  Path to the directory where missing data reports will be stored.
272
282
  targets (list[str]):
273
283
  A list of column names to be treated as target variables.
@@ -279,6 +289,8 @@ def basic_clean_drop(input_filepath: Union[str,Path], output_filepath: Union[str
279
289
  The proportion of missing data required to drop a row or column.
280
290
  For example, 0.8 means a row/column will be dropped if 80% or more
281
291
  of its data is missing.
292
+ all_lowercase (bool):
293
+ Whether to normalize all text to lowercase.
282
294
  """
283
295
  # handle log path
284
296
  log_path = make_fullpath(log_directory, make=True, enforce="directory")
@@ -290,7 +302,7 @@ def basic_clean_drop(input_filepath: Union[str,Path], output_filepath: Union[str
290
302
  df, _ = load_dataframe(df_path=input_path, kind="polars", all_strings=True)
291
303
 
292
304
  # CLEAN
293
- df_cleaned = _cleaner_core(df)
305
+ df_cleaned = _cleaner_core(df_in=df, all_lowercase=all_lowercase)
294
306
 
295
307
  # switch to pandas
296
308
  df_cleaned_pandas = df_cleaned.to_pandas()
@@ -219,7 +219,7 @@ def model_report(
219
219
  return report_data
220
220
 
221
221
 
222
- # Local implementation to avoid calling utilities' dependencies
222
+ # Local implementation to avoid calling utilities dependencies
223
223
  def _deserialize_object(filepath: Union[str,Path], verbose: bool=True, raise_on_error: bool=True) -> Optional[Any]:
224
224
  """
225
225
  Loads a serialized object from a .joblib file.
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "dragon-ml-toolbox"
3
- version = "10.12.1"
3
+ version = "10.14.0"
4
4
  description = "A collection of tools for data science and machine learning projects."
5
5
  authors = [
6
6
  { name = "Karl Loza", email = "luigiloza@gmail.com" }
@@ -19,15 +19,6 @@ Homepage = "https://github.com/DrAg0n-BoRn/ML_tools"
19
19
  Changelog = "https://github.com/DrAg0n-BoRn/ML_tools/blob/master/CHANGELOG.md"
20
20
 
21
21
  [project.optional-dependencies]
22
- # Base all purpose tools
23
- base = [
24
- "pandas",
25
- "numpy",
26
- "polars",
27
- "joblib",
28
- "colorlog"
29
- ]
30
-
31
22
  # Machine Learning main toolbox. Additionally Requires PyTorch with CUDA / MPS support
32
23
  ML = [
33
24
  "numpy>=2.0",
@@ -48,7 +39,8 @@ ML = [
48
39
  "shap",
49
40
  "tqdm",
50
41
  "Pillow",
51
- "evotorch",
42
+ "evotorch",
43
+ "pyarrow",
52
44
  "colorlog"
53
45
  ]
54
46
 
@@ -64,7 +56,8 @@ mice = [
64
56
  "statsmodels",
65
57
  "lightgbm<=4.5.0",
66
58
  "shap",
67
- "colorlog"
59
+ "colorlog",
60
+ "pyarrow"
68
61
  ]
69
62
 
70
63
  # pytorch base CPU installations - not recommended