dragon_ml_toolbox-2.1.0-py3-none-any.whl → dragon_ml_toolbox-2.2.1-py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.

This version of dragon-ml-toolbox has been flagged as potentially problematic.

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dragon-ml-toolbox
- Version: 2.1.0
+ Version: 2.2.1
  Summary: A collection of tools for data science and machine learning projects
  Author-email: Karl Loza <luigiloza@gmail.com>
  License-Expression: MIT
@@ -1,20 +1,21 @@
- dragon_ml_toolbox-2.1.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
- dragon_ml_toolbox-2.1.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=6cfpIeQ6D4Mcs10nkogQrkVyq1T7i2qXjjNHFoUMOyE,1892
+ dragon_ml_toolbox-2.2.1.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
+ dragon_ml_toolbox-2.2.1.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=6cfpIeQ6D4Mcs10nkogQrkVyq1T7i2qXjjNHFoUMOyE,1892
+ ml_tools/ETL_engineering.py,sha256=meQwdMUmAGXmrOSF5K5MaIhztvAbwxPeKnPnv8TxBi0,23283
  ml_tools/MICE_imputation.py,sha256=1fovHycZMdZ6OgVh_bk8-r3wGi4rqf6rS10LOEWYaQo,11177
- ml_tools/PSO_optimization.py,sha256=vty1dZDY7P2iGUuE_oojyGdgM1EkDj5kXCfCxRMdk28,20957
+ ml_tools/PSO_optimization.py,sha256=T-wnB94DcRWuRd2M3loDVT4POtIP0MOhs-VilAf1L4E,20974
  ml_tools/VIF_factor.py,sha256=lpM3Z2X_iZfXUWbCbURoeI0Tb196lU0bAsRo7q6AzBM,10235
  ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  ml_tools/_particle_swarm_optimization.py,sha256=b_eNNkA89Y40hj76KauivT8KLScH1B9wF2IXptOqkOw,22220
- ml_tools/data_exploration.py,sha256=CDUVRTHfww105IXDRpBQ81KZWx5HXSsA-FVsVYBzNw8,21298
+ ml_tools/data_exploration.py,sha256=Fzbz_DKZ7F2e3-JbahLqKr3aP6lt9aCK9rNOHvR7nlA,23665
  ml_tools/datasetmaster.py,sha256=EFUEX-tqq94Ak1rXXYR-XaX85olrxvF2EuytdzUK7y0,29131
  ml_tools/ensemble_learning.py,sha256=q9jbu7SupvXz61sURFQ9V2-7gUsLbA3cSgyb2MQFyyc,37351
  ml_tools/handle_excel.py,sha256=Uasx-DX7RNVQSzGHVJhX7UQ9RgBbX5H1ud1Hw_y8Kp4,12944
  ml_tools/logger.py,sha256=_k7WJdpFJj3IsjOgvjLJgUFZyF8RK3Jlgp5tAu_dLQU,4767
  ml_tools/pytorch_models.py,sha256=bpWZsrSwCvHJQkR6UfoPpElsMv9AvmiNErNHC8NYB_I,10132
  ml_tools/trainer.py,sha256=WAZ4EdrZuTOAnGXRWV3XcLNce4s7EKGf2-qchLC08Ik,15702
- ml_tools/utilities.py,sha256=5vVXqIH-jiY4PHUAoDI1o26mZYPsmrWO6I97Fs3oC90,18661
+ ml_tools/utilities.py,sha256=A7Wm1ArpqFG80WKmnkYdtSzIRLvg5x-9nPNidZIbpPA,20671
  ml_tools/vision_helpers.py,sha256=idQ-Ugp1IdsvwXiYyhYa9G3rTRTm37YRpkQDLEpANHM,7701
- dragon_ml_toolbox-2.1.0.dist-info/METADATA,sha256=LDXrXkR1nm6WiEVHudCy7wI0dwkMejT0NzPuYptGSmw,2974
- dragon_ml_toolbox-2.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- dragon_ml_toolbox-2.1.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
- dragon_ml_toolbox-2.1.0.dist-info/RECORD,,
+ dragon_ml_toolbox-2.2.1.dist-info/METADATA,sha256=1Xjem3tZp5rlaFrz5_lQKdtal_jUB9lKRUIlQqYseyE,2974
+ dragon_ml_toolbox-2.2.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ dragon_ml_toolbox-2.2.1.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+ dragon_ml_toolbox-2.2.1.dist-info/RECORD,,
ml_tools/ETL_engineering.py ADDED
@@ -0,0 +1,601 @@
1
+ import polars as pl
2
+ import re
3
+ from typing import Literal, Union, Optional, Any, Callable, List, Dict
4
+ from .utilities import _script_info
5
+
6
+
7
+ __all__ = [
8
+ "TransformationRecipe",
9
+ "DataProcessor",
10
+ "KeywordDummifier",
11
+ "NumberExtractor",
12
+ "MultiNumberExtractor",
13
+ "CategoryMapper",
14
+ "ValueBinner",
15
+ "DateFeatureExtractor"
16
+ ]
17
+
18
+ # Magic word for rename-only transformation
19
+ _RENAME = "rename"
20
+
21
+ class TransformationRecipe:
22
+ """
23
+ A builder class for creating a data transformation recipe.
24
+
25
+ This class provides a structured way to define a series of transformation
26
+ steps, with validation performed at the time of addition. It is designed
27
+ to be passed to a `DataProcessor`.
28
+
29
+ Use the method `add()` to add recipes.
30
+ """
31
+ def __init__(self):
32
+ self._steps: List[Dict[str, Any]] = []
33
+
34
+ def add(
35
+ self,
36
+ input_col_name: str,
37
+ output_col_names: Union[str, List[str]],
38
+ transform: Union[str, Callable],
39
+ ) -> "TransformationRecipe":
40
+ """
41
+ Adds a new transformation step to the recipe.
42
+
43
+ Args:
44
+ input_col_name: The name of the column from the source DataFrame.
45
+ output_col_names: The desired name(s) for the output column(s).
46
+ A string for a 1-to-1 mapping, or a list of strings
47
+ for a 1-to-many mapping.
48
+ transform: The transformation to apply:
49
+ - Use "rename" for simple column renaming
50
+ - If callable, must accept a `pl.Series` as the only parameter and return either a `pl.Series` or `pl.DataFrame`.
51
+
52
+ Returns:
53
+ The instance of the recipe itself to allow for method chaining.
54
+ """
55
+ # --- Validation ---
56
+ if not isinstance(input_col_name, str) or not input_col_name:
57
+ raise TypeError("'input_col' must be a non-empty string.")
58
+
59
+ if transform == _RENAME:
60
+ if not isinstance(output_col_names, str):
61
+ raise TypeError("For a RENAME operation, 'output_col' must be a string.")
62
+ elif not isinstance(transform, Callable):
63
+ raise TypeError(f"'transform' must be a callable function or the string '{_RENAME}'.")
64
+
65
+ if isinstance(output_col_names, list) and transform == _RENAME:
66
+ raise ValueError("A RENAME operation cannot have a list of output columns.")
67
+
68
+ # --- Add Step ---
69
+ step = {
70
+ "input_col": input_col_name,
71
+ "output_col": output_col_names,
72
+ "transform": transform,
73
+ }
74
+ self._steps.append(step)
75
+ return self # Allow chaining: recipe.add(...).add(...)
76
+
77
+ def __iter__(self):
78
+ """Allows the class to be iterated over, like a list."""
79
+ return iter(self._steps)
80
+
81
+ def __len__(self):
82
+ """Allows the len() function to be used on an instance."""
83
+ return len(self._steps)
84
+
85
+
86
+ class DataProcessor:
87
+ """
88
+ Transforms a Polars DataFrame based on a provided `TransformationRecipe` object.
89
+
90
+ Use the method `transform()`.
91
+ """
92
+ def __init__(self, recipe: TransformationRecipe):
93
+ """
94
+ Initializes the DataProcessor with a transformation recipe.
95
+
96
+ Args:
97
+ recipe: An instance of the `TransformationRecipe` class that has
98
+ been populated with transformation steps.
99
+ """
100
+ if not isinstance(recipe, TransformationRecipe):
101
+ raise TypeError("The recipe must be an instance of TransformationRecipe.")
102
+ if len(recipe) == 0:
103
+ raise ValueError("The recipe cannot be empty.")
104
+ self._recipe = recipe
105
+
106
+ def transform(self, df: pl.DataFrame) -> pl.DataFrame:
107
+ """
108
+ Applies the transformation recipe to the input DataFrame.
109
+ """
110
+ processed_columns = []
111
+ # Recipe object is iterable
112
+ for step in self._recipe:
113
+ input_col_name = step["input_col"]
114
+ output_col_spec = step["output_col"]
115
+ transform_action = step["transform"]
116
+
117
+ if input_col_name not in df.columns:
118
+ raise ValueError(f"Input column '{input_col_name}' not found in DataFrame.")
119
+
120
+ input_series = df.get_column(input_col_name)
121
+
122
+ if transform_action == _RENAME:
123
+ processed_columns.append(input_series.alias(output_col_spec))
124
+ continue
125
+
126
+ if isinstance(transform_action, Callable):
127
+ result = transform_action(input_series)
128
+
129
+ if isinstance(result, pl.Series):
130
+ if not isinstance(output_col_spec, str):
131
+ raise TypeError(f"Function for '{input_col_name}' returned a Series but 'output_col' is not a string.")
132
+ processed_columns.append(result.alias(output_col_spec))
133
+
134
+ elif isinstance(result, pl.DataFrame):
135
+ if not isinstance(output_col_spec, list):
136
+ raise TypeError(f"Function for '{input_col_name}' returned a DataFrame but 'output_col' is not a list.")
137
+ if len(result.columns) != len(output_col_spec):
138
+ raise ValueError(
139
+ f"Mismatch in '{input_col_name}': function produced {len(result.columns)} columns, "
140
+ f"but recipe specifies {len(output_col_spec)} output names."
141
+ )
142
+
143
+ renamed_df = result.rename(dict(zip(result.columns, output_col_spec)))
144
+ processed_columns.extend(renamed_df.get_columns())
145
+
146
+ else:
147
+ raise TypeError(f"Function for '{input_col_name}' returned an unexpected type: {type(result)}.")
148
+
149
+ else: # This case is now unlikely due to builder validation.
150
+ raise TypeError(f"Invalid 'transform' action for '{input_col_name}': {transform_action}")
151
+
152
+ if not processed_columns:
153
+ print("Warning: The transformation resulted in an empty DataFrame.")
154
+ return pl.DataFrame()
155
+
156
+ return pl.DataFrame(processed_columns)
157
+
158
+ def __str__(self) -> str:
159
+ """
160
+ Provides a detailed, human-readable string representation of the
161
+ entire processing pipeline.
162
+ """
163
+ header = "DataProcessor Pipeline"
164
+ divider = "-" * len(header)
165
+ num_steps = len(self._recipe)
166
+
167
+ lines = [
168
+ header,
169
+ divider,
170
+ f"Number of steps: {num_steps}\n"
171
+ ]
172
+
173
+ if num_steps == 0:
174
+ lines.append("No transformation steps defined.")
175
+ return "\n".join(lines)
176
+
177
+ for i, step in enumerate(self._recipe, 1):
178
+ transform_action = step["transform"]
179
+
180
+ # Get a clean name for the transformation action
181
+ if transform_action == _RENAME: # "rename"
182
+ transform_name = "Rename"
183
+ else:
184
+ # This works for both functions and class instances
185
+ transform_name = type(transform_action).__name__
186
+
187
+ lines.append(f"[{i}] Input: '{step['input_col']}'")
188
+ lines.append(f" - Transform: {transform_name}")
189
+ lines.append(f" - Output(s): {step['output_col']}")
190
+ if i < num_steps:
191
+ lines.append("") # Add a blank line between steps
192
+
193
+ return "\n".join(lines)
194
+
195
+ def inspect(self) -> None:
196
+ """
197
+ Prints the detailed string representation of the pipeline to the console.
198
+ """
199
+ print(self)
200
+
201
+
202
+ class KeywordDummifier:
203
+ """
204
+ A configurable transformer that creates one-hot encoded columns based on
205
+ keyword matching in a Polars Series.
206
+
207
+ Instantiate this class with keyword configurations. The instance can be used as a 'transform' callable compatible with the `TransformationRecipe`.
208
+
209
+ Args:
210
+ group_names (List[str]):
211
+ A list of strings, where each string is the name of a category.
212
+ This defines the matching priority and the base column names of the
213
+ DataFrame returned by the transformation.
214
+ group_keywords (List[List[str]]):
215
+ A list of lists of strings. Each inner list corresponds to a
216
+ `group_name` at the same index and contains the keywords to search for.
217
+ """
218
+ def __init__(self, group_names: List[str], group_keywords: List[List[str]]):
219
+ if len(group_names) != len(group_keywords):
220
+ raise ValueError("Initialization failed: 'group_names' and 'group_keywords' must have the same length.")
221
+
222
+ self.group_names = group_names
223
+ self.group_keywords = group_keywords
224
+
225
+ def __call__(self, column: pl.Series) -> pl.DataFrame:
226
+ """
227
+ Executes the one-hot encoding logic.
228
+
229
+ Args:
230
+ column (pl.Series): The input Polars Series to transform.
231
+
232
+ Returns:
233
+ pl.DataFrame: A DataFrame with one-hot encoded columns.
234
+ """
235
+ column = column.cast(pl.Utf8)
236
+
237
+ categorize_expr = pl.when(pl.lit(False)).then(pl.lit(None))
238
+ for name, keywords in zip(self.group_names, self.group_keywords):
239
+ pattern = "|".join(re.escape(k) for k in keywords)
240
+ categorize_expr = categorize_expr.when(
241
+ column.str.contains(pattern)
242
+ ).then(pl.lit(name))
243
+
244
+ categorize_expr = categorize_expr.otherwise(None).alias("category")
245
+
246
+ temp_df = pl.DataFrame(categorize_expr)
247
+ df_with_dummies = temp_df.to_dummies(columns=["category"])
248
+
249
+ final_columns = []
250
+ for name in self.group_names:
251
+ dummy_col_name = f"category_{name}"
252
+ if dummy_col_name in df_with_dummies.columns:
253
+ # The alias here uses the group name as the temporary column name
254
+ final_columns.append(
255
+ df_with_dummies.get_column(dummy_col_name).alias(name)
256
+ )
257
+ else:
258
+ final_columns.append(pl.lit(0, dtype=pl.UInt8).alias(name))
259
+
260
+ return pl.DataFrame(final_columns)
261
+
262
+
263
+ class NumberExtractor:
264
+ """
265
+ A configurable transformer that extracts a single number from a Polars string series using a regular expression.
266
+
267
+ An instance can be used as a 'transform' callable within the
268
+ `DataProcessor` pipeline.
269
+
270
+ Args:
271
+ regex_pattern (str):
272
+ The regular expression used to find the number. This pattern
273
+ MUST contain exactly one capturing group `(...)`. Defaults to a standard pattern for integers and floats.
274
+ dtype (str):
275
+ The desired data type for the output column. Defaults to "float".
276
+ round_digits (int | None):
277
+ If the dtype is 'float', you can specify the number of decimal
278
+ places to round the result to. This parameter is ignored if
279
+ dtype is 'int'. Defaults to None (no rounding).
280
+ """
281
+ def __init__(
282
+ self,
283
+ regex_pattern: str = r"(\d+\.?\d*)",
284
+ dtype: Literal["float", "int"] = "float",
285
+ round_digits: Optional[int] = None,
286
+ ):
287
+ # --- Validation ---
288
+ if not isinstance(regex_pattern, str):
289
+ raise TypeError("regex_pattern must be a string.")
290
+
291
+ # Validate that the regex has exactly one capturing group
292
+ try:
293
+ if re.compile(regex_pattern).groups != 1:
294
+ raise ValueError("regex_pattern must contain exactly one capturing group '(...)'")
295
+ except re.error as e:
296
+ raise ValueError(f"Invalid regex pattern provided: {e}") from e
297
+
298
+ if dtype not in ["float", "int"]:
299
+ raise ValueError("dtype must be either 'float' or 'int'.")
300
+
301
+ if round_digits is not None:
302
+ if not isinstance(round_digits, int):
303
+ raise TypeError("round_digits must be an integer.")
304
+ if dtype == "int":
305
+ print(f"Warning: 'round_digits' is specified but dtype is 'int'. Rounding will be ignored.")
306
+
307
+ self.regex_pattern = regex_pattern
308
+ self.dtype = dtype
309
+ self.round_digits = round_digits
310
+ self.polars_dtype = pl.Float64 if dtype == "float" else pl.Int64
311
+
312
+ def __call__(self, column: pl.Series) -> pl.Series:
313
+ """
314
+ Executes the number extraction logic.
315
+
316
+ Args:
317
+ column (pl.Series): The input Polars Series to transform.
318
+
319
+ Returns:
320
+ pl.Series: A new Series containing the extracted numbers.
321
+ """
322
+ # Extract the first (and only) capturing group
323
+ extracted = column.str.extract(self.regex_pattern, 1)
324
+
325
+ # Cast to the desired numeric type. Non-matching strings become null.
326
+ casted = extracted.cast(self.polars_dtype, strict=False)
327
+
328
+ # Apply rounding only if it's a float and round_digits is set
329
+ if self.dtype == "float" and self.round_digits is not None:
330
+ return casted.round(self.round_digits)
331
+
332
+ return casted
333
+
334
+
335
+ class MultiNumberExtractor:
336
+ """
337
+ Extracts multiple numbers from a single polars string column into several new columns.
338
+
339
+ This transformer is designed for one-to-many mappings, such as parsing
340
+ ratios (100:30) or coordinates (10, 25) into separate columns.
341
+
342
+ Args:
343
+ num_outputs (int):
344
+ Number of numeric columns to create.
345
+ regex_pattern (str):
346
+ The regex pattern to find all numbers. Must contain one
347
+ capturing group around the number part.
348
+ Defaults to a standard pattern for integers and floats.
349
+ dtype (str):
350
+ The desired data type for the output columns. Defaults to "float".
351
+ fill_value (int | float | None):
352
+ A value to fill in when fewer numbers are found than `num_outputs` in an otherwise matching string.
353
+ - For example, if `num_outputs=2` and only one number is found in a string, the second output column will be filled with this value. If None, it will be filled with null.
354
+ """
355
+ def __init__(
356
+ self,
357
+ num_outputs: int,
358
+ regex_pattern: str = r"(\d+\.?\d*)",
359
+ dtype: Literal["float", "int"] = "float",
360
+ fill_value: Optional[Union[int, float]] = None
361
+ ):
362
+ # --- Validation ---
363
+ if not isinstance(num_outputs, int) or num_outputs <= 0:
364
+ raise ValueError("num_outputs must be a positive integer.")
365
+
366
+ if not isinstance(regex_pattern, str):
367
+ raise TypeError("regex_pattern must be a string.")
368
+
369
+ # Validate that the regex has exactly one capturing group
370
+ try:
371
+ if re.compile(regex_pattern).groups != 1:
372
+ raise ValueError("regex_pattern must contain exactly one capturing group '(...)'")
373
+ except re.error as e:
374
+ raise ValueError(f"Invalid regex pattern provided: {e}") from e
375
+
376
+ # Validate dtype
377
+ if dtype not in ["float", "int"]:
378
+ raise ValueError("dtype must be either 'float' or 'int'.")
379
+
380
+ self.num_outputs = num_outputs
381
+ self.regex_pattern = regex_pattern
382
+ self.fill_value = fill_value
383
+ self.polars_dtype = pl.Float64 if dtype == "float" else pl.Int64
384
+
385
+ def __call__(self, column: pl.Series) -> pl.DataFrame:
386
+ """
387
+ Executes the multi-number extraction logic. Preserves nulls from the input column.
388
+ """
389
+ output_expressions = []
390
+ for i in range(self.num_outputs):
391
+ # Define the core extraction logic for the i-th number
392
+ extraction_expr = (
393
+ column.str.extract_all(self.regex_pattern)
394
+ .list.get(i)
395
+ .cast(self.polars_dtype, strict=False)
396
+ )
397
+
398
+ # Apply the fill value if provided
399
+ if self.fill_value is not None:
400
+ extraction_expr = extraction_expr.fill_null(self.fill_value)
401
+
402
+ # Only apply the logic when the input is not null.
403
+ # Otherwise, the result should also be null.
404
+ final_expr = (
405
+ pl.when(column.is_not_null())
406
+ .then(extraction_expr)
407
+ .otherwise(None)
408
+ .alias(f"col_{i}") # Name the final output expression
409
+ )
410
+
411
+ output_expressions.append(final_expr)
412
+
413
+ return pl.select(output_expressions)
414
+
415
+
416
+ class CategoryMapper:
417
+ """
418
+ A transformer that maps string categories to specified numerical values using a dictionary.
419
+
420
+ Ideal for ordinal encoding.
421
+
422
+ Args:
423
+ mapping (Dict[str, Union[int, float]]):
424
+ A dictionary that defines the mapping from a string category (key)
425
+ to a numerical value (value).
426
+ unseen_value (int | float | None):
427
+ The numerical value to use for categories that are present in the
428
+ data but not in the mapping dictionary. If not provided or set
429
+ to None, unseen categories will be mapped to a null value.
430
+ """
431
+ def __init__(
432
+ self,
433
+ mapping: Dict[str, Union[int, float]],
434
+ unseen_value: Optional[Union[int, float]] = None,
435
+ ):
436
+ if not isinstance(mapping, dict):
437
+ raise TypeError("The 'mapping' argument must be a dictionary.")
438
+
439
+ self.mapping = mapping
440
+ self.default_value = unseen_value
441
+
442
+ def __call__(self, column: pl.Series) -> pl.Series:
443
+ """
444
+ Applies the dictionary mapping to the input column.
445
+
446
+ Args:
447
+ column (pl.Series): The input Polars Series of categories.
448
+
449
+ Returns:
450
+ pl.Series: A new Series with categories mapped to numbers.
451
+ """
452
+ # Ensure the column is treated as a string for matching keys
453
+ str_column = column.cast(pl.Utf8)
454
+
455
+ # Create a list of 'when/then' expressions, one for each mapping
456
+ mapping_expressions = [
457
+ pl.when(str_column == from_val).then(pl.lit(to_val))
458
+ for from_val, to_val in self.mapping.items()
459
+ ]
460
+
461
+ # Use coalesce to find the first non-null value.
462
+ # The default_value acts as the final fallback.
463
+ final_expr = pl.coalesce(
464
+ *mapping_expressions, # Unpack the list of expressions
465
+ pl.lit(self.default_value)
466
+ )
467
+
468
+ return pl.select(final_expr).to_series()
469
+
470
+
471
+ class ValueBinner:
472
+ """
473
+ A transformer that discretizes a continuous numerical column into a finite number of bins.
474
+
475
+ Each bin is assigned an integer label (0, 1, 2, ...).
476
+
477
+ Args:
478
+ breaks (List[int | float]):
479
+ A list of numbers defining the boundaries of the bins. The list
480
+ must be sorted in ascending order and contain at least two values.
481
+ For example, `breaks=[0, 18, 40, 65]` creates three bins.
482
+ left_closed (bool):
483
+ Determines which side of the interval is inclusive.
484
+ - If `False` (default): Intervals are (lower, upper].
485
+ - If `True`: Intervals are [lower, upper).
486
+ """
487
+ def __init__(
488
+ self,
489
+ breaks: List[Union[int, float]],
490
+ left_closed: bool = False,
491
+ ):
492
+ # --- Validation ---
493
+ if not isinstance(breaks, list) or len(breaks) < 2:
494
+ raise ValueError("The 'breaks' argument must be a list of at least two numbers.")
495
+
496
+ # Check if the list is sorted
497
+ if not all(breaks[i] <= breaks[i+1] for i in range(len(breaks)-1)):
498
+ raise ValueError("The 'breaks' list must be sorted in ascending order.")
499
+
500
+ self.breaks = breaks
501
+ self.left_closed = left_closed
502
+ # Generate numerical labels [0, 1, 2, ...] for the bins
503
+ self.labels = [str(i) for i in range(len(breaks) - 1)]
504
+
505
+ def __call__(self, column: pl.Series) -> pl.Series:
506
+ """
507
+ Applies the binning logic to the input column.
508
+
509
+ Args:
510
+ column (pl.Series): The input Polars Series of numerical data.
511
+
512
+ Returns:
513
+ pl.Series: A new Series of integer labels for the bins. Values
514
+ outside the specified breaks will become null.
515
+ """
516
+ # `cut` creates a new column of type Categorical
517
+ binned_column = column.cut(
518
+ breaks=self.breaks,
519
+ labels=self.labels,
520
+ left_closed=self.left_closed
521
+ )
522
+
523
+ # to_physical() converts the Categorical type to its underlying
524
+ # integer representation (u32), which is perfect for ML.
525
+ return binned_column.to_physical()
526
+
527
+
528
+ class DateFeatureExtractor:
529
+ """
530
+ A one-to-many transformer that extracts multiple numerical features from a date or datetime column.
531
+
532
+ It can handle columns that are already in a Polars Date/Datetime format,
533
+ or it can parse string columns if a format is provided.
534
+
535
+ Args:
536
+ features (List[str]):
537
+ A list of the date/time features to extract. Supported features are:
538
+ 'year', 'month', 'day', 'hour', 'minute', 'second', 'millisecond',
539
+ 'microsecond', 'nanosecond', 'ordinal_day' (day of year),
540
+ 'weekday' (Mon=1, Sun=7), 'week' (week of year), and 'timestamp'.
541
+ format (str | None):
542
+ The format code used to parse string dates (e.g., "%Y-%m-%d %H:%M:%S").
543
+ Use if the input column is not a Date or Datetime type.
544
+ """
545
+
546
+ ALLOWED_FEATURES = {
547
+ 'year', 'month', 'day', 'hour', 'minute', 'second', 'millisecond',
548
+ 'microsecond', 'nanosecond', 'ordinal_day', 'weekday', 'week', 'timestamp'
549
+ }
550
+
551
+ def __init__(
552
+ self,
553
+ features: List[str],
554
+ format: Optional[str] = None,
555
+ ):
556
+ # --- Validation ---
557
+ if not isinstance(features, list) or not features:
558
+ raise ValueError("'features' must be a non-empty list of strings.")
559
+
560
+ for feature in features:
561
+ if feature not in self.ALLOWED_FEATURES:
562
+ raise ValueError(
563
+ f"Feature '{feature}' is not supported. "
564
+ f"Allowed features are: {self.ALLOWED_FEATURES}"
565
+ )
566
+
567
+ self.features = features
568
+ self.format = format
569
+
570
+ def __call__(self, column: pl.Series) -> pl.DataFrame:
571
+ """
572
+ Applies the feature extraction logic to the input column.
573
+
574
+ Args:
575
+ column (pl.Series): The input Polars Series of dates.
576
+
577
+ Returns:
578
+ pl.DataFrame: A DataFrame with columns for each extracted feature.
579
+ """
580
+ date_col = column
581
+ # First, parse strings into a datetime object if a format is given
582
+ if self.format is not None:
583
+ date_col = date_col.str.to_datetime(format=self.format, strict=False)
584
+
585
+ output_expressions = []
586
+ for i, feature in enumerate(self.features):
587
+ # Build the expression based on the feature name
588
+ if feature == 'timestamp':
589
+ expr = date_col.dt.timestamp(time_unit="ms")
590
+ else:
591
+ # getattr is a clean way to call methods like .dt.year(), .dt.month(), etc.
592
+ expr = getattr(date_col.dt, feature)()
593
+
594
+ # Alias with a generic name for the processor to handle
595
+ output_expressions.append(expr.alias(f"col_{i}"))
596
+
597
+ return pl.select(output_expressions)
598
+
599
+
600
+ def info():
601
+ _script_info(__all__)
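
For orientation, here is a minimal usage sketch of the new module: a recipe is built step by step, then handed to a DataProcessor. The column names and sample values below are hypothetical, not taken from the package.

import polars as pl
from ml_tools.ETL_engineering import TransformationRecipe, DataProcessor, NumberExtractor, KeywordDummifier

# Hypothetical raw data
df = pl.DataFrame({
    "price_raw": ["USD 19.99", "USD 5", None],
    "material": ["stainless steel", "carbon fiber", "steel alloy"],
})

# Build the recipe; add() returns the recipe itself, so calls can be chained
recipe = (
    TransformationRecipe()
    .add(input_col_name="price_raw", output_col_names="price",
         transform=NumberExtractor(dtype="float", round_digits=2))           # 1-to-1: returns a Series
    .add(input_col_name="material", output_col_names=["steel", "carbon"],
         transform=KeywordDummifier(group_names=["steel", "carbon"],
                                    group_keywords=[["steel"], ["carbon"]]))  # 1-to-many: returns a DataFrame
)

processor = DataProcessor(recipe)
processor.inspect()                 # prints the pipeline summary
clean_df = processor.transform(df)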
ml_tools/PSO_optimization.py CHANGED
@@ -340,8 +340,8 @@ def _pso(func: ObjectiveFunction,
  lb: np.ndarray,
  ub: np.ndarray,
  device: torch.device,
- swarmsize=100,
- maxiter=100,
+ swarmsize: int,
+ maxiter: int,
  omega = 0.729, # Clerc and Kennedy’s constriction coefficient
  phip = 1.49445, # Clerc and Kennedy’s constriction coefficient
  phig = 1.49445, # Clerc and Kennedy’s constriction coefficient
@@ -391,7 +391,7 @@ def _pso(func: ObjectiveFunction,
  If True, returns the full history of particle positions and objective scores at each iteration.

  seed : int or None, default=None
- Random seed for reproducibility. If None, defaults to 42.
+ Random seed for reproducibility. If None, the random state is not fixed.

  Returns
  -------
ml_tools/data_exploration.py CHANGED
@@ -1,4 +1,5 @@
  import pandas as pd
+ from pandas.api.types import is_numeric_dtype
  import numpy as np
  import matplotlib.pyplot as plt
  import seaborn as sns
@@ -24,7 +25,8 @@ __all__ = [
  "plot_value_distributions",
  "clip_outliers_single",
  "clip_outliers_multi",
- "match_and_filter_columns_by_regex"
+ "match_and_filter_columns_by_regex",
+ "standardize_percentages"
  ]


@@ -575,6 +577,72 @@ def match_and_filter_columns_by_regex(
575
577
  return filtered_df, matched_columns
576
578
 
577
579
 
580
+ def standardize_percentages(
581
+ df: pd.DataFrame,
582
+ columns: list[str],
583
+ treat_one_as_proportion: bool = True,
584
+ round_digits: int = 2
585
+ ) -> pd.DataFrame:
586
+ """
587
+ Standardizes numeric columns containing mixed-format percentages.
588
+
589
+ This function cleans columns where percentages might be entered as whole
590
+ numbers (e.g., 55) or as proportions (e.g., 0.55). It assumes values
591
+ between 0 and 1 are proportions and multiplies them by 100.
592
+
593
+ Args:
594
+ df (pd.DataFrame): The input pandas DataFrame.
595
+ columns (list[str]): A list of column names to standardize.
596
+ treat_one_as_proportion (bool):
597
+ - If True (default): The value `1` is treated as a proportion and converted to `100`.
598
+ - If False: The value `1` is treated as `1%`.
599
+ round_digits (int): The number of decimal places to round the final result to.
600
+
601
+ Returns:
602
+ (pd.DataFrame):
603
+ A new DataFrame with the specified columns cleaned and standardized.
604
+ """
605
+ df_copy = df.copy()
606
+
607
+ if df_copy.empty:
608
+ return df_copy
609
+
610
+ # This helper function contains the core cleaning logic
611
+ def _clean_value(x: float) -> float:
612
+ """Applies the standardization rule to a single value."""
613
+ if pd.isna(x):
614
+ return x
615
+
616
+ # If treat_one_as_proportion is True, the range for proportions is [0, 1]
617
+ if treat_one_as_proportion and 0 <= x <= 1:
618
+ return x * 100
619
+ # If False, the range for proportions is [0, 1) (1 is excluded)
620
+ elif not treat_one_as_proportion and 0 <= x < 1:
621
+ return x * 100
622
+
623
+ # Otherwise, the value is assumed to be a correctly formatted percentage
624
+ return x
625
+
626
+ for col in columns:
627
+ # --- Robustness Checks ---
628
+ if col not in df_copy.columns:
629
+ print(f"Warning: Column '{col}' not found. Skipping.")
630
+ continue
631
+
632
+ if not is_numeric_dtype(df_copy[col]):
633
+ print(f"Warning: Column '{col}' is not numeric. Skipping.")
634
+ continue
635
+
636
+ # --- Applying the Logic ---
637
+ # Apply the cleaning function to every value in the column
638
+ df_copy[col] = df_copy[col].apply(_clean_value)
639
+
640
+ # Round the result
641
+ df_copy[col] = df_copy[col].round(round_digits)
642
+
643
+ return df_copy
644
+
645
+
578
646
  def _is_notebook():
579
647
  return get_ipython() is not None
580
648
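
A short sketch of how the new helper might be called; the DataFrame below is made up for illustration.

import pandas as pd
from ml_tools.data_exploration import standardize_percentages

# Mixed entry formats: proportions (0.55) and whole-number percentages (55)
df = pd.DataFrame({"moisture": [0.55, 55.0, 1.0, None]})

cleaned = standardize_percentages(df, columns=["moisture"])
# With treat_one_as_proportion=True (the default): 0.55 -> 55.0, 1.0 -> 100.0, 55.0 stays 55.0, NaN is preserved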
 
ml_tools/utilities.py CHANGED
@@ -144,23 +144,61 @@ def list_files_by_extension(directory: Union[str,Path], extension: str) -> dict[
144
144
  return name_path_dict
145
145
 
146
146
 
147
- def load_dataframe(df_path: Union[str,Path]) -> tuple[pd.DataFrame, str]:
147
+ def load_dataframe(
148
+ df_path: Union[str, Path],
149
+ kind: Literal["pandas", "polars"] = "pandas",
150
+ all_strings: bool = False
151
+ ) -> Tuple[Union[pd.DataFrame, pl.DataFrame], str]:
148
152
  """
149
- Load a CSV file into a pandas DataFrame and extract the base name (without extension) from the file path.
153
+ Load a CSV file into a DataFrame and extract its base name.
154
+
155
+ Can load data as either a pandas or a polars DataFrame. Allows for loading all
156
+ columns as string types to prevent type inference errors.
150
157
 
151
158
  Args:
152
- df_path (str | Path): The path to the CSV file.
159
+ df_path (Union[str, Path]):
160
+ The path to the CSV file.
161
+ kind (Literal["pandas", "polars"], optional):
162
+ The type of DataFrame to load. Defaults to "pandas".
163
+ all_strings (bool, optional):
164
+ If True, loads all columns as string data types. This is useful for
165
+ ETL tasks and to avoid type-inference errors. Defaults to False.
153
166
 
154
167
  Returns:
155
- Tuple ([pd.DataFrame, str]):
156
- A tuple containing the loaded pandas DataFrame and the base name of the file.
168
+ (Tuple[DataFrameType, str]):
169
+ A tuple containing the loaded DataFrame (either pandas or polars)
170
+ and the base name of the file (without extension).
171
+
172
+ Raises:
173
+ FileNotFoundError: If the file does not exist at the given path.
174
+ ValueError: If the DataFrame is empty or an invalid 'kind' is provided.
157
175
  """
158
176
  path = make_fullpath(df_path)
159
- df = pd.read_csv(path, encoding='utf-8')
177
+
160
178
  df_name = path.stem
161
- if df.empty:
162
- raise ValueError(f"DataFrame '{df_name}' is empty.")
163
- print(f"\n💿 Loaded dataset: '{df_name}' with shape: {df.shape}")
179
+
180
+ if kind == "pandas":
181
+ if all_strings:
182
+ df = pd.read_csv(path, encoding='utf-8', dtype=str)
183
+ else:
184
+ df = pd.read_csv(path, encoding='utf-8')
185
+
186
+ elif kind == "polars":
187
+ if all_strings:
188
+ df = pl.read_csv(path, infer_schema=False)
189
+ else:
190
+ # Default behavior: infer the schema.
191
+ df = pl.read_csv(path, infer_schema_length=1000)
192
+
193
+ else:
194
+ raise ValueError(f"Invalid kind '{kind}'. Must be one of 'pandas' or 'polars'.")
195
+
196
+ # This check works for both pandas and polars DataFrames
197
+ if df.shape[0] == 0:
198
+ raise ValueError(f"DataFrame '{df_name}' loaded from '{path}' is empty.")
199
+
200
+ print(f"\n💿 Loaded {kind} dataset: '{df_name}' with shape: {df.shape}")
201
+
164
202
  return df, df_name
165
203
 
166
204
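
A quick sketch of the extended loader; the file path is hypothetical.

from ml_tools.utilities import load_dataframe

# Default behavior: pandas with inferred dtypes
df_pd, name = load_dataframe("data/measurements.csv")

# Polars with every column read as a string, e.g. before an ETL_engineering pipeline
df_pl, name = load_dataframe("data/measurements.csv", kind="polars", all_strings=True)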
 
@@ -247,29 +285,42 @@ def merge_dataframes(
247
285
  return merged_df
248
286
 
249
287
 
250
- def save_dataframe(df: pd.DataFrame, save_dir: Union[str,Path], filename: str) -> None:
288
+ def save_dataframe(df: Union[pd.DataFrame, pl.DataFrame], save_dir: Union[str,Path], filename: str) -> None:
251
289
  """
252
- Save a pandas DataFrame to a CSV file.
290
+ Saves a pandas or polars DataFrame to a CSV file.
253
291
 
254
- Parameters:
255
- df (pd.DataFrame): Dataframe to save.
256
- save_dir (str | Path): Directory where the CSV file will be saved.
257
- filename (str): CSV filename, extension will be added if missing.
292
+ Args:
293
+ df (Union[pd.DataFrame, pl.DataFrame]):
294
+ The DataFrame to save.
295
+ save_dir (Union[str, Path]):
296
+ The directory where the CSV file will be saved.
297
+ filename (str):
298
+ The CSV filename. The '.csv' extension will be added if missing.
258
299
  """
259
- if df.empty:
300
+ # This check works for both pandas and polars
301
+ if df.shape[0] == 0:
260
302
  print(f"⚠️ Attempting to save an empty DataFrame: '{filename}'. Process Skipped.")
261
303
  return
262
304
 
305
+ # Create the directory if it doesn't exist
263
306
  save_path = make_fullpath(save_dir, make=True)
264
307
 
308
+ # Clean the filename
265
309
  filename = sanitize_filename(filename)
266
-
267
310
  if not filename.endswith('.csv'):
268
311
  filename += '.csv'
269
312
 
270
313
  output_path = save_path / filename
271
314
 
272
- df.to_csv(output_path, index=False, encoding='utf-8')
315
+ # --- Type-specific saving logic ---
316
+ if isinstance(df, pd.DataFrame):
317
+ df.to_csv(output_path, index=False, encoding='utf-8')
318
+ elif isinstance(df, pl.DataFrame):
319
+ df.write_csv(output_path) # Polars defaults to utf8 and no index
320
+ else:
321
+ # This error handles cases where an unsupported type is passed
322
+ raise TypeError(f"Unsupported DataFrame type: {type(df)}. Must be pandas or polars.")
323
+
273
324
  print(f"✅ Saved dataset: '{filename}' with shape: {df.shape}")
274
325
 
275
326
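
The matching save call now accepts either library's DataFrame; a minimal sketch with made-up paths, continuing the load_dataframe example above.

from ml_tools.utilities import save_dataframe

# Works for both pandas and polars; '.csv' is appended if missing
save_dataframe(df_pl, save_dir="output", filename="measurements_clean")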
 
@@ -446,7 +497,7 @@ def threshold_binary_values_batch(
446
497
  return np.hstack([cont_part, bin_part])
447
498
 
448
499
 
449
- def serialize_object(obj: Any, save_dir: Union[str,Path], filename: str, verbose: bool=True, raise_on_error: bool=False) -> Optional[str]:
500
+ def serialize_object(obj: Any, save_dir: Union[str,Path], filename: str, verbose: bool=True, raise_on_error: bool=False) -> Optional[Path]:
450
501
  """
451
502
  Serializes a Python object using joblib; suitable for Python built-ins, numpy, and pandas.
452
503
 
@@ -456,7 +507,7 @@ def serialize_object(obj: Any, save_dir: Union[str,Path], filename: str, verbose
456
507
  filename (str) : Name for the output file, extension will be appended if needed.
457
508
 
458
509
  Returns:
459
- (str | None) : The full file path where the object was saved if successful; otherwise, None.
510
+ (Path | None) : The full file path where the object was saved if successful; otherwise, None.
460
511
  """
461
512
  try:
462
513
  save_path = make_fullpath(save_dir, make=True)
@@ -540,7 +591,7 @@ def distribute_datasets_by_target(
540
591
  feature_columns = [col for col in df.columns if col not in valid_targets]
541
592
 
542
593
  for target in valid_targets:
543
- subset = df[feature_columns + [target]].dropna(subset=[target])
594
+ subset = df[feature_columns + [target]].dropna(subset=[target]) # type: ignore
544
595
  if verbose:
545
596
  print(f"Target: '{target}' - Dataframe shape: {subset.shape}")
546
597
  yield target, subset