dragon-ml-toolbox 3.1.0__py3-none-any.whl → 3.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dragon-ml-toolbox might be problematic. Click here for more details.
- {dragon_ml_toolbox-3.1.0.dist-info → dragon_ml_toolbox-3.2.0.dist-info}/METADATA +1 -1
- {dragon_ml_toolbox-3.1.0.dist-info → dragon_ml_toolbox-3.2.0.dist-info}/RECORD +7 -7
- ml_tools/ETL_engineering.py +111 -21
- {dragon_ml_toolbox-3.1.0.dist-info → dragon_ml_toolbox-3.2.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-3.1.0.dist-info → dragon_ml_toolbox-3.2.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-3.1.0.dist-info → dragon_ml_toolbox-3.2.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-3.1.0.dist-info → dragon_ml_toolbox-3.2.0.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
dragon_ml_toolbox-3.
|
|
2
|
-
dragon_ml_toolbox-3.
|
|
3
|
-
ml_tools/ETL_engineering.py,sha256=
|
|
1
|
+
dragon_ml_toolbox-3.2.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
|
|
2
|
+
dragon_ml_toolbox-3.2.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=6cfpIeQ6D4Mcs10nkogQrkVyq1T7i2qXjjNHFoUMOyE,1892
|
|
3
|
+
ml_tools/ETL_engineering.py,sha256=xbGti0XS84l1UWQ6y3mP-VV_4DfaRBoz8-B3bTgiCUE,36690
|
|
4
4
|
ml_tools/GUI_tools.py,sha256=uFx6zIrQZzDPSTtOSHz8ptz-fxZiQz-lXHcrqwuYV_E,20385
|
|
5
5
|
ml_tools/MICE_imputation.py,sha256=ed-YeQkEAeHxTNkWIHs09T4YeYNF0aqAnrUTcdIEp9E,11372
|
|
6
6
|
ml_tools/ML_callbacks.py,sha256=gHZk-lyzAax6iEtG26zHuoobdAZCFJ6BmI6pWoXkOrw,13189
|
|
@@ -19,7 +19,7 @@ ml_tools/ensemble_learning.py,sha256=PPtBBLgLvaYOdY-MlcjXuxWWXf3JQavLNEysFgzjc_s
|
|
|
19
19
|
ml_tools/handle_excel.py,sha256=lwds7rDLlGSCWiWGI7xNg-Z7kxAepogp0lstSFa0590,12949
|
|
20
20
|
ml_tools/logger.py,sha256=jC4Q2OqmDm8ZO9VpuZqBSWdXryqaJvLscqVJ6caNMOk,6009
|
|
21
21
|
ml_tools/utilities.py,sha256=opNR-ACH6BnLkWAKcb19ef5tFxfx22TI6E2o0RYwiGA,21021
|
|
22
|
-
dragon_ml_toolbox-3.
|
|
23
|
-
dragon_ml_toolbox-3.
|
|
24
|
-
dragon_ml_toolbox-3.
|
|
25
|
-
dragon_ml_toolbox-3.
|
|
22
|
+
dragon_ml_toolbox-3.2.0.dist-info/METADATA,sha256=QLfQmo-eUJFkFOKbbZzZKMZ5SARrrcbhfkffrPkMHvc,3273
|
|
23
|
+
dragon_ml_toolbox-3.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
24
|
+
dragon_ml_toolbox-3.2.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
|
|
25
|
+
dragon_ml_toolbox-3.2.0.dist-info/RECORD,,
|
ml_tools/ETL_engineering.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import polars as pl
|
|
2
2
|
import re
|
|
3
|
-
from typing import Literal, Union, Optional, Any, Callable, List, Dict
|
|
3
|
+
from typing import Literal, Union, Optional, Any, Callable, List, Dict, Tuple
|
|
4
4
|
from .utilities import _script_info
|
|
5
5
|
import pandas as pd
|
|
6
6
|
from .logger import _LOGGER
|
|
@@ -11,6 +11,7 @@ __all__ = [
|
|
|
11
11
|
"DataFrameCleaner",
|
|
12
12
|
"TransformationRecipe",
|
|
13
13
|
"DataProcessor",
|
|
14
|
+
"BinaryTransformer",
|
|
14
15
|
"KeywordDummifier",
|
|
15
16
|
"NumberExtractor",
|
|
16
17
|
"MultiNumberExtractor",
|
|
@@ -144,7 +145,7 @@ class DataFrameCleaner:
|
|
|
144
145
|
return df_cleaned
|
|
145
146
|
|
|
146
147
|
|
|
147
|
-
############ TRANSFORM ####################
|
|
148
|
+
############ TRANSFORM MAIN ####################
|
|
148
149
|
|
|
149
150
|
# Magic word for rename-only transformation
|
|
150
151
|
_RENAME = "rename"
|
|
@@ -329,6 +330,75 @@ class DataProcessor:
|
|
|
329
330
|
"""
|
|
330
331
|
print(self)
|
|
331
332
|
|
|
333
|
+
############ TRANSFORMERS ####################
|
|
334
|
+
|
|
335
|
+
class BinaryTransformer:
    """
    A transformer that maps string values to a binary 1 or 0 based on keyword matching.

    Must supply a list of keywords for either the 'true' case (1) or the 'false' case (0), but not both.

    Keywords are treated as literal substrings: each one is regex-escaped and the
    escaped keywords are joined into a single alternation pattern.

    Args:
        true_keywords (List[str] | None):
            If a string contains any of these keywords, the output is 1, otherwise 0.
        false_keywords (List[str] | None):
            If a string contains any of these keywords, the output is 0, otherwise 1.
        case_insensitive (bool):
            If True, keyword matching ignores case.

    Raises:
        ValueError: If both keyword lists are given, if neither is given,
            or if the supplied list is empty.
    """
    def __init__(
        self,
        true_keywords: Optional[List[str]] = None,
        false_keywords: Optional[List[str]] = None,
        case_insensitive: bool = True,
    ):
        # --- Validation: Enforce one and only one option ---
        if true_keywords is not None and false_keywords is not None:
            raise ValueError(
                "Provide either 'true_keywords' or 'false_keywords', but not both."
            )
        if true_keywords is None and false_keywords is None:
            raise ValueError(
                "You must provide either 'true_keywords' or 'false_keywords'."
            )

        # --- Configuration ---
        # Exactly one of the two lists is set at this point; keep whichever it is.
        self.keywords: List[str] = true_keywords if true_keywords is not None else false_keywords  # type: ignore
        if not self.keywords:
            raise ValueError("Keyword list cannot be empty.")

        # The mode decides whether a keyword hit maps to 1 ("true_mode") or to 0 ("false_mode").
        self.mode: str = "true_mode" if true_keywords is not None else "false_mode"

        # --- Create the regex string pattern ---
        # Escape keywords to treat them as literals
        base_pattern = "|".join(re.escape(k) for k in self.keywords)

        # For polars, add case-insensitivity flag `(?i)` to the pattern string itself
        self.pattern = f"(?i){base_pattern}" if case_insensitive else base_pattern

    def __call__(self, column: "pl.Series") -> "pl.Series":
        """
        Applies the binary mapping logic to the input column.

        Args:
            column (pl.Series): The input Polars Series. It is cast to string
                before matching, so non-string columns are accepted as well.

        Returns:
            pl.Series: A new Series of type UInt8 containing 1s and 0s.
            NOTE(review): null inputs are expected to stay null, since Polars
            propagates nulls through `str.contains`, negation and casts - confirm
            if callers need them filled.
        """
        # Normalize to string first; `.str.contains` is only defined for string
        # Series and would raise on numeric or other non-string columns.
        str_column = column.cast(pl.Utf8)

        # Create a boolean Series: True if any keyword is found, else False
        contains_keyword = str_column.str.contains(self.pattern)

        # Apply logic and cast directly to integer type
        if self.mode == "true_mode":
            # True -> 1, False -> 0
            return contains_keyword.cast(pl.UInt8)

        # false_mode: we want the inverse, True -> 0, False -> 1
        return (~contains_keyword).cast(pl.UInt8)
|
|
401
|
+
|
|
332
402
|
|
|
333
403
|
class KeywordDummifier:
|
|
334
404
|
"""
|
|
@@ -345,13 +415,16 @@ class KeywordDummifier:
|
|
|
345
415
|
group_keywords (List[List[str]]):
|
|
346
416
|
A list of lists of strings. Each inner list corresponds to a
|
|
347
417
|
`group_name` at the same index and contains the keywords to search for.
|
|
418
|
+
case_insensitive (bool):
|
|
419
|
+
If True, keyword matching ignores case.
|
|
348
420
|
"""
|
|
349
|
-
def __init__(self, group_names: List[str], group_keywords: List[List[str]]):
|
|
421
|
+
def __init__(self, group_names: List[str], group_keywords: List[List[str]], case_insensitive: bool = True):
|
|
350
422
|
if len(group_names) != len(group_keywords):
|
|
351
423
|
raise ValueError("Initialization failed: 'group_names' and 'group_keywords' must have the same length.")
|
|
352
424
|
|
|
353
425
|
self.group_names = group_names
|
|
354
426
|
self.group_keywords = group_keywords
|
|
427
|
+
self.case_insensitive = case_insensitive
|
|
355
428
|
|
|
356
429
|
def __call__(self, column: pl.Series) -> pl.DataFrame:
|
|
357
430
|
"""
|
|
@@ -365,9 +438,18 @@ class KeywordDummifier:
|
|
|
365
438
|
"""
|
|
366
439
|
column = column.cast(pl.Utf8)
|
|
367
440
|
|
|
368
|
-
categorize_expr = pl.when(pl.lit(False)).then(pl.lit(None))
|
|
441
|
+
categorize_expr = pl.when(pl.lit(False)).then(pl.lit(None, dtype=pl.Utf8))
|
|
442
|
+
|
|
369
443
|
for name, keywords in zip(self.group_names, self.group_keywords):
|
|
370
|
-
pattern
|
|
444
|
+
# Create the base regex pattern by escaping and joining keywords
|
|
445
|
+
base_pattern = "|".join(re.escape(k) for k in keywords)
|
|
446
|
+
|
|
447
|
+
# Add the case-insensitive flag `(?i)` to the pattern string
|
|
448
|
+
if self.case_insensitive:
|
|
449
|
+
pattern = f"(?i){base_pattern}"
|
|
450
|
+
else:
|
|
451
|
+
pattern = base_pattern
|
|
452
|
+
|
|
371
453
|
categorize_expr = categorize_expr.when(
|
|
372
454
|
column.str.contains(pattern)
|
|
373
455
|
).then(pl.lit(name))
|
|
@@ -386,6 +468,7 @@ class KeywordDummifier:
|
|
|
386
468
|
df_with_dummies.get_column(dummy_col_name).alias(name)
|
|
387
469
|
)
|
|
388
470
|
else:
|
|
471
|
+
# If a group had no matches, create a column of zeros
|
|
389
472
|
final_columns.append(pl.lit(0, dtype=pl.UInt8).alias(name))
|
|
390
473
|
|
|
391
474
|
return pl.DataFrame(final_columns)
|
|
@@ -661,33 +744,42 @@ class RegexMapper:
|
|
|
661
744
|
"first match wins" logic makes the order of the mapping important.
|
|
662
745
|
|
|
663
746
|
Args:
|
|
664
|
-
mapping (Dict[str,
|
|
747
|
+
mapping (Dict[str, int | float]):
|
|
665
748
|
An ordered dictionary where keys are regex patterns and values are
|
|
666
749
|
the numbers to map to if the pattern is found.
|
|
667
|
-
unseen_value (
|
|
750
|
+
unseen_value (int | float | None):
|
|
668
751
|
The numerical value to use for strings that do not match any
|
|
669
|
-
of the regex patterns. If None
|
|
670
|
-
|
|
752
|
+
of the regex patterns. If None, unseen values are mapped to null.
|
|
753
|
+
case_insensitive (bool):
|
|
754
|
+
If True, the regex matching for all patterns will ignore case.
|
|
671
755
|
"""
|
|
672
756
|
def __init__(
|
|
673
757
|
self,
|
|
674
758
|
mapping: Dict[str, Union[int, float]],
|
|
675
759
|
unseen_value: Optional[Union[int, float]] = None,
|
|
760
|
+
case_insensitive: bool = True,
|
|
676
761
|
):
|
|
677
762
|
# --- Validation ---
|
|
678
763
|
if not isinstance(mapping, dict):
|
|
679
764
|
raise TypeError("The 'mapping' argument must be a dictionary.")
|
|
765
|
+
|
|
766
|
+
self.unseen_value = unseen_value
|
|
680
767
|
|
|
768
|
+
# --- Process and validate patterns ---
|
|
769
|
+
# Process patterns here to be more efficient, avoiding reprocessing on every __call__.
|
|
770
|
+
self.processed_mapping: List[Tuple[str, Union[int, float]]] = []
|
|
681
771
|
for pattern, value in mapping.items():
|
|
772
|
+
final_pattern = f"(?i){pattern}" if case_insensitive else pattern
|
|
773
|
+
|
|
774
|
+
# Validate the final pattern that will actually be used by Polars
|
|
682
775
|
try:
|
|
683
|
-
re.compile(
|
|
776
|
+
re.compile(final_pattern)
|
|
684
777
|
except re.error as e:
|
|
685
|
-
raise ValueError(f"Invalid regex pattern '{
|
|
778
|
+
raise ValueError(f"Invalid regex pattern '{final_pattern}': {e}") from e
|
|
686
779
|
if not isinstance(value, (int, float)):
|
|
687
780
|
raise TypeError(f"Mapping values must be int or float, but got {type(value)} for pattern '{pattern}'.")
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
self.unseen_value = unseen_value
|
|
781
|
+
|
|
782
|
+
self.processed_mapping.append((final_pattern, value))
|
|
691
783
|
|
|
692
784
|
def __call__(self, column: pl.Series) -> pl.Series:
|
|
693
785
|
"""
|
|
@@ -700,22 +792,20 @@ class RegexMapper:
|
|
|
700
792
|
pl.Series: A new Series with strings mapped to numbers based on
|
|
701
793
|
the first matching regex pattern.
|
|
702
794
|
"""
|
|
703
|
-
#
|
|
704
|
-
str_column = column.cast(pl.
|
|
795
|
+
# pl.String is the modern alias for pl.Utf8
|
|
796
|
+
str_column = column.cast(pl.String)
|
|
705
797
|
|
|
706
|
-
#
|
|
707
|
-
# Start with the final fallback value for non-matches.
|
|
798
|
+
# Start with the fallback value for non-matches.
|
|
708
799
|
mapping_expr = pl.lit(self.unseen_value)
|
|
709
800
|
|
|
710
|
-
# Iterate through the mapping in reverse to construct the nested expression
|
|
711
|
-
for pattern, value in reversed(
|
|
801
|
+
# Iterate through the processed mapping in reverse to construct the nested expression
|
|
802
|
+
for pattern, value in reversed(self.processed_mapping):
|
|
712
803
|
mapping_expr = (
|
|
713
804
|
pl.when(str_column.str.contains(pattern))
|
|
714
805
|
.then(pl.lit(value))
|
|
715
806
|
.otherwise(mapping_expr)
|
|
716
807
|
)
|
|
717
808
|
|
|
718
|
-
# Execute the complete expression chain and return the resulting Series
|
|
719
809
|
return pl.select(mapping_expr).to_series()
|
|
720
810
|
|
|
721
811
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|