dataforge-ml 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataforge_ml-0.1.0/LICENSE +21 -0
- dataforge_ml-0.1.0/PKG-INFO +34 -0
- dataforge_ml-0.1.0/README.md +13 -0
- dataforge_ml-0.1.0/dataforge_ml.egg-info/PKG-INFO +34 -0
- dataforge_ml-0.1.0/dataforge_ml.egg-info/SOURCES.txt +57 -0
- dataforge_ml-0.1.0/dataforge_ml.egg-info/dependency_links.txt +1 -0
- dataforge_ml-0.1.0/dataforge_ml.egg-info/requires.txt +7 -0
- dataforge_ml-0.1.0/dataforge_ml.egg-info/top_level.txt +6 -0
- dataforge_ml-0.1.0/models/__init__.py +0 -0
- dataforge_ml-0.1.0/models/_data_structure.py +7 -0
- dataforge_ml-0.1.0/models/_data_types.py +12 -0
- dataforge_ml-0.1.0/profiling/__init__.py +35 -0
- dataforge_ml-0.1.0/profiling/_base.py +101 -0
- dataforge_ml-0.1.0/profiling/_boolean_config.py +37 -0
- dataforge_ml-0.1.0/profiling/_boolean_profiler.py +191 -0
- dataforge_ml-0.1.0/profiling/_categorical.py +315 -0
- dataforge_ml-0.1.0/profiling/_categorical_config.py +87 -0
- dataforge_ml-0.1.0/profiling/_correlation_config.py +225 -0
- dataforge_ml-0.1.0/profiling/_correlation_profiler.py +544 -0
- dataforge_ml-0.1.0/profiling/_datetime_config.py +98 -0
- dataforge_ml-0.1.0/profiling/_datetime_profiler.py +406 -0
- dataforge_ml-0.1.0/profiling/_missingness_config.py +137 -0
- dataforge_ml-0.1.0/profiling/_missingness_profiler.py +252 -0
- dataforge_ml-0.1.0/profiling/_numeric_config.py +116 -0
- dataforge_ml-0.1.0/profiling/_numeric_profiler.py +403 -0
- dataforge_ml-0.1.0/profiling/_tabular.py +249 -0
- dataforge_ml-0.1.0/profiling/_target_config.py +74 -0
- dataforge_ml-0.1.0/profiling/_target_profiler.py +156 -0
- dataforge_ml-0.1.0/profiling/_text_config.py +40 -0
- dataforge_ml-0.1.0/profiling/_text_profiler.py +194 -0
- dataforge_ml-0.1.0/profiling/_type_detector.py +463 -0
- dataforge_ml-0.1.0/profiling/config.py +236 -0
- dataforge_ml-0.1.0/profiling/structural.py +280 -0
- dataforge_ml-0.1.0/pyproject.toml +35 -0
- dataforge_ml-0.1.0/setup.cfg +4 -0
- dataforge_ml-0.1.0/splitting/__init__.py +4 -0
- dataforge_ml-0.1.0/splitting/_config.py +56 -0
- dataforge_ml-0.1.0/splitting/_splitter.py +202 -0
- dataforge_ml-0.1.0/tests/__init__.py +0 -0
- dataforge_ml-0.1.0/tests/conftest.py +7 -0
- dataforge_ml-0.1.0/tests/integration/__init__.py +0 -0
- dataforge_ml-0.1.0/tests/integration/conftest.py +82 -0
- dataforge_ml-0.1.0/tests/integration/test_structural_end_to_end.py +219 -0
- dataforge_ml-0.1.0/tests/unit/__init__.py +0 -0
- dataforge_ml-0.1.0/tests/unit/profiling/__init__.py +0 -0
- dataforge_ml-0.1.0/tests/unit/profiling/conftest.py +81 -0
- dataforge_ml-0.1.0/tests/unit/profiling/test_boolean_profiler.py +91 -0
- dataforge_ml-0.1.0/tests/unit/profiling/test_categorical_profiler.py +182 -0
- dataforge_ml-0.1.0/tests/unit/profiling/test_correlation_profiler.py +124 -0
- dataforge_ml-0.1.0/tests/unit/profiling/test_datetime_profiler.py +133 -0
- dataforge_ml-0.1.0/tests/unit/profiling/test_missingness_profiler.py +51 -0
- dataforge_ml-0.1.0/tests/unit/profiling/test_numeric_profiler.py +212 -0
- dataforge_ml-0.1.0/tests/unit/profiling/test_target_profiler.py +44 -0
- dataforge_ml-0.1.0/tests/unit/profiling/test_text_profiler.py +61 -0
- dataforge_ml-0.1.0/tests/unit/profiling/test_type_detector.py +32 -0
- dataforge_ml-0.1.0/tests/unit/splitting/__init__.py +0 -0
- dataforge_ml-0.1.0/tests/unit/splitting/test_data_splitter.py +417 -0
- dataforge_ml-0.1.0/utils/__init__.py +0 -0
- dataforge_ml-0.1.0/utils/data_loader.py +110 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 DEVunderdog
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dataforge-ml
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: An automated feature engineering and pipeline design library
|
|
5
|
+
License: MIT
|
|
6
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
11
|
+
Requires-Python: >=3.10
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
License-File: LICENSE
|
|
14
|
+
Requires-Dist: polars>=1.0.0
|
|
15
|
+
Requires-Dist: scikit-learn>=1.0.0
|
|
16
|
+
Requires-Dist: scipy>=1.10.0
|
|
17
|
+
Requires-Dist: numpy>=2.0.0
|
|
18
|
+
Provides-Extra: dev
|
|
19
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
20
|
+
Dynamic: license-file
|
|
21
|
+
|
|
22
|
+
# FeatureForge
|
|
23
|
+
|
|
24
|
+
Automated feature engineering and data profiling pipeline library for tabular datasets.
|
|
25
|
+
|
|
26
|
+
## Installation
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
pip install dataforge-ml
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## License
|
|
33
|
+
|
|
34
|
+
MIT
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dataforge-ml
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: An automated feature engineering and pipeline design library
|
|
5
|
+
License: MIT
|
|
6
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
11
|
+
Requires-Python: >=3.10
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
License-File: LICENSE
|
|
14
|
+
Requires-Dist: polars>=1.0.0
|
|
15
|
+
Requires-Dist: scikit-learn>=1.0.0
|
|
16
|
+
Requires-Dist: scipy>=1.10.0
|
|
17
|
+
Requires-Dist: numpy>=2.0.0
|
|
18
|
+
Provides-Extra: dev
|
|
19
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
20
|
+
Dynamic: license-file
|
|
21
|
+
|
|
22
|
+
# FeatureForge
|
|
23
|
+
|
|
24
|
+
Automated feature engineering and data profiling pipeline library for tabular datasets.
|
|
25
|
+
|
|
26
|
+
## Installation
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
pip install dataforge-ml
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## License
|
|
33
|
+
|
|
34
|
+
MIT
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
dataforge_ml.egg-info/PKG-INFO
|
|
5
|
+
dataforge_ml.egg-info/SOURCES.txt
|
|
6
|
+
dataforge_ml.egg-info/dependency_links.txt
|
|
7
|
+
dataforge_ml.egg-info/requires.txt
|
|
8
|
+
dataforge_ml.egg-info/top_level.txt
|
|
9
|
+
models/__init__.py
|
|
10
|
+
models/_data_structure.py
|
|
11
|
+
models/_data_types.py
|
|
12
|
+
profiling/__init__.py
|
|
13
|
+
profiling/_base.py
|
|
14
|
+
profiling/_boolean_config.py
|
|
15
|
+
profiling/_boolean_profiler.py
|
|
16
|
+
profiling/_categorical.py
|
|
17
|
+
profiling/_categorical_config.py
|
|
18
|
+
profiling/_correlation_config.py
|
|
19
|
+
profiling/_correlation_profiler.py
|
|
20
|
+
profiling/_datetime_config.py
|
|
21
|
+
profiling/_datetime_profiler.py
|
|
22
|
+
profiling/_missingness_config.py
|
|
23
|
+
profiling/_missingness_profiler.py
|
|
24
|
+
profiling/_numeric_config.py
|
|
25
|
+
profiling/_numeric_profiler.py
|
|
26
|
+
profiling/_tabular.py
|
|
27
|
+
profiling/_target_config.py
|
|
28
|
+
profiling/_target_profiler.py
|
|
29
|
+
profiling/_text_config.py
|
|
30
|
+
profiling/_text_profiler.py
|
|
31
|
+
profiling/_type_detector.py
|
|
32
|
+
profiling/config.py
|
|
33
|
+
profiling/structural.py
|
|
34
|
+
splitting/__init__.py
|
|
35
|
+
splitting/_config.py
|
|
36
|
+
splitting/_splitter.py
|
|
37
|
+
tests/__init__.py
|
|
38
|
+
tests/conftest.py
|
|
39
|
+
tests/integration/__init__.py
|
|
40
|
+
tests/integration/conftest.py
|
|
41
|
+
tests/integration/test_structural_end_to_end.py
|
|
42
|
+
tests/unit/__init__.py
|
|
43
|
+
tests/unit/profiling/__init__.py
|
|
44
|
+
tests/unit/profiling/conftest.py
|
|
45
|
+
tests/unit/profiling/test_boolean_profiler.py
|
|
46
|
+
tests/unit/profiling/test_categorical_profiler.py
|
|
47
|
+
tests/unit/profiling/test_correlation_profiler.py
|
|
48
|
+
tests/unit/profiling/test_datetime_profiler.py
|
|
49
|
+
tests/unit/profiling/test_missingness_profiler.py
|
|
50
|
+
tests/unit/profiling/test_numeric_profiler.py
|
|
51
|
+
tests/unit/profiling/test_target_profiler.py
|
|
52
|
+
tests/unit/profiling/test_text_profiler.py
|
|
53
|
+
tests/unit/profiling/test_type_detector.py
|
|
54
|
+
tests/unit/splitting/__init__.py
|
|
55
|
+
tests/unit/splitting/test_data_splitter.py
|
|
56
|
+
utils/__init__.py
|
|
57
|
+
utils/data_loader.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
File without changes
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""Shared Polars dtype groupings used by the profilers' type checks."""

import polars as pl

# Signed and unsigned integer dtypes — used e.g. by BooleanProfiler to detect
# integer {0, 1} columns that should be treated as boolean.
_INT_DTYPES = {
    pl.Int8, pl.Int16, pl.Int32, pl.Int64,
    pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64,
}

# Dtypes treated as categorical-like. NOTE(review): in Polars 1.x pl.Utf8 is an
# alias of pl.String, so this set deduplicates to three distinct dtypes.
_CAT_DTYPES = {pl.Utf8, pl.String, pl.Categorical, pl.Boolean}

# All numeric dtypes: every integer dtype plus the two float dtypes.
_NUMERIC_DTYPES = _INT_DTYPES | {pl.Float32, pl.Float64}

# Temporal dtypes recognised as datetime-like.
_DATETIME_DTYPES = {pl.Date, pl.Datetime}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# Re-export the public profiling API at package level so callers can write
# `from profiling import StructuralProfiler` instead of importing from the
# private (underscore-prefixed) modules directly.
from .structural import StructuralProfiler
from .config import (
    ProfileConfig,
    SemanticType,
    Modality,
    TypeFlag,
    NumericKind,
    NumericStats,
    CategoricalStats,
    DatetimeStats,
    BooleanStats,
    TextStats,
    ColumnProfile,
    DatasetStats,
    StructuralProfileResult,
)
from ._base import ModalityProfiler

# Explicit public surface: names not listed here are considered private.
__all__ = [
    "StructuralProfiler",
    "ProfileConfig",
    "SemanticType",
    "Modality",
    "TypeFlag",
    "NumericKind",
    "NumericStats",
    "CategoricalStats",
    "DatetimeStats",
    "BooleanStats",
    "TextStats",
    "ColumnProfile",
    "DatasetStats",
    "StructuralProfileResult",
    "ModalityProfiler",
]
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Abstract base classes for all structural profilers.
|
|
3
|
+
|
|
4
|
+
Hierarchy
|
|
5
|
+
---------
|
|
6
|
+
Profiling[R] — root: stores config, provides _resolve_columns
|
|
7
|
+
├── ColumnBatchProfiler[R] — registry tier: __init__(config=None) only;
|
|
8
|
+
│ │ profile(df, columns) processes a typed column batch
|
|
9
|
+
│ ├── NumericProfiler
|
|
10
|
+
│ ├── CategoricalProfiler
|
|
11
|
+
│ ├── DatetimeProfiler
|
|
12
|
+
│ ├── BooleanProfiler
|
|
13
|
+
│ └── TextProfiler
|
|
14
|
+
├── DatasetLevelProfiler[R] — direct-call tier: may have extra __init__ params;
|
|
15
|
+
│ │ not compatible with the SemanticType registry
|
|
16
|
+
│ ├── MissingnessProfiler
|
|
17
|
+
│ ├── TargetProfiler
|
|
18
|
+
│ └── CorrelationProfiler
|
|
19
|
+
└── ModalityProfiler — dataset-shape tier: profile(df) → DatasetStats
|
|
20
|
+
└── TabularProfiler
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
import polars as pl
|
|
26
|
+
from abc import abstractmethod, ABC
|
|
27
|
+
from typing import Generic, TypeVar
|
|
28
|
+
|
|
29
|
+
from .config import DatasetStats, ProfileConfig
|
|
30
|
+
|
|
31
|
+
R = TypeVar("R")


class Profiling(ABC, Generic[R]):
    """Root of the profiler hierarchy.

    Holds the shared ProfileConfig and offers column-name resolution for
    subclasses. Never instantiated directly — subclass one of the concrete
    tier bases instead.
    """

    def __init__(self, config: ProfileConfig | None = None):
        # Substitute a default configuration when the caller supplies none.
        self.config = config or ProfileConfig()

    @abstractmethod
    def profile(self, data: pl.DataFrame, **kwargs) -> R: ...

    def _resolve_columns(
        self,
        available: list[str],
        requested: list[str] | None,
    ) -> list[str]:
        """Intersect *requested* with *available*, preserving request order.

        A ``None`` request means "every available column". Requested names
        absent from *available* are silently dropped.
        """
        if requested is None:
            return list(available)
        known = set(available)  # O(1) membership instead of list scans
        return [name for name in requested if name in known]
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class ColumnBatchProfiler(Profiling[R]):
    """
    Registry-compatible column profiler.

    Contract
    --------
    - __init__ must accept ONLY config (no extra required params). This allows
      StructuralProfiler to instantiate any registered profiler uniformly via
      profiler_cls(config=self.config)
    - profile(df, columns) receives the full DataFrame and the list of same-type
      column names to process. Returns a result with:
        .columns: dict[str, <Stats>]    — per-column stats
        .analysed_columns: list[str]    — columns actually profiled
    """

    # The signature deliberately narrows the base class's **kwargs to an
    # explicit `columns` list, hence the override suppression below.
    @abstractmethod
    def profile(self, data: pl.DataFrame, columns: list[str]) -> R: ...  # type: ignore[override]
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
class DatasetLevelProfiler(Profiling[R]):
    """
    Directly-called profiler.

    May have extra __init__ params (e.g. target_column, numeric_columns).
    Never registered in the SemanticType registry — always instantiated
    explicitly with its specific arguments.
    """

    # Keeps the base's open **kwargs signature: subclasses define their own
    # keyword arguments, so no narrowing happens at this tier.
    @abstractmethod
    def profile(self, data: pl.DataFrame, **kwargs) -> R: ...
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
class ModalityProfiler(Profiling[DatasetStats]):
    """
    Dataset-shape profiler.

    One concrete implementation per Modality. Returns DatasetStats covering
    shape, memory, duplicates, sparsity, and chunking metadata.
    profile(df) takes only the DataFrame — no column list needed.
    """

    # Result type is pinned to DatasetStats for every modality, hence the
    # override suppression on the narrowed return annotation.
    @abstractmethod
    def profile(self, data: pl.DataFrame, **kwargs) -> DatasetStats: ...  # type: ignore[override]
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Result dataclass for boolean column profiling.
|
|
3
|
+
|
|
4
|
+
Populated by BooleanProfiler.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from dataclasses import dataclass, field
|
|
10
|
+
from typing import Optional
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class BooleanStats:
    """Per-column boolean summary: counts, ratios, and mode.

    Ratios are computed over non-null values only; all fields keep their
    zero/None defaults when a column has no usable values.
    """

    # Counts of non-null truthy / falsy values.
    true_count: int = 0
    false_count: int = 0
    # Fractions of the non-null values (true_ratio + false_ratio == 1.0
    # whenever any non-null value exists).
    true_ratio: float = 0.0
    false_ratio: float = 0.0
    # Most frequent non-null value; None when the counts are perfectly tied.
    mode: bool | None = None
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
class BooleanProfileResult:
    """Aggregated output of a BooleanProfiler run.

    ``columns`` maps each profiled column name to its BooleanStats;
    ``analysed_columns`` lists the columns that survived the schema
    intersection and eligibility checks, in processing order.
    """

    # Per-column boolean profiles, keyed by column name.
    columns: dict[str, BooleanStats] = field(default_factory=dict)
    # Columns that were actually profiled.
    analysed_columns: list[str] = field(default_factory=list)
|
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
"""
|
|
2
|
+
BooleanProfiler – Phase 1 extension: Boolean Column Profiling.
|
|
3
|
+
|
|
4
|
+
Handles columns classified as SemanticType.Boolean, which includes:
|
|
5
|
+
- Native Polars Boolean dtype
|
|
6
|
+
- Integer {0, 1} columns with a Boolean override in ProfileConfig
|
|
7
|
+
- Boolean-string columns ("true"/"false", "yes"/"no", "1"/"0") with override
|
|
8
|
+
|
|
9
|
+
Per-column metrics:
|
|
10
|
+
1. true_count – count of non-null truthy values
|
|
11
|
+
2. false_count – count of non-null falsy values
|
|
12
|
+
3. true_ratio – true_count / non_null_count (nulls excluded)
|
|
13
|
+
4. false_ratio – false_count / non_null_count (nulls excluded)
|
|
14
|
+
5. mode – most frequent non-null value (True / False), or None if tied
|
|
15
|
+
|
|
16
|
+
Null values are NOT counted in ratios — missingness is already captured by
|
|
17
|
+
the upstream MissingnessProfiler pass and lives in ColumnProfile.missingness.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import polars as pl
|
|
23
|
+
|
|
24
|
+
from ._base import ColumnBatchProfiler
|
|
25
|
+
from .config import (
|
|
26
|
+
ProfileConfig,
|
|
27
|
+
BooleanStats,
|
|
28
|
+
SemanticType,
|
|
29
|
+
)
|
|
30
|
+
from ._boolean_config import BooleanProfileResult
|
|
31
|
+
from ..models._data_types import _INT_DTYPES
|
|
32
|
+
|
|
33
|
+
# ---------------------------------------------------------------------------
|
|
34
|
+
# String values that represent True / False
|
|
35
|
+
# ---------------------------------------------------------------------------
|
|
36
|
+
|
|
37
|
+
# ---------------------------------------------------------------------------
# String values that represent True / False
# ---------------------------------------------------------------------------

_TRUE_STRINGS: frozenset[str] = frozenset({"true", "yes", "1", "t", "y"})
_FALSE_STRINGS: frozenset[str] = frozenset({"false", "no", "0", "f", "n"})


class BooleanProfiler(ColumnBatchProfiler[BooleanProfileResult]):
    """
    Boolean column profiler for Polars DataFrames.

    A column is eligible when:
    - Its Polars dtype is pl.Boolean, OR
    - Its dtype is an integer with values exclusively in {0, 1}, OR
    - It has a SemanticType.Boolean override in ProfileConfig.column_overrides

    Non-eligible columns in the provided list are silently skipped.

    Parameters
    ----------
    config : ProfileConfig | None
        Shared profiling configuration (accepted by the inherited
        constructor — no __init__ override is needed, which keeps this
        class registry-compatible per the ColumnBatchProfiler contract).
    """

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def profile(
        self,
        data: pl.DataFrame,
        columns: list[str],
    ) -> BooleanProfileResult:
        """Profile the eligible boolean-like columns among ``columns``."""
        return self._run(data, columns)

    # ------------------------------------------------------------------
    # Eligibility
    # ------------------------------------------------------------------

    def _eligible(self, series: pl.Series) -> bool:
        """Return True when *series* should be profiled as boolean."""
        override = self.config.column_overrides.get(series.name)

        # Explicit override — trust it
        if override == SemanticType.Boolean:
            return True

        # Another override takes precedence over auto-detection
        if override is not None:
            return False

        # Native boolean dtype
        if series.dtype == pl.Boolean:
            return True

        # Integer {0, 1} column — check after dropping nulls
        if series.dtype in _INT_DTYPES:
            clean = series.drop_nulls()
            if clean.len() == 0:
                return False
            unique_vals = set(clean.unique().to_list())
            return unique_vals <= {0, 1}

        return False

    # ------------------------------------------------------------------
    # Orchestration
    # ------------------------------------------------------------------

    def _run(
        self,
        df: pl.DataFrame,
        columns: list[str],
    ) -> BooleanProfileResult:
        """Intersect, filter for eligibility, and profile each column."""
        result = BooleanProfileResult()

        # Schema intersection first, then the per-column eligibility check.
        available = [
            c
            for c in self._resolve_columns(df.columns, columns)
            if self._eligible(df[c])
        ]
        result.analysed_columns = available

        for col_name in available:
            result.columns[col_name] = self._profile_column(df[col_name])

        return result

    # ------------------------------------------------------------------
    # Per-column driver
    # ------------------------------------------------------------------

    def _profile_column(self, series: pl.Series) -> BooleanStats:
        """Compute counts, ratios, and mode for one column.

        Nulls are excluded from every metric — missingness is already
        captured by the upstream MissingnessProfiler pass.
        (The previous ``n_rows`` parameter was never read and has been
        removed; the only caller is _run inside this class.)
        """
        profile = BooleanStats()

        # Coerce to a clean boolean series (drop nulls)
        bool_series = self._to_bool_series(series)
        non_null_count = bool_series.len()

        # All-null or unrecognised column: return the zeroed defaults.
        if non_null_count == 0:
            return profile

        true_count = int(bool_series.sum())
        false_count = non_null_count - true_count

        profile.true_count = true_count
        profile.false_count = false_count
        profile.true_ratio = true_count / non_null_count
        profile.false_ratio = false_count / non_null_count

        # Mode: True if more trues, False if more falses, None if perfectly tied
        if true_count > false_count:
            profile.mode = True
        elif false_count > true_count:
            profile.mode = False
        else:
            profile.mode = None  # tied — no single mode

        return profile

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _to_bool_series(series: pl.Series) -> pl.Series:
        """
        Return a null-free Boolean Series regardless of the input dtype.

        Handles:
        - pl.Boolean     → drop nulls directly
        - integer {0, 1} → cast to Boolean, drop nulls
        - string         → map known true/false strings, drop nulls
        """
        if series.dtype == pl.Boolean:
            return series.drop_nulls()

        if series.dtype in _INT_DTYPES:
            return series.cast(pl.Boolean).drop_nulls()

        if series.dtype == pl.Utf8:
            lower = series.str.to_lowercase().str.strip_chars()
            true_mask = lower.is_in(list(_TRUE_STRINGS))
            false_mask = lower.is_in(list(_FALSE_STRINGS))
            known_mask = true_mask | false_mask
            # NOTE(review): nulls and unrecognised strings are dropped here via
            # the mask filter — confirm is_in/filter null semantics against the
            # pinned Polars version.
            return true_mask.filter(known_mask)

        # Fallback: attempt a cast and drop nulls (covers e.g. Categorical)
        try:
            return series.cast(pl.Boolean).drop_nulls()
        except Exception:
            return pl.Series([], dtype=pl.Boolean)
|