dataforge-ml 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. dataforge_ml-0.1.0.dist-info/METADATA +34 -0
  2. dataforge_ml-0.1.0.dist-info/RECORD +54 -0
  3. dataforge_ml-0.1.0.dist-info/WHEEL +5 -0
  4. dataforge_ml-0.1.0.dist-info/licenses/LICENSE +21 -0
  5. dataforge_ml-0.1.0.dist-info/top_level.txt +5 -0
  6. models/__init__.py +0 -0
  7. models/_data_structure.py +7 -0
  8. models/_data_types.py +12 -0
  9. profiling/__init__.py +35 -0
  10. profiling/_base.py +101 -0
  11. profiling/_boolean_config.py +37 -0
  12. profiling/_boolean_profiler.py +191 -0
  13. profiling/_categorical.py +315 -0
  14. profiling/_categorical_config.py +87 -0
  15. profiling/_correlation_config.py +225 -0
  16. profiling/_correlation_profiler.py +544 -0
  17. profiling/_datetime_config.py +98 -0
  18. profiling/_datetime_profiler.py +406 -0
  19. profiling/_missingness_config.py +137 -0
  20. profiling/_missingness_profiler.py +252 -0
  21. profiling/_numeric_config.py +116 -0
  22. profiling/_numeric_profiler.py +403 -0
  23. profiling/_tabular.py +249 -0
  24. profiling/_target_config.py +74 -0
  25. profiling/_target_profiler.py +156 -0
  26. profiling/_text_config.py +40 -0
  27. profiling/_text_profiler.py +194 -0
  28. profiling/_type_detector.py +463 -0
  29. profiling/config.py +236 -0
  30. profiling/structural.py +280 -0
  31. splitting/__init__.py +4 -0
  32. splitting/_config.py +56 -0
  33. splitting/_splitter.py +202 -0
  34. tests/__init__.py +0 -0
  35. tests/conftest.py +7 -0
  36. tests/integration/__init__.py +0 -0
  37. tests/integration/conftest.py +82 -0
  38. tests/integration/test_structural_end_to_end.py +219 -0
  39. tests/unit/__init__.py +0 -0
  40. tests/unit/profiling/__init__.py +0 -0
  41. tests/unit/profiling/conftest.py +81 -0
  42. tests/unit/profiling/test_boolean_profiler.py +91 -0
  43. tests/unit/profiling/test_categorical_profiler.py +182 -0
  44. tests/unit/profiling/test_correlation_profiler.py +124 -0
  45. tests/unit/profiling/test_datetime_profiler.py +133 -0
  46. tests/unit/profiling/test_missingness_profiler.py +51 -0
  47. tests/unit/profiling/test_numeric_profiler.py +212 -0
  48. tests/unit/profiling/test_target_profiler.py +44 -0
  49. tests/unit/profiling/test_text_profiler.py +61 -0
  50. tests/unit/profiling/test_type_detector.py +32 -0
  51. tests/unit/splitting/__init__.py +0 -0
  52. tests/unit/splitting/test_data_splitter.py +417 -0
  53. utils/__init__.py +0 -0
  54. utils/data_loader.py +110 -0
@@ -0,0 +1,34 @@
1
+ Metadata-Version: 2.4
2
+ Name: dataforge-ml
3
+ Version: 0.1.0
4
+ Summary: An automated feature engineering and design pipeline library
5
+ License: MIT
6
+ Classifier: License :: OSI Approved :: MIT License
7
+ Classifier: Programming Language :: Python :: 3
8
+ Classifier: Programming Language :: Python :: 3.10
9
+ Classifier: Programming Language :: Python :: 3.11
10
+ Classifier: Programming Language :: Python :: 3.12
11
+ Requires-Python: >=3.10
12
+ Description-Content-Type: text/markdown
13
+ License-File: LICENSE
14
+ Requires-Dist: polars>=1.0.0
15
+ Requires-Dist: scikit-learn>=1.0.0
16
+ Requires-Dist: scipy>=1.10.0
17
+ Requires-Dist: numpy>=2.0.0
18
+ Provides-Extra: dev
19
+ Requires-Dist: pytest>=8.0; extra == "dev"
20
+ Dynamic: license-file
21
+
22
+ # dataforge-ml
23
+
24
+ Automated feature engineering and data profiling pipeline library for tabular datasets.
25
+
26
+ ## Installation
27
+
28
+ ```bash
29
+ pip install dataforge-ml
30
+ ```
31
+
32
+ ## License
33
+
34
+ MIT
@@ -0,0 +1,54 @@
1
+ dataforge_ml-0.1.0.dist-info/licenses/LICENSE,sha256=Fud0rbmIzDVnfrHmvEls24iboUWgpGoCRFNj6Fl3i9o,1068
2
+ models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ models/_data_structure.py,sha256=8qeZhIs202Q6loYqp7Ri3TMBd3lor9jFtTTgLwaBqsg,143
4
+ models/_data_types.py,sha256=AACN63vfjrkp6d1p83K220trMzGqKasbqB_XDLmYBwI,294
5
+ profiling/__init__.py,sha256=h_hS_5q8Gxg79eNP4Fz7AfPv_Db9f-BnamB6PVCWPys,674
6
+ profiling/_base.py,sha256=RsG1Ut2q0GZ0wQBmBLsqSB38cCVz7Hh5OngQ47WCoKY,3313
7
+ profiling/_boolean_config.py,sha256=Uj98mJCMoWnOb4IVqBnhHPGeVW0BMKaMC14oVmT4tzg,860
8
+ profiling/_boolean_profiler.py,sha256=EQ45n6OEx9FRzm4ZhO_E0gWhlYRTEkKlUuqZSEVi_K4,6391
9
+ profiling/_categorical.py,sha256=3pBZKK9vY_xIe7TwqItomMUdPwmlB3STGKeLz4N3Kis,10325
10
+ profiling/_categorical_config.py,sha256=s1hZ0AziSVe_8hssBXxZB0a-l3mJ3E8r6GcWrQMh0cQ,2417
11
+ profiling/_correlation_config.py,sha256=Onil5fdIm4h0M8Q6RnFnX4ksQDEd9GTdCF1q2BbVU3g,8018
12
+ profiling/_correlation_profiler.py,sha256=C7C50Lw9fP4W0jByX0lA_XViQtb7wzMFTbI10N6yJ2w,19766
13
+ profiling/_datetime_config.py,sha256=clVMTJByKGUYlO01nI0gAw_ut_U0g-KMLTdLdNjj7ug,2742
14
+ profiling/_datetime_profiler.py,sha256=jStM0tHXLiMVjjiLWILarrcIQ1vAJYiZLqjSGVFQlrg,14172
15
+ profiling/_missingness_config.py,sha256=jkOMsxblQTTnsUu5Wv4sEDC3bph_k3CtemG1pOJPpqM,4726
16
+ profiling/_missingness_profiler.py,sha256=jWpQUdkJhUgHMCYr4UPzjm4jpRObrmS7pGBL1LiEzVM,9174
17
+ profiling/_numeric_config.py,sha256=J-Bv_qHpGWKdUQUMehWPJVkTnVFd5xDgfA-4yByx_R0,2980
18
+ profiling/_numeric_profiler.py,sha256=pQatmq0PB-GZiaFr5z-e5zugvm9aTzINkz6496O9ir0,13232
19
+ profiling/_tabular.py,sha256=wTFL1mrNVMTi_nT6lrUjhj88cVuhbJ7ZAEslb0uTLnA,8622
20
+ profiling/_target_config.py,sha256=tvIYI0t6OOPqsZIXnM9hSijJKsCbMsFR8JrccEpoSlk,3152
21
+ profiling/_target_profiler.py,sha256=kDwWVHiKa6E5BBKIZe6AdI9vW1jnChz8nic5qJ-y_r0,6274
22
+ profiling/_text_config.py,sha256=uKrP2Y6Wm9vnq_xuHSbSpiMfvSjhRYbPwCSY8KNRoA8,964
23
+ profiling/_text_profiler.py,sha256=z07FGquTIrAQyTwLZMnAobnwYyixJ3y2i9aT0A5JxsU,7492
24
+ profiling/_type_detector.py,sha256=KjbsyaUDONMJfJR-RMS9TR5W7zNje1uKTaDB-RLyiXQ,16747
25
+ profiling/config.py,sha256=Dalgw7VPrY3z9o_HE_2INnEfhXAniJDJYPZiLx7YfY8,7641
26
+ profiling/structural.py,sha256=Bl4MMJHOd06fO-79XoesmwswAN790ZNNuDxtW69T8Zw,12264
27
+ splitting/__init__.py,sha256=OG583NBARJARDGBTiCFp1Tx6HmV9WgCtJpS6ouGDvj4,138
28
+ splitting/_config.py,sha256=_ZsuYpmYW7pURK6X6AinShKfm8_bNC7paORWpCgtqpY,1260
29
+ splitting/_splitter.py,sha256=XxHRveYj5hiDVvH46HMo6NU11LptS6Redyeb_4jjNtc,6606
30
+ tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
31
+ tests/conftest.py,sha256=57qnuMtKIfaxigMR-pZBaEF6QrTE91SIAy8x_aclmjw,116
32
+ tests/integration/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
33
+ tests/integration/conftest.py,sha256=F0ImmsLYhESpdtVeyju6uJQT1NvoZNK50KMw4cQo5qQ,2400
34
+ tests/integration/test_structural_end_to_end.py,sha256=yGaIX16FESWwN5bFxTnq4eJJMcKuQteoPYN7GZ2WlyY,8219
35
+ tests/unit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
36
+ tests/unit/profiling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
37
+ tests/unit/profiling/conftest.py,sha256=7vy23oSwXPEKEFOT9cXq-llG9GbaesK23Q48d6K1C44,2576
38
+ tests/unit/profiling/test_boolean_profiler.py,sha256=E4odl9c7FoWbVtsnUuASigfnMtoYEG-25pA7g8WuUNg,3500
39
+ tests/unit/profiling/test_categorical_profiler.py,sha256=tMDh8Wgy3D5qWm2a96pOMZn37UMaGNWqGaJXpUJk3oA,7130
40
+ tests/unit/profiling/test_correlation_profiler.py,sha256=wIOcwXfJywaGiMddj3mRFL3wJvfh_SeXvRzZ3_wmyGo,4587
41
+ tests/unit/profiling/test_datetime_profiler.py,sha256=ZxrT5WBWhZMpdAXuIxXoQNkETfLSitkncASV4Xn07JA,5416
42
+ tests/unit/profiling/test_missingness_profiler.py,sha256=nysQ7Nhv216gFjxUuIgtnEC3pDiI9QBzFWJOJykqKYE,2115
43
+ tests/unit/profiling/test_numeric_profiler.py,sha256=SJJ4IiQQZ_FQTUgnlMVdZJFhzkzYK5T5b4OvEL8yiKA,8098
44
+ tests/unit/profiling/test_target_profiler.py,sha256=MfxhULmHn0fwK80h_1Dtnc4bRGCwwIdUQ0DflPRqIrk,1646
45
+ tests/unit/profiling/test_text_profiler.py,sha256=3wrq1zztCKY6q-XPW7I3HKYKQ4jFlCgm3dNrRGMqI3w,2291
46
+ tests/unit/profiling/test_type_detector.py,sha256=H4IOz1KxP06J9hGR6B7j6zjKQZjrt5K3s3cS4tj_7BU,1348
47
+ tests/unit/splitting/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
48
+ tests/unit/splitting/test_data_splitter.py,sha256=oOCl-T3Cdu9RUrVQm6GumkcoqKUkfhUoAgyloFnJdPg,14539
49
+ utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
50
+ utils/data_loader.py,sha256=Ls0uqWaURLJCzvwcXpwRgxbiuq8m4_Eik6U3N-_HCWQ,3139
51
+ dataforge_ml-0.1.0.dist-info/METADATA,sha256=G7cOeMs1GeLwD0aq2Lr_NS3X3ay3UJbSCOqI2JIojOM,862
52
+ dataforge_ml-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
53
+ dataforge_ml-0.1.0.dist-info/top_level.txt,sha256=ZGn_lp_HOK1i-rv4p0m-qUh4Uhkw6QUpC6LDSkveCao,39
54
+ dataforge_ml-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 DEVunderdog
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,5 @@
1
+ models
2
+ profiling
3
+ splitting
4
+ tests
5
+ utils
models/__init__.py ADDED
File without changes
@@ -0,0 +1,7 @@
1
+ from enum import StrEnum
2
+
3
+
4
+ class DataStructure(StrEnum):
5
+ Tabular = "TABULAR"
6
+ TimeSeries = "TIME_SERIES"
7
+ GeoSpatial = "GEO_SPATIAL"
models/_data_types.py ADDED
@@ -0,0 +1,12 @@
1
+ import polars as pl
2
+
3
# Signed and unsigned Polars integer dtypes.
_INT_DTYPES = {
    pl.Int8,
    pl.Int16,
    pl.Int32,
    pl.Int64,
    pl.UInt8,
    pl.UInt16,
    pl.UInt32,
    pl.UInt64,
}

# Dtypes treated as categorical by the profilers.
# NOTE(review): pl.String is an alias of pl.Utf8 in recent Polars — both are
# listed; confirm whether the duplicate is intentional.
_CAT_DTYPES = {pl.Utf8, pl.String, pl.Categorical, pl.Boolean}

# Every numeric dtype: all integers plus both float widths.
_NUMERIC_DTYPES = _INT_DTYPES | {pl.Float32, pl.Float64}

# Temporal dtypes recognized by the datetime profiling pass.
_DATETIME_DTYPES = {pl.Date, pl.Datetime}
profiling/__init__.py ADDED
@@ -0,0 +1,35 @@
1
+ from .structural import StructuralProfiler
2
+ from .config import (
3
+ ProfileConfig,
4
+ SemanticType,
5
+ Modality,
6
+ TypeFlag,
7
+ NumericKind,
8
+ NumericStats,
9
+ CategoricalStats,
10
+ DatetimeStats,
11
+ BooleanStats,
12
+ TextStats,
13
+ ColumnProfile,
14
+ DatasetStats,
15
+ StructuralProfileResult,
16
+ )
17
+ from ._base import ModalityProfiler
18
+
19
# Public surface of the profiling package; kept in sync with the re-exports above.
__all__ = [
    "StructuralProfiler",
    "ProfileConfig",
    "SemanticType",
    "Modality",
    "TypeFlag",
    "NumericKind",
    "NumericStats",
    "CategoricalStats",
    "DatetimeStats",
    "BooleanStats",
    "TextStats",
    "ColumnProfile",
    "DatasetStats",
    "StructuralProfileResult",
    "ModalityProfiler",
]
profiling/_base.py ADDED
@@ -0,0 +1,101 @@
1
+ """
2
+ Abstract base classes for all structural profilers.
3
+
4
+ Hierarchy
5
+ ---------
6
+ Profiling[R] — root: stores config, provides _resolve_columns
7
+ ├── ColumnBatchProfiler[R] — registry tier: __init__(config=None) only;
8
+ │ │ profile(df, columns) processes a typed column batch
9
+ │ ├── NumericProfiler
10
+ │ ├── CategoricalProfiler
11
+ │ ├── DatetimeProfiler
12
+ │ ├── BooleanProfiler
13
+ │ └── TextProfiler
14
+ ├── DatasetLevelProfiler[R] — direct-call tier: may have extra __init__ params;
15
+ │ │ not compatible with the SemanticType registry
16
+ │ ├── MissingnessProfiler
17
+ │ ├── TargetProfiler
18
+ │ └── CorrelationProfiler
19
+ └── ModalityProfiler — dataset-shape tier: profile(df) → DatasetStats
20
+ └── TabularProfiler
21
+ """
22
+
23
+ from __future__ import annotations
24
+
25
+ import polars as pl
26
+ from abc import abstractmethod, ABC
27
+ from typing import Generic, TypeVar
28
+
29
+ from .config import DatasetStats, ProfileConfig
30
+
31
+ R = TypeVar("R")
32
+
33
+
34
+ class Profiling(ABC, Generic[R]):
35
+ """
36
+ Root base for all profilers.
37
+
38
+ Stores config and provides _resolve_columns. Not instantiated directly —
39
+ use one of the three concrete tier bases below.
40
+ """
41
+
42
+ def __init__(self, config: ProfileConfig | None = None):
43
+ self.config = config or ProfileConfig()
44
+
45
+ @abstractmethod
46
+ def profile(self, data: pl.DataFrame, **kwargs) -> R: ...
47
+
48
+ def _resolve_columns(
49
+ self,
50
+ available: list[str],
51
+ requested: list[str] | None,
52
+ ) -> list[str]:
53
+ if requested is None:
54
+ return list(available)
55
+ available_set = set(available)
56
+ return [c for c in requested if c in available_set]
57
+
58
+
59
class ColumnBatchProfiler(Profiling[R]):
    """
    Base for registry-compatible, per-type column profilers.

    Contract
    --------
    * ``__init__`` accepts only ``config`` (no extra required parameters),
      so StructuralProfiler can build any registered profiler uniformly as
      ``profiler_cls(config=self.config)``.
    * ``profile(df, columns)`` receives the full DataFrame plus the names of
      the same-type columns to examine, and returns an object exposing
      ``columns`` (per-column stats dict keyed by name) and
      ``analysed_columns`` (the columns actually profiled).
    """

    @abstractmethod
    def profile(self, data: pl.DataFrame, columns: list[str]) -> R: ...  # type: ignore[override]
76
+
77
+
78
class DatasetLevelProfiler(Profiling[R]):
    """
    Base for profilers invoked directly rather than via the registry.

    Subclasses may require extra constructor arguments (for example a target
    column or a list of numeric columns), so they are never placed in the
    SemanticType registry — callers always instantiate them explicitly with
    their specific arguments.
    """

    @abstractmethod
    def profile(self, data: pl.DataFrame, **kwargs) -> R: ...
89
+
90
+
91
class ModalityProfiler(Profiling[DatasetStats]):
    """
    Base for dataset-shape profilers — one concrete subclass per Modality.

    ``profile(df)`` takes only the DataFrame (no column list) and returns a
    DatasetStats covering shape, memory, duplicates, sparsity, and chunking
    metadata.
    """

    @abstractmethod
    def profile(self, data: pl.DataFrame, **kwargs) -> DatasetStats: ...  # type: ignore[override]
@@ -0,0 +1,37 @@
1
+ """
2
+ Result dataclass for boolean column profiling.
3
+
4
+ Populated by BooleanProfiler.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from dataclasses import dataclass, field
10
+ from typing import Optional
11
+
12
+
13
@dataclass
class BooleanStats:
    """
    Per-column summary for a boolean column.

    Counts and ratios cover non-null values only; ``mode`` is ``None`` when
    the true/false counts are tied or when the column had no usable values.
    """

    true_count: int = 0          # number of truthy, non-null values
    false_count: int = 0         # number of falsy, non-null values
    true_ratio: float = 0.0      # true_count / usable value count
    false_ratio: float = 0.0     # false_count / usable value count
    mode: Optional[bool] = None  # majority value; None on an exact tie
20
+
21
+
22
@dataclass
class BooleanProfileResult:
    """
    Aggregate result of one boolean-profiling run.

    Attributes
    ----------
    columns : dict[str, BooleanStats]
        Per-column boolean profiles, keyed by column name.
    analysed_columns : list[str]
        Names of the columns that were actually profiled — those surviving
        the schema intersection and the eligibility check.
    """

    columns: dict[str, BooleanStats] = field(default_factory=dict)
    analysed_columns: list[str] = field(default_factory=list)
@@ -0,0 +1,191 @@
1
+ """
2
+ BooleanProfiler – Phase 1 extension: Boolean Column Profiling.
3
+
4
+ Handles columns classified as SemanticType.Boolean, which includes:
5
+ - Native Polars Boolean dtype
6
+ - Integer {0, 1} columns with a Boolean override in ProfileConfig
7
+ - Boolean-string columns ("true"/"false", "yes"/"no", "1"/"0") with override
8
+
9
+ Per-column metrics:
10
+ 1. true_count – count of non-null truthy values
11
+ 2. false_count – count of non-null falsy values
12
+ 3. true_ratio – true_count / non_null_count (nulls excluded)
13
+ 4. false_ratio – false_count / non_null_count (nulls excluded)
14
+ 5. mode – most frequent non-null value (True / False), or None if tied
15
+
16
+ Null values are NOT counted in ratios — missingness is already captured by
17
+ the upstream MissingnessProfiler pass and lives in ColumnProfile.missingness.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import polars as pl
23
+
24
+ from ._base import ColumnBatchProfiler
25
+ from .config import (
26
+ ProfileConfig,
27
+ BooleanStats,
28
+ SemanticType,
29
+ )
30
+ from ._boolean_config import BooleanProfileResult
31
+ from ..models._data_types import _INT_DTYPES
32
+
33
+ # ---------------------------------------------------------------------------
34
+ # String values that represent True / False
35
+ # ---------------------------------------------------------------------------
36
+
37
+ _TRUE_STRINGS: frozenset[str] = frozenset({"true", "yes", "1", "t", "y"})
38
+ _FALSE_STRINGS: frozenset[str] = frozenset({"false", "no", "0", "f", "n"})
39
+
40
+
41
class BooleanProfiler(ColumnBatchProfiler[BooleanProfileResult]):
    """
    Boolean column profiler for Polars DataFrames.

    A column is eligible when:
      - its Polars dtype is pl.Boolean, OR
      - its dtype is an integer whose non-null values are exclusively {0, 1}, OR
      - it carries a SemanticType.Boolean override in
        ProfileConfig.column_overrides (any other override disqualifies it)

    Non-eligible columns in the provided list are silently skipped.

    For string columns, only the recognized true/false tokens are counted;
    unrecognized non-null strings are excluded from both the counts and the
    ratio denominator (i.e. ratios are over *usable* values, which for
    strings can be fewer than the non-null values).

    Parameters
    ----------
    config : ProfileConfig | None
        Shared profiling configuration.
    """

    def __init__(self, config: ProfileConfig | None = None) -> None:
        super().__init__(config)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def profile(
        self,
        data: pl.DataFrame,
        columns: list[str],
    ) -> BooleanProfileResult:
        """Profile the eligible boolean columns among *columns*."""
        return self._run(data, columns)

    # ------------------------------------------------------------------
    # Eligibility
    # ------------------------------------------------------------------

    def _eligible(self, series: pl.Series) -> bool:
        """Return True when *series* should be treated as a boolean column."""
        override = self.config.column_overrides.get(series.name)

        # Explicit Boolean override — trust it.
        if override == SemanticType.Boolean:
            return True

        # Any other override takes precedence over auto-detection.
        if override is not None:
            return False

        # Native boolean dtype.
        if series.dtype == pl.Boolean:
            return True

        # Integer column whose non-null values are all 0/1.
        if series.dtype in _INT_DTYPES:
            clean = series.drop_nulls()
            if clean.len() == 0:
                return False
            return set(clean.unique().to_list()) <= {0, 1}

        return False

    # ------------------------------------------------------------------
    # Orchestration
    # ------------------------------------------------------------------

    def _run(
        self,
        df: pl.DataFrame,
        columns: list[str],
    ) -> BooleanProfileResult:
        """Intersect the requested columns with the schema, keep the eligible
        ones, and profile each."""
        result = BooleanProfileResult()

        eligible = [
            name
            for name in self._resolve_columns(df.columns, columns)
            if self._eligible(df[name])
        ]
        result.analysed_columns = eligible

        for name in eligible:
            result.columns[name] = self._profile_column(df[name])

        return result

    # ------------------------------------------------------------------
    # Per-column driver
    # ------------------------------------------------------------------

    def _profile_column(self, series: pl.Series) -> BooleanStats:
        """
        Compute counts, ratios, and mode for one eligible column.

        FIX: the previous ``n_rows`` parameter was never read (the caller
        passed ``df.height`` for nothing) — it has been removed. All ratios
        are based on the usable value count returned by _to_bool_series.
        """
        profile = BooleanStats()

        # Coerce to a null-free Boolean series regardless of input dtype.
        bool_series = self._to_bool_series(series)
        usable = bool_series.len()
        if usable == 0:
            # No usable values — leave every field at its zero/None default.
            return profile

        true_count = int(bool_series.sum())
        false_count = usable - true_count

        profile.true_count = true_count
        profile.false_count = false_count
        profile.true_ratio = true_count / usable
        profile.false_ratio = false_count / usable

        # Mode: the majority value, or None on an exact tie.
        if true_count > false_count:
            profile.mode = True
        elif false_count > true_count:
            profile.mode = False
        else:
            profile.mode = None

        return profile

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _to_bool_series(series: pl.Series) -> pl.Series:
        """
        Return a null-free Boolean Series regardless of the input dtype.

        Handles:
          - pl.Boolean    → nulls dropped
          - integer       → cast to Boolean, nulls dropped
          - pl.Utf8       → recognized true/false tokens mapped; unrecognized
                            strings and nulls are dropped
          - anything else → best-effort cast; empty series when the cast fails
        """
        if series.dtype == pl.Boolean:
            return series.drop_nulls()

        if series.dtype in _INT_DTYPES:
            return series.cast(pl.Boolean).drop_nulls()

        if series.dtype == pl.Utf8:
            tokens = series.str.to_lowercase().str.strip_chars()
            is_true = tokens.is_in(list(_TRUE_STRINGS))
            is_false = tokens.is_in(list(_FALSE_STRINGS))
            recognized = is_true | is_false
            # Keep only positions holding a known token; the surviving mask
            # values ARE the boolean interpretation of those tokens.
            return is_true.filter(recognized)

        # Fallback (e.g. Categorical). The broad catch is deliberate: any
        # cast failure simply means "no usable boolean data" for this column.
        try:
            return series.cast(pl.Boolean).drop_nulls()
        except Exception:
            return pl.Series([], dtype=pl.Boolean)