dataforge-ml 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataforge_ml-0.1.0.dist-info/METADATA +34 -0
- dataforge_ml-0.1.0.dist-info/RECORD +54 -0
- dataforge_ml-0.1.0.dist-info/WHEEL +5 -0
- dataforge_ml-0.1.0.dist-info/licenses/LICENSE +21 -0
- dataforge_ml-0.1.0.dist-info/top_level.txt +5 -0
- models/__init__.py +0 -0
- models/_data_structure.py +7 -0
- models/_data_types.py +12 -0
- profiling/__init__.py +35 -0
- profiling/_base.py +101 -0
- profiling/_boolean_config.py +37 -0
- profiling/_boolean_profiler.py +191 -0
- profiling/_categorical.py +315 -0
- profiling/_categorical_config.py +87 -0
- profiling/_correlation_config.py +225 -0
- profiling/_correlation_profiler.py +544 -0
- profiling/_datetime_config.py +98 -0
- profiling/_datetime_profiler.py +406 -0
- profiling/_missingness_config.py +137 -0
- profiling/_missingness_profiler.py +252 -0
- profiling/_numeric_config.py +116 -0
- profiling/_numeric_profiler.py +403 -0
- profiling/_tabular.py +249 -0
- profiling/_target_config.py +74 -0
- profiling/_target_profiler.py +156 -0
- profiling/_text_config.py +40 -0
- profiling/_text_profiler.py +194 -0
- profiling/_type_detector.py +463 -0
- profiling/config.py +236 -0
- profiling/structural.py +280 -0
- splitting/__init__.py +4 -0
- splitting/_config.py +56 -0
- splitting/_splitter.py +202 -0
- tests/__init__.py +0 -0
- tests/conftest.py +7 -0
- tests/integration/__init__.py +0 -0
- tests/integration/conftest.py +82 -0
- tests/integration/test_structural_end_to_end.py +219 -0
- tests/unit/__init__.py +0 -0
- tests/unit/profiling/__init__.py +0 -0
- tests/unit/profiling/conftest.py +81 -0
- tests/unit/profiling/test_boolean_profiler.py +91 -0
- tests/unit/profiling/test_categorical_profiler.py +182 -0
- tests/unit/profiling/test_correlation_profiler.py +124 -0
- tests/unit/profiling/test_datetime_profiler.py +133 -0
- tests/unit/profiling/test_missingness_profiler.py +51 -0
- tests/unit/profiling/test_numeric_profiler.py +212 -0
- tests/unit/profiling/test_target_profiler.py +44 -0
- tests/unit/profiling/test_text_profiler.py +61 -0
- tests/unit/profiling/test_type_detector.py +32 -0
- tests/unit/splitting/__init__.py +0 -0
- tests/unit/splitting/test_data_splitter.py +417 -0
- utils/__init__.py +0 -0
- utils/data_loader.py +110 -0
|
@@ -0,0 +1,219 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
from ...profiling.structural import StructuralProfiler
|
|
3
|
+
from ...profiling.config import (
|
|
4
|
+
ProfileConfig,
|
|
5
|
+
StructuralProfileResult,
|
|
6
|
+
SemanticType,
|
|
7
|
+
)
|
|
8
|
+
from ...profiling._numeric_config import NumericStats
|
|
9
|
+
from ...profiling._categorical_config import CategoricalStats
|
|
10
|
+
from ...profiling._datetime_config import DatetimeStats
|
|
11
|
+
from ...profiling._boolean_config import BooleanStats
|
|
12
|
+
from ...profiling._text_config import TextStats
|
|
13
|
+
from ...profiling._target_config import TargetProfileResult
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def test_happy_path(mixed_df):
    """Profile the mixed fixture with correlation enabled and check the result's overall shape."""
    profiler = StructuralProfiler(ProfileConfig(compute_correlation=True))
    result = profiler.profile(mixed_df)

    assert isinstance(result, StructuralProfileResult)

    # Exactly one profile per input column, each carrying a semantic type.
    profiled_names = set(result.columns.keys())
    assert profiled_names == set(mixed_df.columns)
    for col_profile in result.columns.values():
        has_type = col_profile.semantic_type is not None
        assert has_type, f"column '{col_profile.name}' has no semantic_type"

    dataset = result.dataset
    assert dataset.row_count == mixed_df.height
    assert dataset.feature_correlation is not None
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def test_no_correlation(mixed_df):
    """With compute_correlation=False the dataset result has no feature correlation."""
    profiler = StructuralProfiler(ProfileConfig(compute_correlation=False))
    dataset = profiler.profile(mixed_df).dataset

    assert dataset.feature_correlation is None
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def test_boolean_handoff(mixed_df):
    """The 'is_active' column is typed Boolean and carries BooleanStats."""
    profiler = StructuralProfiler(ProfileConfig())
    column = profiler.profile(mixed_df).columns["is_active"]

    assert column.semantic_type == SemanticType.Boolean

    stats = column.stats
    assert stats is not None
    assert isinstance(stats, BooleanStats)
    assert stats.mode in (True, False, None)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def test_text_handoff(text_df):
    """The 'review' column is typed Text and its TextStats pass basic sanity bounds."""
    profiler = StructuralProfiler(ProfileConfig())
    column = profiler.profile(text_df).columns["review"]

    assert column.semantic_type == SemanticType.Text

    stats = column.stats
    assert stats is not None
    assert isinstance(stats, TextStats)

    # Sanity bounds only; no exact values are pinned here.
    assert stats.vocabulary_size > 0
    assert stats.char_length_min <= stats.char_length_max
    assert stats.avg_token_count > 0
    assert 0.0 <= stats.empty_ratio <= 1.0
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def test_correlation_consistency(mixed_df):
    """Cross-check the parts of the feature-correlation result against each other.

    Checks, in order: at least one near-redundant pair exists; flagged pairs and
    redundancy groups agree in both directions; the Pearson and Spearman
    matrices are symmetric; and each group's suggested_drop is a strict subset
    of that group's columns.
    """
    profiler = StructuralProfiler(ProfileConfig(compute_correlation=True))
    fc = profiler.profile(mixed_df).dataset.feature_correlation
    assert fc is not None

    # Without at least one flagged pair the forward check below would pass
    # vacuously; age and income are correlated by construction.
    assert len(fc.near_redundant_pairs) >= 1, (
        "expected at least one near-redundant pair (age/income are strongly correlated)"
    )

    # Forward invariant: each flagged pair has both columns in one common group.
    flagged_pairs = [pair for pair in fc.pairwise if pair.near_redundant]
    for pair in flagged_pairs:
        co_located = any(
            pair.col_a in group.columns and pair.col_b in group.columns
            for group in fc.near_redundancy_groups
        )
        assert co_located, (
            f"near_redundant pair ({pair.col_a}, {pair.col_b}) "
            f"not co-located in any NearRedundancyGroup"
        )

    # Backward invariant: each grouped column appears in some flagged pair.
    for group in fc.near_redundancy_groups:
        for col in group.columns:
            participates = any(
                p.near_redundant and (p.col_a == col or p.col_b == col)
                for p in fc.pairwise
            )
            assert participates, (
                f"column '{col}' is in a NearRedundancyGroup but has no "
                f"near_redundant=True pair in pairwise"
            )

    # Pearson matrix: every entry has an equal mirrored entry.
    pearson = fc.pearson_matrix
    for col_a, row in pearson.items():
        for col_b, val in row.items():
            mirror = pearson.get(col_b, {}).get(col_a)
            symmetric = mirror is not None and abs(val - mirror) < 1e-10
            assert symmetric, (
                f"Pearson matrix asymmetry: [{col_a}][{col_b}]={val} "
                f"vs [{col_b}][{col_a}]={mirror}"
            )

    # Spearman matrix: same symmetry requirement.
    spearman = fc.spearman_matrix
    for col_a, row in spearman.items():
        for col_b, val in row.items():
            mirror = spearman.get(col_b, {}).get(col_a)
            symmetric = mirror is not None and abs(val - mirror) < 1e-10
            assert symmetric, (
                f"Spearman matrix asymmetry: [{col_a}][{col_b}]={val} "
                f"vs [{col_b}][{col_a}]={mirror}"
            )

    # A group may never suggest dropping all of its own columns.
    for group in fc.near_redundancy_groups:
        drop_cols = set(group.suggested_drop)
        group_cols = set(group.columns)
        assert drop_cols < group_cols, (
            f"suggested_drop {drop_cols} is not a strict subset of "
            f"group columns {group_cols}"
        )
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def test_column_handoffs(mixed_df):
    """Every column with a mapped semantic type carries the matching stats class.

    Columns whose semantic type is not in the mapping are skipped.
    """
    result = StructuralProfiler(ProfileConfig()).profile(mixed_df)

    expected_stats_cls = {
        SemanticType.Numeric: NumericStats,
        SemanticType.Categorical: CategoricalStats,
        SemanticType.Datetime: DatetimeStats,
        SemanticType.Boolean: BooleanStats,
    }

    for name, cp in result.columns.items():
        expected_type = expected_stats_cls.get(cp.semantic_type)
        if expected_type is not None:
            assert cp.stats is not None, (
                f"column '{name}' has semantic_type={cp.semantic_type} but stats is None"
            )
            assert isinstance(cp.stats, expected_type), (
                f"column '{name}' has semantic_type={cp.semantic_type} "
                f"but stats type is {type(cp.stats).__name__}, expected {expected_type.__name__}"
            )
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
# ---------------------------------------------------------------------------
|
|
153
|
+
# Override: numeric column forced to Categorical via column_overrides
|
|
154
|
+
# ---------------------------------------------------------------------------
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def test_column_override_changes_stats_type(override_df):
    """Overriding 'score' to Categorical yields CategoricalStats for that column."""
    overrides = {"score": SemanticType.Categorical}
    profiler = StructuralProfiler(ProfileConfig(column_overrides=overrides))
    score_profile = profiler.profile(override_df).columns["score"]
    assert isinstance(score_profile.stats, CategoricalStats)
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
# ---------------------------------------------------------------------------
|
|
165
|
+
# Target profiling integration
|
|
166
|
+
# ---------------------------------------------------------------------------
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def test_target_profiling_integration(target_df):
    """Declaring 'label' as a target yields a TargetProfileResult under result.targets."""
    profiler = StructuralProfiler(ProfileConfig(target_columns=["label"]))
    targets = profiler.profile(target_df).targets
    assert "label" in targets
    assert isinstance(targets["label"], TargetProfileResult)
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
# ---------------------------------------------------------------------------
|
|
177
|
+
# Empty DataFrame does not crash
|
|
178
|
+
# ---------------------------------------------------------------------------
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def test_empty_dataframe_does_not_crash(empty_df):
    """Profiling the empty fixture still returns a StructuralProfileResult."""
    profiler = StructuralProfiler(ProfileConfig())
    outcome = profiler.profile(empty_df)
    assert isinstance(outcome, StructuralProfileResult)
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
# ---------------------------------------------------------------------------
|
|
187
|
+
# Numeric handoff: float column produces NumericStats on ColumnProfile
|
|
188
|
+
# ---------------------------------------------------------------------------
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def test_numeric_handoff(mixed_df):
    """The 'income' column profile carries NumericStats."""
    profiler = StructuralProfiler(ProfileConfig())
    income_stats = profiler.profile(mixed_df).columns["income"].stats
    assert income_stats is not None
    assert isinstance(income_stats, NumericStats)
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
# ---------------------------------------------------------------------------
|
|
199
|
+
# Datetime handoff: date column produces DatetimeStats on ColumnProfile
|
|
200
|
+
# ---------------------------------------------------------------------------
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def test_datetime_handoff(mixed_df):
    """The 'joined' column profile carries DatetimeStats."""
    profiler = StructuralProfiler(ProfileConfig())
    joined_stats = profiler.profile(mixed_df).columns["joined"].stats
    assert joined_stats is not None
    assert isinstance(joined_stats, DatetimeStats)
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
# ---------------------------------------------------------------------------
|
|
211
|
+
# Missingness surfaced at column level for columns with nulls
|
|
212
|
+
# ---------------------------------------------------------------------------
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
def test_missingness_surfaced(mixed_df):
    """A column containing nulls reports a positive standard null count."""
    profiler = StructuralProfiler(ProfileConfig())
    # salary has ~10 % nulls by construction
    missingness = profiler.profile(mixed_df).columns["salary"].missingness
    assert missingness is not None
    assert missingness.standard_null_count > 0
|
tests/unit/__init__.py
ADDED
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
from datetime import date, timedelta
|
|
2
|
+
|
|
3
|
+
import polars as pl
|
|
4
|
+
import pytest
|
|
5
|
+
|
|
6
|
+
# Shared parameters for the fixtures in this module.
_N = 60  # row count used by the multi-row fixtures
_BASE_DATE = date(2023, 1, 1)  # first (or only) date in the date columns
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@pytest.fixture(scope="session")
def empty_df() -> pl.DataFrame:
    """Zero-row frame with Float64, Int64, Utf8, Boolean and Date columns."""
    dtypes = {
        "score": pl.Float64,
        "count": pl.Int64,
        "category": pl.Utf8,
        "active": pl.Boolean,
        "event_date": pl.Date,
    }
    return pl.DataFrame(
        {name: pl.Series([], dtype=dtype) for name, dtype in dtypes.items()}
    )
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@pytest.fixture(scope="session")
def all_null_df() -> pl.DataFrame:
    """Frame of _N rows in which every value of every typed column is null."""
    dtypes = {
        "float_col": pl.Float64,
        "int_col": pl.Int64,
        "str_col": pl.Utf8,
        "bool_col": pl.Boolean,
    }
    return pl.DataFrame(
        {name: pl.Series([None] * _N, dtype=dtype) for name, dtype in dtypes.items()}
    )
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@pytest.fixture(scope="session")
def single_value_df() -> pl.DataFrame:
    """Frame of _N rows where each column repeats one constant value."""
    constants = {
        "score": (5.0, pl.Float64),
        "count": (1, pl.Int64),
        "category": ("X", pl.Utf8),
        "active": (True, pl.Boolean),
    }
    return pl.DataFrame(
        {
            name: pl.Series([value] * _N, dtype=dtype)
            for name, (value, dtype) in constants.items()
        }
    )
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@pytest.fixture(scope="session")
def single_row_df() -> pl.DataFrame:
    """One-row frame with Float64, Int64, Utf8, Boolean and Date columns."""
    row = {
        "score": (42.0, pl.Float64),
        "count": (7, pl.Int64),
        "category": ("A", pl.Utf8),
        "active": (True, pl.Boolean),
        "event_date": (_BASE_DATE, pl.Date),
    }
    return pl.DataFrame(
        {name: pl.Series([value], dtype=dtype) for name, (value, dtype) in row.items()}
    )
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@pytest.fixture(scope="session")
def normal_mixed_df() -> pl.DataFrame:
    """Deterministic _N-row frame mixing numeric, string, boolean and date columns.

    All values are derived from the row index, so the frame is identical on
    every run. ``salary`` is null on every tenth row (index 0, 10, 20, ...).
    """
    labels = ["A", "B", "C", "D", "E"]
    rows = range(_N)

    return pl.DataFrame(
        {
            "score": pl.Series(
                [round(1.5 + i * 1.7 + (i % 7) * 0.3, 2) for i in rows],
                dtype=pl.Float64,
            ),
            "count": pl.Series([i % 20 for i in rows], dtype=pl.Int64),
            "category": pl.Series(
                [labels[i % len(labels)] for i in rows], dtype=pl.Utf8
            ),
            "active": pl.Series([i % 2 == 0 for i in rows], dtype=pl.Boolean),
            "event_date": pl.Series(
                [_BASE_DATE + timedelta(days=i) for i in rows], dtype=pl.Date
            ),
            "salary": pl.Series(
                [
                    None if i % 10 == 0 else round(30_000.0 + i * 500.0, 2)
                    for i in rows
                ],
                dtype=pl.Float64,
            ),
        }
    )
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
import polars as pl
|
|
2
|
+
|
|
3
|
+
from ....profiling._boolean_profiler import BooleanProfiler
|
|
4
|
+
from ....profiling._boolean_config import BooleanProfileResult, BooleanStats
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
# ---------------------------------------------------------------------------
|
|
8
|
+
# Result type & analysed_columns
|
|
9
|
+
# ---------------------------------------------------------------------------
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def test_result_type_and_analysed_columns():
    """Only the Boolean column is analysed when a Float64 column is also requested."""
    frame = pl.DataFrame(
        {
            "flag": pl.Series([True, False, True], dtype=pl.Boolean),
            "score": pl.Series([1.0, 2.0, 3.0], dtype=pl.Float64),
        }
    )
    result = BooleanProfiler().profile(frame, ["flag", "score"])

    assert isinstance(result, BooleanProfileResult)
    analysed = result.analysed_columns
    assert "flag" in analysed
    assert "score" not in analysed
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# ---------------------------------------------------------------------------
|
|
26
|
+
# Counts
|
|
27
|
+
# ---------------------------------------------------------------------------
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def test_true_false_count_sum_equals_non_null_count():
    """true_count + false_count equals the number of non-null values."""
    frame = pl.DataFrame(
        {"flag": pl.Series([True, False, True, None, True, False, None], dtype=pl.Boolean)}
    )
    stats = BooleanProfiler().profile(frame, ["flag"]).columns["flag"]

    expected_total = frame["flag"].drop_nulls().len()
    assert stats.true_count + stats.false_count == expected_total
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# ---------------------------------------------------------------------------
|
|
39
|
+
# Ratios
|
|
40
|
+
# ---------------------------------------------------------------------------
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def test_true_ratio_plus_false_ratio_equals_one():
    """For a null-free column, true_ratio and false_ratio sum to 1."""
    flags = pl.Series([True, True, False, True, False, True], dtype=pl.Boolean)
    frame = pl.DataFrame({"flag": flags})
    stats = BooleanProfiler().profile(frame, ["flag"]).columns["flag"]

    ratio_sum = stats.true_ratio + stats.false_ratio
    assert abs(ratio_sum - 1.0) < 1e-10
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
# ---------------------------------------------------------------------------
|
|
51
|
+
# Mode
|
|
52
|
+
# ---------------------------------------------------------------------------
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def test_tied_column_mode_is_none():
    """An even 5/5 split between True and False reports mode as None."""
    tied = pl.Series([True] * 5 + [False] * 5, dtype=pl.Boolean)
    frame = pl.DataFrame({"flag": tied})
    result = BooleanProfiler().profile(frame, ["flag"])
    assert result.columns["flag"].mode is None
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
# ---------------------------------------------------------------------------
|
|
63
|
+
# Integer {0, 1} columns
|
|
64
|
+
# ---------------------------------------------------------------------------
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def test_integer_01_eligible_with_correct_counts_and_ratios():
    """An Int64 column holding only 0, 1 and nulls is analysed with 1 counted as true."""
    values = [1, 0, 1, 1, 0, None]
    frame = pl.DataFrame({"bin": pl.Series(values, dtype=pl.Int64)})
    result = BooleanProfiler().profile(frame, ["bin"])
    assert "bin" in result.analysed_columns

    # Expected counts come from the raw list: the sum of 0/1 values is the
    # number of ones, and the remaining non-null entries are zeros.
    present = [v for v in values if v is not None]
    ones = sum(present)
    zeros = len(present) - ones

    stats = result.columns["bin"]
    assert stats.true_count == ones
    assert stats.false_count == zeros
    assert abs(stats.true_ratio + stats.false_ratio - 1.0) < 1e-10
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
# ---------------------------------------------------------------------------
|
|
82
|
+
# All-null boolean column
|
|
83
|
+
# ---------------------------------------------------------------------------
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def test_all_null_boolean_returns_default_stats_without_crashing():
    """An all-null Boolean column still yields BooleanStats, with zero counts."""
    all_null = pl.Series([None, None, None], dtype=pl.Boolean)
    frame = pl.DataFrame({"flag": all_null})
    stats = BooleanProfiler().profile(frame, ["flag"]).columns["flag"]

    assert isinstance(stats, BooleanStats)
    assert stats.true_count == 0
    assert stats.false_count == 0
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
import polars as pl
|
|
2
|
+
import pytest
|
|
3
|
+
|
|
4
|
+
from ....profiling._categorical import CategoricalProfiler
|
|
5
|
+
from ....profiling._categorical_config import (
|
|
6
|
+
CategoricalFlag,
|
|
7
|
+
CategoricalProfileResult,
|
|
8
|
+
CategoricalStats,
|
|
9
|
+
ImbalanceMetrics,
|
|
10
|
+
RareCategoryStats,
|
|
11
|
+
TopValueEntry,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# ---------------------------------------------------------------------------
|
|
16
|
+
# Result type & column eligibility
|
|
17
|
+
# ---------------------------------------------------------------------------
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def test_result_type(normal_mixed_df):
    """profile() returns a CategoricalProfileResult."""
    profiler = CategoricalProfiler()
    outcome = profiler.profile(normal_mixed_df, ["category"])
    assert isinstance(outcome, CategoricalProfileResult)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def test_analysed_columns_only_eligible(normal_mixed_df):
    """Of the requested 'category' and 'score' columns, only 'category' is analysed."""
    requested = ["category", "score"]
    analysed = CategoricalProfiler().profile(normal_mixed_df, requested).analysed_columns
    assert "score" not in analysed
    assert "category" in analysed
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def test_analysed_columns_matches_columns_dict(normal_mixed_df):
    """analysed_columns names exactly the keys of the per-column stats mapping."""
    result = CategoricalProfiler().profile(normal_mixed_df, ["category"])
    listed = set(result.analysed_columns)
    keyed = set(result.columns.keys())
    assert listed == keyed
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def test_stats_type_per_column(normal_mixed_df):
    """The per-column entry for 'category' is a CategoricalStats instance."""
    per_column = CategoricalProfiler().profile(normal_mixed_df, ["category"]).columns
    assert isinstance(per_column["category"], CategoricalStats)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
# ---------------------------------------------------------------------------
|
|
42
|
+
# Cardinality & unique_ratio
|
|
43
|
+
# ---------------------------------------------------------------------------
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def test_cardinality_equals_distinct_non_null_count():
    """Five distinct values over 60 null-free rows give cardinality 5."""
    values = ["A", "B", "C", "D", "E"] * 12
    frame = pl.DataFrame({"cat": pl.Series(values, dtype=pl.Utf8)})
    result = CategoricalProfiler().profile(frame, ["cat"])
    assert result.columns["cat"].cardinality == 5
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def test_cardinality_excludes_nulls():
    """Nulls are not counted: four distinct values plus 20 nulls give cardinality 4."""
    values = ["A", "B", "C", "D"] * 10 + [None] * 20
    frame = pl.DataFrame({"cat": pl.Series(values, dtype=pl.Utf8)})
    result = CategoricalProfiler().profile(frame, ["cat"])
    assert result.columns["cat"].cardinality == 4
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def test_unique_ratio_equals_cardinality_over_n_rows():
    """With 5 distinct values in 60 rows, unique_ratio is 5/60."""
    values = ["A", "B", "C", "D", "E"] * 12
    frame = pl.DataFrame({"cat": pl.Series(values, dtype=pl.Utf8)})
    unique_ratio = CategoricalProfiler().profile(frame, ["cat"]).columns["cat"].unique_ratio
    assert abs(unique_ratio - 5 / 60) < 1e-10
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
# ---------------------------------------------------------------------------
|
|
70
|
+
# top_values
|
|
71
|
+
# ---------------------------------------------------------------------------
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def test_top_values_at_most_ten_entries():
    """With 15 distinct categories in 60 rows, top_values holds exactly 10 entries."""
    values = [str(i) for i in range(15)] * 4
    frame = pl.DataFrame({"cat": pl.Series(values, dtype=pl.Utf8)})
    top_values = CategoricalProfiler().profile(frame, ["cat"]).columns["cat"].top_values
    assert len(top_values) == 10
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def test_top_values_descending_count_order():
    """top_values is ordered by descending count for uneven counts (A=30, B=20, C=10)."""
    values = ["A"] * 30 + ["B"] * 20 + ["C"] * 10
    frame = pl.DataFrame({"cat": pl.Series(values, dtype=pl.Utf8)})
    top_values = CategoricalProfiler().profile(frame, ["cat"]).columns["cat"].top_values

    observed = [entry.count for entry in top_values]
    assert observed == sorted(observed, reverse=True)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def test_top_values_entries_are_top_value_entry_type():
    """Every element of top_values is a TopValueEntry."""
    values = ["X", "Y", "Z"] * 20
    frame = pl.DataFrame({"cat": pl.Series(values, dtype=pl.Utf8)})
    top_values = CategoricalProfiler().profile(frame, ["cat"]).columns["cat"].top_values
    assert all(isinstance(entry, TopValueEntry) for entry in top_values)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
# ---------------------------------------------------------------------------
|
|
100
|
+
# Imbalance metrics
|
|
101
|
+
# ---------------------------------------------------------------------------
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def test_imbalance_fields_present_for_multi_category():
    """A three-category column (30/20/10) gets populated ImbalanceMetrics."""
    values = ["A"] * 30 + ["B"] * 20 + ["C"] * 10
    frame = pl.DataFrame({"cat": pl.Series(values, dtype=pl.Utf8)})
    imbalance = CategoricalProfiler().profile(frame, ["cat"]).columns["cat"].imbalance

    assert isinstance(imbalance, ImbalanceMetrics)
    assert imbalance.shannon_entropy > 0.0
    assert imbalance.gini_impurity > 0.0
    assert imbalance.class_ratio >= 1.0
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def test_imbalance_class_ratio_is_one_for_balanced():
    """Equal category counts (max_freq == min_freq) give class_ratio 1.0."""
    values = ["A"] * 20 + ["B"] * 20 + ["C"] * 20
    frame = pl.DataFrame({"cat": pl.Series(values, dtype=pl.Utf8)})
    imbalance = CategoricalProfiler().profile(frame, ["cat"]).columns["cat"].imbalance
    assert abs(imbalance.class_ratio - 1.0) < 1e-10
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
# ---------------------------------------------------------------------------
|
|
123
|
+
# NearConstant flag
|
|
124
|
+
# ---------------------------------------------------------------------------
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def test_near_constant_flag_set():
    """A column dominated by one value is flagged NearConstant.

    Here 55 of 60 rows are "A": 55/60 = 0.917 > 0.90.
    """
    values = ["A"] * 55 + ["B"] * 5
    frame = pl.DataFrame({"cat": pl.Series(values, dtype=pl.Utf8)})
    flags = CategoricalProfiler().profile(frame, ["cat"]).columns["cat"].flags
    assert CategoricalFlag.NearConstant in flags
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def test_near_constant_flag_absent_for_balanced():
    """Three equally frequent categories do not trigger NearConstant."""
    values = ["A"] * 20 + ["B"] * 20 + ["C"] * 20
    frame = pl.DataFrame({"cat": pl.Series(values, dtype=pl.Utf8)})
    flags = CategoricalProfiler().profile(frame, ["cat"]).columns["cat"].flags
    assert CategoricalFlag.NearConstant not in flags
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
# ---------------------------------------------------------------------------
|
|
143
|
+
# Rare categories
|
|
144
|
+
# ---------------------------------------------------------------------------
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def test_rare_category_count_correct():
    """Exactly one category is counted as rare in a 200-row column.

    200 rows: threshold_abs = max(1, floor(0.01*200)) = 2, and "C" appears
    once, so count=1 < 2 makes it rare.
    """
    values = ["A"] * 190 + ["B"] * 9 + ["C"] * 1
    frame = pl.DataFrame({"cat": pl.Series(values, dtype=pl.Utf8)})
    rare = CategoricalProfiler().profile(frame, ["cat"]).columns["cat"].rare_categories
    assert rare.rare_category_count == 1
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def test_rare_category_count_zero_when_none_rare():
    """Two categories of 100 rows each produce no rare categories."""
    values = ["A"] * 100 + ["B"] * 100
    frame = pl.DataFrame({"cat": pl.Series(values, dtype=pl.Utf8)})
    rare = CategoricalProfiler().profile(frame, ["cat"]).columns["cat"].rare_categories
    assert rare.rare_category_count == 0
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
# ---------------------------------------------------------------------------
|
|
165
|
+
# MixedType flag
|
|
166
|
+
# ---------------------------------------------------------------------------
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def test_mixed_type_flag_set():
    """A string column with a sizeable numeric-looking minority is flagged MixedType.

    10 numeric-looking strings + 50 non-numeric gives a minority pct of about
    16.7%; the Wilson lower bound is well above the 5% threshold.
    """
    values = ["apple"] * 50 + ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]
    frame = pl.DataFrame({"cat": pl.Series(values, dtype=pl.Utf8)})
    flags = CategoricalProfiler().profile(frame, ["cat"]).columns["cat"].flags
    assert CategoricalFlag.MixedType in flags
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def test_mixed_type_flag_absent_for_pure_strings():
    """A column of purely non-numeric strings is not flagged MixedType."""
    values = ["apple", "banana", "cherry"] * 20
    frame = pl.DataFrame({"cat": pl.Series(values, dtype=pl.Utf8)})
    flags = CategoricalProfiler().profile(frame, ["cat"]).columns["cat"].flags
    assert CategoricalFlag.MixedType not in flags
|