dataforge-ml 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataforge_ml-0.1.0.dist-info/METADATA +34 -0
- dataforge_ml-0.1.0.dist-info/RECORD +54 -0
- dataforge_ml-0.1.0.dist-info/WHEEL +5 -0
- dataforge_ml-0.1.0.dist-info/licenses/LICENSE +21 -0
- dataforge_ml-0.1.0.dist-info/top_level.txt +5 -0
- models/__init__.py +0 -0
- models/_data_structure.py +7 -0
- models/_data_types.py +12 -0
- profiling/__init__.py +35 -0
- profiling/_base.py +101 -0
- profiling/_boolean_config.py +37 -0
- profiling/_boolean_profiler.py +191 -0
- profiling/_categorical.py +315 -0
- profiling/_categorical_config.py +87 -0
- profiling/_correlation_config.py +225 -0
- profiling/_correlation_profiler.py +544 -0
- profiling/_datetime_config.py +98 -0
- profiling/_datetime_profiler.py +406 -0
- profiling/_missingness_config.py +137 -0
- profiling/_missingness_profiler.py +252 -0
- profiling/_numeric_config.py +116 -0
- profiling/_numeric_profiler.py +403 -0
- profiling/_tabular.py +249 -0
- profiling/_target_config.py +74 -0
- profiling/_target_profiler.py +156 -0
- profiling/_text_config.py +40 -0
- profiling/_text_profiler.py +194 -0
- profiling/_type_detector.py +463 -0
- profiling/config.py +236 -0
- profiling/structural.py +280 -0
- splitting/__init__.py +4 -0
- splitting/_config.py +56 -0
- splitting/_splitter.py +202 -0
- tests/__init__.py +0 -0
- tests/conftest.py +7 -0
- tests/integration/__init__.py +0 -0
- tests/integration/conftest.py +82 -0
- tests/integration/test_structural_end_to_end.py +219 -0
- tests/unit/__init__.py +0 -0
- tests/unit/profiling/__init__.py +0 -0
- tests/unit/profiling/conftest.py +81 -0
- tests/unit/profiling/test_boolean_profiler.py +91 -0
- tests/unit/profiling/test_categorical_profiler.py +182 -0
- tests/unit/profiling/test_correlation_profiler.py +124 -0
- tests/unit/profiling/test_datetime_profiler.py +133 -0
- tests/unit/profiling/test_missingness_profiler.py +51 -0
- tests/unit/profiling/test_numeric_profiler.py +212 -0
- tests/unit/profiling/test_target_profiler.py +44 -0
- tests/unit/profiling/test_text_profiler.py +61 -0
- tests/unit/profiling/test_type_detector.py +32 -0
- tests/unit/splitting/__init__.py +0 -0
- tests/unit/splitting/test_data_splitter.py +417 -0
- utils/__init__.py +0 -0
- utils/data_loader.py +110 -0
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
import polars as pl
|
|
2
|
+
|
|
3
|
+
from ....profiling._correlation_profiler import CorrelationProfiler
|
|
4
|
+
from ....profiling._correlation_config import CorrelationProfileResult
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
# ---------------------------------------------------------------------------
|
|
8
|
+
# Helpers
|
|
9
|
+
# ---------------------------------------------------------------------------
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _make_df_three_cols():
    """Return a 50-row frame holding three Float64 columns.

    Columns ``a``, ``b`` and ``c`` are the modular sequences ``i % 7``,
    ``(i * 3) % 11`` and ``i % 13`` for ``i`` in ``range(50)``.
    """
    indices = range(50)
    raw_columns = {
        "a": [float(i % 7) for i in indices],
        "b": [float((i * 3) % 11) for i in indices],
        "c": [float(i % 13) for i in indices],
    }
    return pl.DataFrame(
        {
            label: pl.Series(values, dtype=pl.Float64)
            for label, values in raw_columns.items()
        }
    )
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _make_df_with_duplicate():
    """Return a 40-row frame with a duplicated column and a linear transform.

    ``x`` is ``0.0 .. 39.0``, ``x_copy`` holds exactly the same values, and
    ``y`` is ``0.3 * x + 7.0`` (an exact linear function of ``x``).
    """
    base_values = [float(i) for i in range(40)]
    scaled_values = [v * 0.3 + 7.0 for v in base_values]
    return pl.DataFrame(
        {
            "x": pl.Series(base_values, dtype=pl.Float64),
            "x_copy": pl.Series(base_values, dtype=pl.Float64),
            "y": pl.Series(scaled_values, dtype=pl.Float64),
        }
    )
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# ---------------------------------------------------------------------------
|
|
41
|
+
# Pearson matrix symmetry (profile_features)
|
|
42
|
+
# ---------------------------------------------------------------------------
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def test_pearson_matrix_is_symmetric():
    """Check ``pearson_matrix[x][y] == pearson_matrix[y][x]`` for every column pair."""
    frame = _make_df_three_cols()
    names = ["a", "b", "c"]
    outcome = CorrelationProfiler(numeric_columns=names).profile_features(frame, names)
    # Enumerate the full cross product, diagonal included.
    all_pairs = [(row, col) for row in names for col in names]
    for row, col in all_pairs:
        assert outcome.pearson_matrix[row][col] == outcome.pearson_matrix[col][row]
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
# ---------------------------------------------------------------------------
|
|
55
|
+
# Spearman matrix symmetry (profile_features)
|
|
56
|
+
# ---------------------------------------------------------------------------
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def test_spearman_matrix_is_symmetric():
    """Check ``spearman_matrix[x][y] == spearman_matrix[y][x]`` for every column pair."""
    frame = _make_df_three_cols()
    names = ["a", "b", "c"]
    outcome = CorrelationProfiler(numeric_columns=names).profile_features(frame, names)
    # Enumerate the full cross product, diagonal included.
    all_pairs = [(row, col) for row in names for col in names]
    for row, col in all_pairs:
        assert outcome.spearman_matrix[row][col] == outcome.spearman_matrix[col][row]
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
# ---------------------------------------------------------------------------
|
|
69
|
+
# Every group column appears in at least one near-redundant pairwise entry
|
|
70
|
+
# (profile_features)
|
|
71
|
+
# ---------------------------------------------------------------------------
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def test_near_redundancy_group_columns_have_redundant_pairs():
    """Every column of a near-redundancy group occurs in a near-redundant pair.

    The fixture contains two identical columns, so at least one group is
    expected; the explicit guard below keeps the test from passing vacuously
    when the profiler reports no groups at all.
    """
    df = _make_df_with_duplicate()
    cols = ["x", "x_copy", "y"]
    result = CorrelationProfiler(numeric_columns=cols).profile_features(df, cols)

    groups = result.near_redundancy_groups
    # Without this guard the nested loops never run on an empty result and
    # the test would succeed without checking anything.
    assert groups, "expected at least one near-redundancy group"

    # Collect, once, every column that takes part in a near-redundant pair.
    redundant_members = set()
    for pair in result.pairwise:
        if pair.near_redundant:
            redundant_members.update((pair.col_a, pair.col_b))

    for group in groups:
        for col in group.columns:
            assert col in redundant_members
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
# ---------------------------------------------------------------------------
|
|
88
|
+
# suggested_drop is a strict subset of the group's columns (profile_features)
|
|
89
|
+
# ---------------------------------------------------------------------------
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def test_suggested_drop_is_strict_subset_of_group_columns():
    """Each group's ``suggested_drop`` must be a proper subset of its ``columns``."""
    frame = _make_df_with_duplicate()
    names = ["x", "x_copy", "y"]
    outcome = CorrelationProfiler(numeric_columns=names).profile_features(frame, names)
    for grp in outcome.near_redundancy_groups:
        # ``<`` on sets is the proper-subset test: it rejects dropping every
        # column of the group. NOTE(review): an empty drop set also satisfies
        # it, so non-emptiness is not verified here.
        assert set(grp.suggested_drop) < set(grp.columns)
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
# ---------------------------------------------------------------------------
|
|
103
|
+
# Identical columns produce a near-redundant pair; profile_target also works
|
|
104
|
+
# ---------------------------------------------------------------------------
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def test_identical_columns_produce_near_redundant_pair():
    """Exercise both entry points of ``CorrelationProfiler`` on duplicated columns."""
    frame = _make_df_with_duplicate()
    feature_names = ["x", "x_copy", "y"]
    corr_profiler = CorrelationProfiler(numeric_columns=feature_names)

    # Entry point 1: the (x, x_copy) pair must be listed as near-redundant,
    # regardless of which member is stored as col_a or col_b.
    features_outcome = corr_profiler.profile_features(frame, feature_names)
    reported_pairs = [
        {pair.col_a, pair.col_b} for pair in features_outcome.near_redundant_pairs
    ]
    assert any(members == {"x", "x_copy"} for members in reported_pairs)

    # Entry point 2: profile_target must return a result carrying the target name.
    target_outcome = corr_profiler.profile_target(
        frame, features_outcome, feature_names, [], "y"
    )
    assert isinstance(target_outcome, CorrelationProfileResult)
    assert target_outcome.target_column == "y"
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
from datetime import date, datetime, timedelta, timezone
|
|
2
|
+
|
|
3
|
+
import polars as pl
|
|
4
|
+
|
|
5
|
+
from ....profiling._datetime_profiler import DatetimeProfiler
|
|
6
|
+
from ....profiling._datetime_config import (
|
|
7
|
+
DatetimeFlag,
|
|
8
|
+
DatetimeProfileResult,
|
|
9
|
+
InferredGranularity,
|
|
10
|
+
)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
# ---------------------------------------------------------------------------
|
|
14
|
+
# Result type & analysed_columns
|
|
15
|
+
# ---------------------------------------------------------------------------
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def test_result_type_and_analysed_columns():
    """The Date column is analysed; the Float64 column passed alongside it is not."""
    event_dates = [date(2024, 1, day) for day in range(1, 6)]
    frame = pl.DataFrame(
        {
            "event_date": pl.Series(event_dates, dtype=pl.Date),
            "score": pl.Series([1.0, 2.0, 3.0, 4.0, 5.0], dtype=pl.Float64),
        }
    )
    outcome = DatetimeProfiler().profile(frame, ["event_date", "score"])
    assert isinstance(outcome, DatetimeProfileResult)
    analysed = outcome.analysed_columns
    assert "event_date" in analysed
    assert "score" not in analysed
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# ---------------------------------------------------------------------------
|
|
34
|
+
# Range: min_date <= max_date
|
|
35
|
+
# ---------------------------------------------------------------------------
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def test_min_date_lte_max_date():
    """``min_date`` must not exceed ``max_date`` for unsorted input dates."""
    unsorted_dates = [date(2024, 1, 1), date(2024, 6, 15), date(2024, 3, 10)]
    frame = pl.DataFrame({"ts": pl.Series(unsorted_dates, dtype=pl.Date)})
    outcome = DatetimeProfiler().profile(frame, ["ts"])
    column_stats = outcome.columns["ts"]
    assert column_stats.min_date <= column_stats.max_date
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
# ---------------------------------------------------------------------------
|
|
46
|
+
# Range: range_days non-negative and consistent with min/max
|
|
47
|
+
# ---------------------------------------------------------------------------
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def test_range_days_non_negative_and_consistent():
    """``date_range_days`` is non-negative and equals ``max_date - min_date`` in days."""
    start = date(2024, 1, 1)
    consecutive_days = [start + timedelta(days=offset) for offset in range(30)]
    frame = pl.DataFrame({"ts": pl.Series(consecutive_days, dtype=pl.Date)})
    column_stats = DatetimeProfiler().profile(frame, ["ts"]).columns["ts"]
    assert column_stats.date_range_days >= 0
    # Recompute the span from the reported endpoints; 86 400 seconds per day.
    span = column_stats.max_date - column_stats.min_date
    span_in_days = span.total_seconds() / 86_400.0
    assert abs(column_stats.date_range_days - span_in_days) < 1e-6
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
# ---------------------------------------------------------------------------
|
|
60
|
+
# FutureDates flag present and absent
|
|
61
|
+
# ---------------------------------------------------------------------------
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def test_future_dates_flag_present_and_absent():
    """``FutureDates`` is raised when far-future dates are present, and not otherwise."""
    historic = [date(2020, 1, 1), date(2021, 6, 1), date(2022, 12, 31)]
    # Years 2099/2100 are assumed to lie after the date the test is run.
    upcoming = [date(2099, 1, 1), date(2100, 6, 1)]

    def _flags_for(values):
        # Profile a single Date column named "ts" and return its flags.
        frame = pl.DataFrame({"ts": pl.Series(values, dtype=pl.Date)})
        return DatetimeProfiler().profile(frame, ["ts"]).columns["ts"].flags

    assert DatetimeFlag.FutureDates in _flags_for(historic + upcoming)
    assert DatetimeFlag.FutureDates not in _flags_for(historic)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
# ---------------------------------------------------------------------------
|
|
80
|
+
# Inferred granularity bands: daily, hourly, monthly
|
|
81
|
+
# ---------------------------------------------------------------------------
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def test_inferred_granularity_daily_hourly_monthly():
    """Regularly spaced UTC timestamps map to the matching granularity band."""
    origin = datetime(2024, 1, 1, tzinfo=timezone.utc)
    # (timestamps, expected granularity), checked in this order:
    #   30 stamps one day apart, 48 stamps one hour apart,
    #   and the first day of each month of 2024.
    scenarios = [
        (
            [origin + timedelta(days=step) for step in range(30)],
            InferredGranularity.Daily,
        ),
        (
            [origin + timedelta(hours=step) for step in range(48)],
            InferredGranularity.Hourly,
        ),
        (
            [datetime(2024, month, 1, tzinfo=timezone.utc) for month in range(1, 13)],
            InferredGranularity.Monthly,
        ),
    ]
    for stamps, expected in scenarios:
        frame = pl.DataFrame(
            {"ts": pl.Series(stamps, dtype=pl.Datetime("us", "UTC"))}
        )
        column_stats = DatetimeProfiler().profile(frame, ["ts"]).columns["ts"]
        assert column_stats.inferred_granularity == expected
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
# ---------------------------------------------------------------------------
|
|
113
|
+
# Temporal signals: month and day-of-week vary, year does not
|
|
114
|
+
# ---------------------------------------------------------------------------
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def test_temporal_signals_month_and_dow_vary_not_year():
    """Dates within one year but different months/weekdays set only those signals."""
    # (month, day) pairs in 2024, each on a different weekday:
    #   Jan 1 Mon, Feb 4 Sun, Mar 6 Wed, Apr 12 Fri, May 14 Tue, Jun 22 Sat.
    month_day_pairs = [(1, 1), (2, 4), (3, 6), (4, 12), (5, 14), (6, 22)]
    same_year_dates = [date(2024, month, day) for month, day in month_day_pairs]
    frame = pl.DataFrame({"ts": pl.Series(same_year_dates, dtype=pl.Date)})
    signals = DatetimeProfiler().profile(frame, ["ts"]).columns["ts"].signals
    assert signals.has_year is False
    assert signals.has_month is True
    assert signals.has_day_of_week is True
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
import polars as pl
|
|
2
|
+
|
|
3
|
+
from ....profiling._missingness_profiler import MissingnessProfiler
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
# ---------------------------------------------------------------------------
|
|
7
|
+
# null_count equals actual null count in the series
|
|
8
|
+
# ---------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def test_null_count_equals_actual_null_count():
    """``standard_null_count`` matches what polars itself counts as null."""
    raw = [1, None, 3, None, None, 6, 7, None]
    frame = pl.DataFrame({"x": pl.Series(raw, dtype=pl.Int64)})
    column_profile = MissingnessProfiler().profile(frame, ["x"]).columns["x"]
    reported = column_profile.standard_null_count
    assert reported == frame["x"].null_count()
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# ---------------------------------------------------------------------------
|
|
19
|
+
# null_ratio equals null_count / n_rows
|
|
20
|
+
# ---------------------------------------------------------------------------
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def test_null_ratio_equals_null_count_over_n_rows():
    """``standard_null_ratio`` equals ``standard_null_count`` divided by the row count."""
    raw = [10, None, 30, None, 50]
    frame = pl.DataFrame({"x": pl.Series(raw, dtype=pl.Int64)})
    column_profile = MissingnessProfiler().profile(frame, ["x"]).columns["x"]
    # The reference ratio is derived from the profiler's own count.
    reference = column_profile.standard_null_count / frame.height
    difference = column_profile.standard_null_ratio - reference
    assert abs(difference) < 1e-10
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# ---------------------------------------------------------------------------
|
|
32
|
+
# All-null column produces effective_null_ratio == 1.0 without crashing
|
|
33
|
+
# ---------------------------------------------------------------------------
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def test_all_null_column_ratio_is_one():
    """A column of four nulls is profiled with ``effective_null_ratio == 1.0``."""
    all_missing = pl.Series([None] * 4, dtype=pl.Int64)
    frame = pl.DataFrame({"x": all_missing})
    outcome = MissingnessProfiler().profile(frame, ["x"])
    assert outcome.columns["x"].effective_null_ratio == 1.0
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
# ---------------------------------------------------------------------------
|
|
43
|
+
# Fully populated column has null_count == 0 and null_ratio == 0.0
|
|
44
|
+
# ---------------------------------------------------------------------------
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def test_fully_populated_column_has_zero_nulls():
    """A column without nulls reports a zero count and a zero ratio."""
    complete = pl.Series([1, 2, 3, 4, 5], dtype=pl.Int64)
    frame = pl.DataFrame({"x": complete})
    column_profile = MissingnessProfiler().profile(frame, ["x"]).columns["x"]
    assert column_profile.standard_null_count == 0
    assert column_profile.standard_null_ratio == 0.0
|
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
import polars as pl
|
|
2
|
+
import pytest
|
|
3
|
+
|
|
4
|
+
from ....profiling._numeric_profiler import NumericProfiler
|
|
5
|
+
from ....profiling._numeric_config import (
|
|
6
|
+
KurtosisTag,
|
|
7
|
+
NumericFlag,
|
|
8
|
+
NumericProfileResult,
|
|
9
|
+
NumericStats,
|
|
10
|
+
PercentileSnapshot,
|
|
11
|
+
SkewSeverity,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# ---------------------------------------------------------------------------
|
|
16
|
+
# Result type & column eligibility
|
|
17
|
+
# ---------------------------------------------------------------------------
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def test_result_type(normal_mixed_df):
    """``NumericProfiler.profile`` returns a ``NumericProfileResult``."""
    profiler = NumericProfiler()
    outcome = profiler.profile(normal_mixed_df, ["score"])
    assert isinstance(outcome, NumericProfileResult)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def test_analysed_columns_only_eligible(normal_mixed_df):
    """``category`` is left out of ``analysed_columns``; ``score`` and ``salary`` stay in."""
    requested = ["score", "salary", "category"]
    analysed = NumericProfiler().profile(normal_mixed_df, requested).analysed_columns
    assert "category" not in analysed
    assert "score" in analysed
    assert "salary" in analysed
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def test_analysed_columns_matches_columns_dict(normal_mixed_df):
    """``analysed_columns`` and the keys of ``columns`` name the same set of columns."""
    outcome = NumericProfiler().profile(normal_mixed_df, ["score", "salary"])
    listed = set(outcome.analysed_columns)
    keyed = set(outcome.columns.keys())
    assert listed == keyed
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
# ---------------------------------------------------------------------------
|
|
38
|
+
# Core stats present for a normal float column
|
|
39
|
+
# ---------------------------------------------------------------------------
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def test_core_stats_non_null_for_float(normal_mixed_df):
    """All core summary statistics of the ``score`` column are populated."""
    outcome = NumericProfiler().profile(normal_mixed_df, ["score"])
    column_stats = outcome.columns["score"]
    assert column_stats.mean is not None
    assert column_stats.median is not None
    assert column_stats.std is not None
    assert column_stats.min is not None
    assert column_stats.max is not None
    assert column_stats.mean_median_ratio is not None
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def test_min_lte_max(normal_mixed_df):
    """The reported minimum never exceeds the reported maximum."""
    outcome = NumericProfiler().profile(normal_mixed_df, ["score"])
    column_stats = outcome.columns["score"]
    assert column_stats.min <= column_stats.max
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
# ---------------------------------------------------------------------------
|
|
58
|
+
# All-null column
|
|
59
|
+
# ---------------------------------------------------------------------------
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def test_all_null_column_no_crash(all_null_df):
    """An all-null float column is still analysed, with ``None`` summary stats."""
    outcome = NumericProfiler().profile(all_null_df, ["float_col"])
    assert "float_col" in outcome.analysed_columns
    column_stats = outcome.columns["float_col"]
    assert isinstance(column_stats, NumericStats)
    # No values are available, so each statistic is expected to be None.
    assert column_stats.mean is None
    assert column_stats.std is None
    assert column_stats.min is None
    assert column_stats.max is None
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
# ---------------------------------------------------------------------------
|
|
74
|
+
# Single-value column
|
|
75
|
+
# ---------------------------------------------------------------------------
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def test_single_value_std_and_skewness_zero(single_value_df):
    """For the ``single_value_df`` fixture both ``std`` and ``skewness`` are 0.0."""
    outcome = NumericProfiler().profile(single_value_df, ["score"])
    column_stats = outcome.columns["score"]
    assert column_stats.std == 0.0
    assert column_stats.skewness == 0.0
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
# ---------------------------------------------------------------------------
|
|
85
|
+
# ScaleAnomaly flag
|
|
86
|
+
# ---------------------------------------------------------------------------
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def test_scale_anomaly_flag_set():
    """A max/min ratio of 10 000 (0.5 up to 5000) is expected to raise ``ScaleAnomaly``."""
    # NOTE(review): the original author cites a 10^3 ratio threshold; confirm
    # against NumericProfiler.
    wide_range = [0.5, 1.0, 1.5, 2.0, 5000.0] * 12
    frame = pl.DataFrame({"v": pl.Series(wide_range, dtype=pl.Float64)})
    flags = NumericProfiler().profile(frame, ["v"]).columns["v"].flags
    assert NumericFlag.ScaleAnomaly in flags
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def test_scale_anomaly_flag_absent_normal_range():
    """Values spanning only 10 to 50 must not raise ``ScaleAnomaly``."""
    narrow_range = [10.0, 20.0, 30.0, 40.0, 50.0] * 12
    frame = pl.DataFrame({"v": pl.Series(narrow_range, dtype=pl.Float64)})
    flags = NumericProfiler().profile(frame, ["v"]).columns["v"].flags
    assert NumericFlag.ScaleAnomaly not in flags
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
# ---------------------------------------------------------------------------
|
|
103
|
+
# NearConstant flag
|
|
104
|
+
# ---------------------------------------------------------------------------
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def test_near_constant_flag_set():
    """A column where 55 of 60 values are identical is expected to be ``NearConstant``."""
    # Dominant share is 55/60 ≈ 0.917.
    # NOTE(review): the original author cites a 0.90 threshold; confirm
    # against NumericProfiler.
    dominant = [5.0] * 55
    stragglers = [1.0, 2.0, 3.0, 4.0, 6.0]
    frame = pl.DataFrame({"v": pl.Series(dominant + stragglers, dtype=pl.Float64)})
    flags = NumericProfiler().profile(frame, ["v"]).columns["v"].flags
    assert NumericFlag.NearConstant in flags
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def test_near_constant_flag_absent():
    """An even 30/30 split between two values must not be flagged ``NearConstant``."""
    # The most frequent value covers only 30/60 = 0.50 of the rows.
    half_and_half = [5.0] * 30 + [6.0] * 30
    frame = pl.DataFrame({"v": pl.Series(half_and_half, dtype=pl.Float64)})
    flags = NumericProfiler().profile(frame, ["v"]).columns["v"].flags
    assert NumericFlag.NearConstant not in flags
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
# ---------------------------------------------------------------------------
|
|
124
|
+
# Skewness severity bands
|
|
125
|
+
# ---------------------------------------------------------------------------
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def test_skewness_severity_normal():
    """Evenly spaced values 1..60 are symmetric and classified ``SkewSeverity.Normal``."""
    evenly_spaced = list(map(float, range(1, 61)))
    frame = pl.DataFrame({"v": pl.Series(evenly_spaced, dtype=pl.Float64)})
    column_stats = NumericProfiler().profile(frame, ["v"]).columns["v"]
    assert column_stats.skewness_severity == SkewSeverity.Normal
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def test_skewness_severity_severe():
    """A heavy right tail (57 values at 0.1, then 100/200/300) is classified ``Severe``."""
    # NOTE(review): the original author cites |skew| > 2.0 as the Severe band;
    # confirm against NumericProfiler.
    bulk = [0.1] * 57
    tail = [100.0, 200.0, 300.0]
    frame = pl.DataFrame({"v": pl.Series(bulk + tail, dtype=pl.Float64)})
    column_stats = NumericProfiler().profile(frame, ["v"]).columns["v"]
    assert column_stats.skewness_severity == SkewSeverity.Severe
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
# ---------------------------------------------------------------------------
|
|
145
|
+
# Kurtosis tag bands
|
|
146
|
+
# ---------------------------------------------------------------------------
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def test_kurtosis_tag_leptokurtic():
    """A sharp peak at 5.0 with a few symmetric outliers is tagged ``Leptokurtic``."""
    # 54 values at the centre, three at each extreme (0.1 and 9.9).
    centre = [5.0] * 54
    outliers = [0.1] * 3 + [9.9] * 3
    frame = pl.DataFrame({"v": pl.Series(centre + outliers, dtype=pl.Float64)})
    column_stats = NumericProfiler().profile(frame, ["v"]).columns["v"]
    assert column_stats.kurtosis_tag == KurtosisTag.Leptokurtic
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def test_kurtosis_tag_platykurtic():
    """Four equally spaced values with equal counts are tagged ``Platykurtic``."""
    levels = (1.0, 4.0, 7.0, 10.0)
    # 15 repetitions per level, grouped level by level.
    flat = [level for level in levels for _ in range(15)]
    frame = pl.DataFrame({"v": pl.Series(flat, dtype=pl.Float64)})
    column_stats = NumericProfiler().profile(frame, ["v"]).columns["v"]
    assert column_stats.kurtosis_tag == KurtosisTag.Platykurtic
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def test_kurtosis_tag_mesokurtic():
    """A symmetric, bell-like discrete distribution is tagged ``Mesokurtic``."""
    # (value, count) pairs peaking at 4.0; counts sum to 60.
    histogram = [
        (1.0, 3),
        (2.0, 7),
        (3.0, 12),
        (4.0, 16),
        (5.0, 12),
        (6.0, 7),
        (7.0, 3),
    ]
    bell = [value for value, count in histogram for _ in range(count)]
    frame = pl.DataFrame({"v": pl.Series(bell, dtype=pl.Float64)})
    column_stats = NumericProfiler().profile(frame, ["v"]).columns["v"]
    assert column_stats.kurtosis_tag == KurtosisTag.Mesokurtic
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
# ---------------------------------------------------------------------------
|
|
174
|
+
# Percentiles
|
|
175
|
+
# ---------------------------------------------------------------------------
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def test_percentiles_type_and_all_fields_present(normal_mixed_df):
    """``percentiles`` is a ``PercentileSnapshot`` with all seven fields populated."""
    outcome = NumericProfiler().profile(normal_mixed_df, ["score"])
    snapshot = outcome.columns["score"].percentiles
    assert isinstance(snapshot, PercentileSnapshot)
    captured = (
        snapshot.p1,
        snapshot.p5,
        snapshot.p25,
        snapshot.p50,
        snapshot.p75,
        snapshot.p95,
        snapshot.p99,
    )
    assert all(value is not None for value in captured)
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def test_percentiles_monotonically_non_decreasing(normal_mixed_df):
    """Percentile values listed from p1 to p99 are already in sorted order."""
    outcome = NumericProfiler().profile(normal_mixed_df, ["score"])
    snapshot = outcome.columns["score"].percentiles
    in_rank_order = [
        snapshot.p1,
        snapshot.p5,
        snapshot.p25,
        snapshot.p50,
        snapshot.p75,
        snapshot.p95,
        snapshot.p99,
    ]
    assert in_rank_order == sorted(in_rank_order)
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
# ---------------------------------------------------------------------------
|
|
193
|
+
# Discrete vs continuous distribution representation
|
|
194
|
+
# ---------------------------------------------------------------------------
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def test_integer_column_produces_top_values():
    """An Int64 column yields ``top_values`` and an empty ``histogram``."""
    # Five distinct integers (0..4) repeated across 60 rows.
    # NOTE(review): the original author states integer dtypes always take the
    # discrete path; confirm against NumericProfiler.
    cyclic = [index % 5 for index in range(60)]
    frame = pl.DataFrame({"v": pl.Series(cyclic, dtype=pl.Int64)})
    column_stats = NumericProfiler().profile(frame, ["v"]).columns["v"]
    assert len(column_stats.top_values) > 0
    assert len(column_stats.histogram) == 0
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def test_continuous_float_produces_histogram():
    """A Float64 column of 60 distinct values yields a ``histogram`` and no ``top_values``."""
    # NOTE(review): the original author cites a discrete cut-off of 20 unique
    # values (_DISCRETE_MAX_UNIQUE); confirm against NumericProfiler.
    distinct = [round(step * 0.37, 4) for step in range(60)]
    frame = pl.DataFrame({"v": pl.Series(distinct, dtype=pl.Float64)})
    column_stats = NumericProfiler().profile(frame, ["v"]).columns["v"]
    assert len(column_stats.histogram) > 0
    assert len(column_stats.top_values) == 0
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import polars as pl
|
|
2
|
+
import pytest
|
|
3
|
+
|
|
4
|
+
from ....profiling._target_profiler import TargetProfiler
|
|
5
|
+
from ....profiling._target_config import TargetProblemType
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# ---------------------------------------------------------------------------
|
|
9
|
+
# Regression: continuous float target
|
|
10
|
+
# ---------------------------------------------------------------------------
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def test_regression_for_continuous_float_target():
    """A Float64 target with 50 distinct values is classified as ``Regression``."""
    prices = [float(step) * 1.5 for step in range(50)]
    frame = pl.DataFrame({"price": pl.Series(prices, dtype=pl.Float64)})
    profiler = TargetProfiler(target_column="price")
    outcome = profiler.profile(frame)
    assert outcome.problem_type == TargetProblemType.Regression
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# ---------------------------------------------------------------------------
|
|
22
|
+
# Classification: low-cardinality string target
|
|
23
|
+
# ---------------------------------------------------------------------------
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def test_classification_for_low_cardinality_string_target():
    """A three-label string target is classified as one of the classification types."""
    labels = ["cat", "dog", "bird"] * 20
    frame = pl.DataFrame({"label": pl.Series(labels, dtype=pl.Utf8)})
    outcome = TargetProfiler(target_column="label").profile(frame)
    # Either classification flavour is accepted by this test.
    accepted = (
        TargetProblemType.BinaryClassification,
        TargetProblemType.MulticlassClassification,
    )
    assert outcome.problem_type in accepted
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# ---------------------------------------------------------------------------
|
|
37
|
+
# Missing target column raises ValueError
|
|
38
|
+
# ---------------------------------------------------------------------------
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def test_missing_target_column_raises_value_error():
    """Naming a target column absent from the frame raises ``ValueError``."""
    features_only = pl.DataFrame({"feature": pl.Series([1, 2, 3], dtype=pl.Int64)})
    # Construction stays inside the context so an error raised by either the
    # constructor or ``profile`` satisfies the expectation.
    with pytest.raises(ValueError):
        TargetProfiler(target_column="nonexistent").profile(features_only)
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
import polars as pl
|
|
2
|
+
|
|
3
|
+
from ....profiling._text_profiler import TextProfiler
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
# ---------------------------------------------------------------------------
|
|
7
|
+
# vocabulary_size > 0 for a column with distinct tokens
|
|
8
|
+
# ---------------------------------------------------------------------------
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def test_vocabulary_size_positive_for_distinct_tokens():
    """A text column containing several distinct words has a positive ``vocabulary_size``."""
    phrases = ["apple banana", "cherry", "date elderberry"]
    frame = pl.DataFrame({"text": pl.Series(phrases, dtype=pl.Utf8)})
    text_stats = TextProfiler().profile(frame, ["text"]).columns["text"]
    assert text_stats.vocabulary_size > 0
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
# ---------------------------------------------------------------------------
|
|
20
|
+
# char_length_max >= char_length_min for any non-empty text column
|
|
21
|
+
# ---------------------------------------------------------------------------
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def test_char_length_max_gte_min():
    """``char_length_max`` is at least ``char_length_min`` for strings of mixed length."""
    phrases = ["hi", "hello world", "a"]
    frame = pl.DataFrame({"text": pl.Series(phrases, dtype=pl.Utf8)})
    text_stats = TextProfiler().profile(frame, ["text"]).columns["text"]
    assert text_stats.char_length_max >= text_stats.char_length_min
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# ---------------------------------------------------------------------------
|
|
33
|
+
# empty_ratio == 0.0 for no empty strings; == 1.0 for all-empty column
|
|
34
|
+
# ---------------------------------------------------------------------------
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def test_empty_ratio_absent_and_full():
    """``empty_ratio`` is 0.0 with no empty strings and 1.0 when every string is empty."""

    def _empty_ratio_of(strings):
        # Profile a single Utf8 column named "text" and return its empty_ratio.
        frame = pl.DataFrame({"text": pl.Series(strings, dtype=pl.Utf8)})
        return TextProfiler().profile(frame, ["text"]).columns["text"].empty_ratio

    assert _empty_ratio_of(["hello", "world", "foo"]) == 0.0
    assert _empty_ratio_of(["", "", ""]) == 1.0
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
# ---------------------------------------------------------------------------
|
|
52
|
+
# avg_token_count > 0 for a column with multi-word entries
|
|
53
|
+
# ---------------------------------------------------------------------------
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def test_avg_token_count_positive_for_multiword():
    """Multi-word strings produce a positive ``avg_token_count``."""
    sentences = ["the quick brown fox", "hello world", "one two three"]
    frame = pl.DataFrame({"text": pl.Series(sentences, dtype=pl.Utf8)})
    text_stats = TextProfiler().profile(frame, ["text"]).columns["text"]
    assert text_stats.avg_token_count > 0
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
import polars as pl
|
|
2
|
+
|
|
3
|
+
from ....profiling._type_detector import TypeDetector
|
|
4
|
+
from ....profiling.config import SemanticType
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
# ---------------------------------------------------------------------------
|
|
8
|
+
# Native pl.Boolean resolves to SemanticType.Boolean
|
|
9
|
+
# ---------------------------------------------------------------------------
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def test_native_boolean_column_resolves_to_boolean():
    """A ``pl.Boolean`` column is detected as ``SemanticType.Boolean``."""
    switches = pl.Series([True, False, True, True, False], dtype=pl.Boolean)
    frame = pl.DataFrame({"flag": switches})
    detected = TypeDetector(columns=["flag"]).detect(frame)
    assert detected["flag"].semantic_type == SemanticType.Boolean
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# ---------------------------------------------------------------------------
|
|
21
|
+
# High-cardinality string column resolves to Categorical or Text (not Numeric)
|
|
22
|
+
# ---------------------------------------------------------------------------
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def test_high_cardinality_string_not_numeric():
    """A string column with many repeated labels is Categorical or Text, never Numeric."""
    # 80 rows cycling through 40 distinct labels, so each label occurs twice.
    # NOTE(review): the original author cites a 99% uniqueness threshold for
    # identifier detection; confirm against TypeDetector.
    labels = [f"item_{index % 40}" for index in range(80)]
    frame = pl.DataFrame({"name": pl.Series(labels, dtype=pl.Utf8)})
    detected = TypeDetector(columns=["name"]).detect(frame)["name"]
    resolved = detected.semantic_type
    assert resolved in (SemanticType.Categorical, SemanticType.Text)
    assert resolved != SemanticType.Numeric
|
|
File without changes
|