dataforge-ml 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. dataforge_ml-0.1.0.dist-info/METADATA +34 -0
  2. dataforge_ml-0.1.0.dist-info/RECORD +54 -0
  3. dataforge_ml-0.1.0.dist-info/WHEEL +5 -0
  4. dataforge_ml-0.1.0.dist-info/licenses/LICENSE +21 -0
  5. dataforge_ml-0.1.0.dist-info/top_level.txt +5 -0
  6. models/__init__.py +0 -0
  7. models/_data_structure.py +7 -0
  8. models/_data_types.py +12 -0
  9. profiling/__init__.py +35 -0
  10. profiling/_base.py +101 -0
  11. profiling/_boolean_config.py +37 -0
  12. profiling/_boolean_profiler.py +191 -0
  13. profiling/_categorical.py +315 -0
  14. profiling/_categorical_config.py +87 -0
  15. profiling/_correlation_config.py +225 -0
  16. profiling/_correlation_profiler.py +544 -0
  17. profiling/_datetime_config.py +98 -0
  18. profiling/_datetime_profiler.py +406 -0
  19. profiling/_missingness_config.py +137 -0
  20. profiling/_missingness_profiler.py +252 -0
  21. profiling/_numeric_config.py +116 -0
  22. profiling/_numeric_profiler.py +403 -0
  23. profiling/_tabular.py +249 -0
  24. profiling/_target_config.py +74 -0
  25. profiling/_target_profiler.py +156 -0
  26. profiling/_text_config.py +40 -0
  27. profiling/_text_profiler.py +194 -0
  28. profiling/_type_detector.py +463 -0
  29. profiling/config.py +236 -0
  30. profiling/structural.py +280 -0
  31. splitting/__init__.py +4 -0
  32. splitting/_config.py +56 -0
  33. splitting/_splitter.py +202 -0
  34. tests/__init__.py +0 -0
  35. tests/conftest.py +7 -0
  36. tests/integration/__init__.py +0 -0
  37. tests/integration/conftest.py +82 -0
  38. tests/integration/test_structural_end_to_end.py +219 -0
  39. tests/unit/__init__.py +0 -0
  40. tests/unit/profiling/__init__.py +0 -0
  41. tests/unit/profiling/conftest.py +81 -0
  42. tests/unit/profiling/test_boolean_profiler.py +91 -0
  43. tests/unit/profiling/test_categorical_profiler.py +182 -0
  44. tests/unit/profiling/test_correlation_profiler.py +124 -0
  45. tests/unit/profiling/test_datetime_profiler.py +133 -0
  46. tests/unit/profiling/test_missingness_profiler.py +51 -0
  47. tests/unit/profiling/test_numeric_profiler.py +212 -0
  48. tests/unit/profiling/test_target_profiler.py +44 -0
  49. tests/unit/profiling/test_text_profiler.py +61 -0
  50. tests/unit/profiling/test_type_detector.py +32 -0
  51. tests/unit/splitting/__init__.py +0 -0
  52. tests/unit/splitting/test_data_splitter.py +417 -0
  53. utils/__init__.py +0 -0
  54. utils/data_loader.py +110 -0
@@ -0,0 +1,124 @@
1
+ import polars as pl
2
+
3
+ from ....profiling._correlation_profiler import CorrelationProfiler
4
+ from ....profiling._correlation_config import CorrelationProfileResult
5
+
6
+
7
+ # ---------------------------------------------------------------------------
8
+ # Helpers
9
+ # ---------------------------------------------------------------------------
10
+
11
+
12
def _make_df_three_cols():
    """Build a 50-row frame of three Float64 columns with different periods.

    Columns ``a``, ``b`` and ``c`` cycle with moduli 7, 11 and 13, which is
    intended to give moderate (non-trivial) pairwise correlations.
    """
    row_count = 50
    raw_columns = {
        "a": [float(idx % 7) for idx in range(row_count)],
        "b": [float((idx * 3) % 11) for idx in range(row_count)],
        "c": [float(idx % 13) for idx in range(row_count)],
    }
    return pl.DataFrame(
        {
            name: pl.Series(values, dtype=pl.Float64)
            for name, values in raw_columns.items()
        }
    )
25
+
26
+
27
def _make_df_with_duplicate():
    """Build a 40-row frame with a duplicated column and a rescaled one.

    ``x`` and ``x_copy`` hold identical values (0.0 .. 39.0).  ``y`` is
    ``0.3 * x + 7.0``, i.e. an exact increasing affine function of ``x``.
    It is therefore *not* independent of the other two columns: its linear
    and rank correlation with them is perfect (up to floating-point
    rounding), so all three columns may be reported as mutually redundant.
    """
    n = 40
    vals = [float(i) for i in range(n)]
    return pl.DataFrame(
        {
            "x": pl.Series(vals, dtype=pl.Float64),
            "x_copy": pl.Series(vals, dtype=pl.Float64),
            "y": pl.Series([v * 0.3 + 7.0 for v in vals], dtype=pl.Float64),
        }
    )
38
+
39
+
40
+ # ---------------------------------------------------------------------------
41
+ # Pearson matrix symmetry (profile_features)
42
+ # ---------------------------------------------------------------------------
43
+
44
+
45
def test_pearson_matrix_is_symmetric():
    """Pearson entries must be equal when row and column keys are swapped."""
    names = ["a", "b", "c"]
    frame = _make_df_three_cols()
    outcome = CorrelationProfiler(numeric_columns=names).profile_features(frame, names)
    ordered_pairs = [(row, col) for row in names for col in names]
    for row, col in ordered_pairs:
        assert outcome.pearson_matrix[row][col] == outcome.pearson_matrix[col][row]
52
+
53
+
54
+ # ---------------------------------------------------------------------------
55
+ # Spearman matrix symmetry (profile_features)
56
+ # ---------------------------------------------------------------------------
57
+
58
+
59
def test_spearman_matrix_is_symmetric():
    """Spearman entries must be equal when row and column keys are swapped."""
    names = ["a", "b", "c"]
    frame = _make_df_three_cols()
    outcome = CorrelationProfiler(numeric_columns=names).profile_features(frame, names)
    ordered_pairs = [(row, col) for row in names for col in names]
    for row, col in ordered_pairs:
        assert outcome.spearman_matrix[row][col] == outcome.spearman_matrix[col][row]
66
+
67
+
68
+ # ---------------------------------------------------------------------------
69
+ # Every group column appears in at least one near-redundant pairwise entry
70
+ # (profile_features)
71
+ # ---------------------------------------------------------------------------
72
+
73
+
74
def test_near_redundancy_group_columns_have_redundant_pairs():
    """Every column of every near-redundancy group has a near-redundant pair.

    The fixture duplicates ``x`` as ``x_copy``, so at least one group is
    expected.  That expectation is asserted explicitly; without it the
    nested loops below would pass vacuously if no group were reported.
    """
    df = _make_df_with_duplicate()
    cols = ["x", "x_copy", "y"]
    result = CorrelationProfiler(numeric_columns=cols).profile_features(df, cols)

    groups = list(result.near_redundancy_groups)
    assert groups, "expected at least one near-redundancy group"

    for group in groups:
        for col in group.columns:
            has_pair = any(
                (p.col_a == col or p.col_b == col) and p.near_redundant
                for p in result.pairwise
            )
            assert has_pair, f"no near-redundant pairwise entry for column {col!r}"
85
+
86
+
87
+ # ---------------------------------------------------------------------------
88
+ # suggested_drop is a strict subset of the group's columns (profile_features)
89
+ # ---------------------------------------------------------------------------
90
+
91
+
92
def test_suggested_drop_is_strict_subset_of_group_columns():
    """``suggested_drop`` is a non-empty, proper subset of each group's columns.

    ``set_a < set_b`` alone is also true for an empty ``set_a``, so the
    non-empty requirement is asserted separately.  The group list itself is
    checked to be non-empty so the loop cannot pass vacuously.
    """
    df = _make_df_with_duplicate()
    cols = ["x", "x_copy", "y"]
    result = CorrelationProfiler(numeric_columns=cols).profile_features(df, cols)

    groups = list(result.near_redundancy_groups)
    assert groups, "expected at least one near-redundancy group"

    for group in groups:
        drop_set = set(group.suggested_drop)
        col_set = set(group.columns)
        assert drop_set, "suggested_drop must not be empty"
        # Proper subset: at least one column of the group is always kept.
        assert drop_set < col_set
100
+
101
+
102
+ # ---------------------------------------------------------------------------
103
+ # Identical columns produce a near-redundant pair; profile_target also works
104
+ # ---------------------------------------------------------------------------
105
+
106
+
107
def test_identical_columns_produce_near_redundant_pair():
    """Check both entry points: ``profile_features`` then ``profile_target``."""
    frame = _make_df_with_duplicate()
    features = ["x", "x_copy", "y"]
    profiler = CorrelationProfiler(numeric_columns=features)

    # Feature pass: the duplicated pair has to show up as near-redundant.
    features_outcome = profiler.profile_features(frame, features)
    flagged = [
        {pair.col_a, pair.col_b} for pair in features_outcome.near_redundant_pairs
    ]
    assert any(names == {"x", "x_copy"} for names in flagged)

    # Target pass: must run cleanly and report the requested target column.
    target_outcome = profiler.profile_target(
        frame, features_outcome, features, [], "y"
    )
    assert isinstance(target_outcome, CorrelationProfileResult)
    assert target_outcome.target_column == "y"
@@ -0,0 +1,133 @@
1
+ from datetime import date, datetime, timedelta, timezone
2
+
3
+ import polars as pl
4
+
5
+ from ....profiling._datetime_profiler import DatetimeProfiler
6
+ from ....profiling._datetime_config import (
7
+ DatetimeFlag,
8
+ DatetimeProfileResult,
9
+ InferredGranularity,
10
+ )
11
+
12
+
13
+ # ---------------------------------------------------------------------------
14
+ # Result type & analysed_columns
15
+ # ---------------------------------------------------------------------------
16
+
17
+
18
def test_result_type_and_analysed_columns():
    """Only the Date column is analysed; the Float64 column is left out."""
    event_dates = pl.Series(
        [date(2024, 1, offset + 1) for offset in range(5)], dtype=pl.Date
    )
    scores = pl.Series([1.0, 2.0, 3.0, 4.0, 5.0], dtype=pl.Float64)
    frame = pl.DataFrame({"event_date": event_dates, "score": scores})

    outcome = DatetimeProfiler().profile(frame, ["event_date", "score"])

    assert isinstance(outcome, DatetimeProfileResult)
    assert "event_date" in outcome.analysed_columns
    assert "score" not in outcome.analysed_columns
31
+
32
+
33
+ # ---------------------------------------------------------------------------
34
+ # Range: min_date <= max_date
35
+ # ---------------------------------------------------------------------------
36
+
37
+
38
def test_min_date_lte_max_date():
    """The reported minimum date never exceeds the reported maximum date."""
    unordered = [date(2024, 1, 1), date(2024, 6, 15), date(2024, 3, 10)]
    frame = pl.DataFrame({"ts": pl.Series(unordered, dtype=pl.Date)})
    outcome = DatetimeProfiler().profile(frame, ["ts"])
    ts_stats = outcome.columns["ts"]
    assert ts_stats.min_date <= ts_stats.max_date
43
+
44
+
45
+ # ---------------------------------------------------------------------------
46
+ # Range: range_days non-negative and consistent with min/max
47
+ # ---------------------------------------------------------------------------
48
+
49
+
50
def test_range_days_non_negative_and_consistent():
    """``date_range_days`` is >= 0 and agrees with ``max_date - min_date``."""
    start = date(2024, 1, 1)
    consecutive = [start + timedelta(days=offset) for offset in range(30)]
    frame = pl.DataFrame({"ts": pl.Series(consecutive, dtype=pl.Date)})
    ts_stats = DatetimeProfiler().profile(frame, ["ts"]).columns["ts"]

    assert ts_stats.date_range_days >= 0

    span = ts_stats.max_date - ts_stats.min_date
    expected_days = span.total_seconds() / 86_400.0
    assert abs(ts_stats.date_range_days - expected_days) < 1e-6
57
+
58
+
59
+ # ---------------------------------------------------------------------------
60
+ # FutureDates flag present and absent
61
+ # ---------------------------------------------------------------------------
62
+
63
+
64
def test_future_dates_flag_present_and_absent():
    """FutureDates is flagged only when the column holds far-future dates."""
    historical = [date(2020, 1, 1), date(2021, 6, 1), date(2022, 12, 31)]
    far_future = [date(2099, 1, 1), date(2100, 6, 1)]

    # Mixed past and future values: the flag is expected.
    mixed_frame = pl.DataFrame(
        {"ts": pl.Series(historical + far_future, dtype=pl.Date)}
    )
    mixed_stats = DatetimeProfiler().profile(mixed_frame, ["ts"]).columns["ts"]
    assert DatetimeFlag.FutureDates in mixed_stats.flags

    # Past values only: the flag must be absent.
    past_frame = pl.DataFrame({"ts": pl.Series(historical, dtype=pl.Date)})
    past_stats = DatetimeProfiler().profile(past_frame, ["ts"]).columns["ts"]
    assert DatetimeFlag.FutureDates not in past_stats.flags
77
+
78
+
79
+ # ---------------------------------------------------------------------------
80
+ # Inferred granularity bands: daily, hourly, monthly
81
+ # ---------------------------------------------------------------------------
82
+
83
+
84
def test_inferred_granularity_daily_hourly_monthly():
    """Regularly spaced UTC timestamps map to the matching granularity."""
    origin = datetime(2024, 1, 1, tzinfo=timezone.utc)
    utc_micros = pl.Datetime("us", "UTC")

    scenarios = [
        # 30 points one day apart (gap of 86 400 s).
        ([origin + timedelta(days=step) for step in range(30)],
         InferredGranularity.Daily),
        # 48 points one hour apart (gap of 3 600 s).
        ([origin + timedelta(hours=step) for step in range(48)],
         InferredGranularity.Hourly),
        # First day of each month of 2024 (gaps of 29 to 31 days).
        ([datetime(2024, month, 1, tzinfo=timezone.utc) for month in range(1, 13)],
         InferredGranularity.Monthly),
    ]

    for timestamps, expected in scenarios:
        frame = pl.DataFrame({"ts": pl.Series(timestamps, dtype=utc_micros)})
        ts_stats = DatetimeProfiler().profile(frame, ["ts"]).columns["ts"]
        assert ts_stats.inferred_granularity == expected
110
+
111
+
112
+ # ---------------------------------------------------------------------------
113
+ # Temporal signals: month and day-of-week vary, year does not
114
+ # ---------------------------------------------------------------------------
115
+
116
+
117
def test_temporal_signals_month_and_dow_vary_not_year():
    """Month and weekday signals are set; the year signal is not.

    All six dates fall in 2024, each in a different month and on a different
    weekday: Mon 01-01, Sun 02-04, Wed 03-06, Fri 04-12, Tue 05-14, Sat 06-22.
    """
    month_day_pairs = [(1, 1), (2, 4), (3, 6), (4, 12), (5, 14), (6, 22)]
    same_year_dates = [date(2024, month, day) for month, day in month_day_pairs]
    frame = pl.DataFrame({"ts": pl.Series(same_year_dates, dtype=pl.Date)})

    signals = DatetimeProfiler().profile(frame, ["ts"]).columns["ts"].signals

    assert signals.has_year is False
    assert signals.has_month is True
    assert signals.has_day_of_week is True
@@ -0,0 +1,51 @@
1
+ import polars as pl
2
+
3
+ from ....profiling._missingness_profiler import MissingnessProfiler
4
+
5
+
6
+ # ---------------------------------------------------------------------------
7
+ # null_count equals actual null count in the series
8
+ # ---------------------------------------------------------------------------
9
+
10
+
11
def test_null_count_equals_actual_null_count():
    """``standard_null_count`` matches polars' own null count for the column."""
    sparse = [1, None, 3, None, None, 6, 7, None]
    frame = pl.DataFrame({"x": pl.Series(sparse, dtype=pl.Int64)})
    outcome = MissingnessProfiler().profile(frame, ["x"])
    x_profile = outcome.columns["x"]
    assert x_profile.standard_null_count == frame["x"].null_count()
16
+
17
+
18
+ # ---------------------------------------------------------------------------
19
+ # null_ratio equals null_count / n_rows
20
+ # ---------------------------------------------------------------------------
21
+
22
+
23
def test_null_ratio_equals_null_count_over_n_rows():
    """``standard_null_ratio`` equals ``standard_null_count / height``."""
    sparse = [10, None, 30, None, 50]
    frame = pl.DataFrame({"x": pl.Series(sparse, dtype=pl.Int64)})
    x_profile = MissingnessProfiler().profile(frame, ["x"]).columns["x"]

    expected_ratio = x_profile.standard_null_count / frame.height
    deviation = abs(x_profile.standard_null_ratio - expected_ratio)
    assert deviation < 1e-10
29
+
30
+
31
+ # ---------------------------------------------------------------------------
32
+ # All-null column produces effective_null_ratio == 1.0 without crashing
33
+ # ---------------------------------------------------------------------------
34
+
35
+
36
def test_all_null_column_ratio_is_one():
    """An Int64 column made only of nulls reports an effective ratio of 1.0."""
    only_nulls = pl.Series([None, None, None, None], dtype=pl.Int64)
    frame = pl.DataFrame({"x": only_nulls})
    x_profile = MissingnessProfiler().profile(frame, ["x"]).columns["x"]
    assert x_profile.effective_null_ratio == 1.0
40
+
41
+
42
+ # ---------------------------------------------------------------------------
43
+ # Fully populated column has null_count == 0 and null_ratio == 0.0
44
+ # ---------------------------------------------------------------------------
45
+
46
+
47
def test_fully_populated_column_has_zero_nulls():
    """A column without nulls reports a zero count and a zero ratio."""
    dense = pl.Series([1, 2, 3, 4, 5], dtype=pl.Int64)
    frame = pl.DataFrame({"x": dense})
    x_profile = MissingnessProfiler().profile(frame, ["x"]).columns["x"]
    assert x_profile.standard_null_count == 0
    assert x_profile.standard_null_ratio == 0.0
@@ -0,0 +1,212 @@
1
+ import polars as pl
2
+ import pytest
3
+
4
+ from ....profiling._numeric_profiler import NumericProfiler
5
+ from ....profiling._numeric_config import (
6
+ KurtosisTag,
7
+ NumericFlag,
8
+ NumericProfileResult,
9
+ NumericStats,
10
+ PercentileSnapshot,
11
+ SkewSeverity,
12
+ )
13
+
14
+
15
+ # ---------------------------------------------------------------------------
16
+ # Result type & column eligibility
17
+ # ---------------------------------------------------------------------------
18
+
19
+
20
def test_result_type(normal_mixed_df):
    """``profile`` returns a ``NumericProfileResult`` instance."""
    profiler = NumericProfiler()
    outcome = profiler.profile(normal_mixed_df, ["score"])
    assert isinstance(outcome, NumericProfileResult)
23
+
24
+
25
def test_analysed_columns_only_eligible(normal_mixed_df):
    """``category`` is skipped while ``score`` and ``salary`` are analysed."""
    requested = ["score", "salary", "category"]
    analysed = NumericProfiler().profile(normal_mixed_df, requested).analysed_columns
    assert "category" not in analysed
    assert "score" in analysed
    assert "salary" in analysed
30
+
31
+
32
def test_analysed_columns_matches_columns_dict(normal_mixed_df):
    """``analysed_columns`` and the keys of ``columns`` name the same set."""
    outcome = NumericProfiler().profile(normal_mixed_df, ["score", "salary"])
    listed = set(outcome.analysed_columns)
    keyed = set(outcome.columns.keys())
    assert listed == keyed
35
+
36
+
37
+ # ---------------------------------------------------------------------------
38
+ # Core stats present for a normal float column
39
+ # ---------------------------------------------------------------------------
40
+
41
+
42
def test_core_stats_non_null_for_float(normal_mixed_df):
    """All core statistics of the ``score`` column are populated."""
    score_stats = NumericProfiler().profile(normal_mixed_df, ["score"]).columns["score"]
    core_fields = ("mean", "median", "std", "min", "max", "mean_median_ratio")
    for field_name in core_fields:
        assert getattr(score_stats, field_name) is not None
50
+
51
+
52
def test_min_lte_max(normal_mixed_df):
    """The reported minimum does not exceed the reported maximum."""
    outcome = NumericProfiler().profile(normal_mixed_df, ["score"])
    score_stats = outcome.columns["score"]
    assert score_stats.min <= score_stats.max
55
+
56
+
57
+ # ---------------------------------------------------------------------------
58
+ # All-null column
59
+ # ---------------------------------------------------------------------------
60
+
61
+
62
def test_all_null_column_no_crash(all_null_df):
    """An all-null column is still analysed, with its statistics left as None."""
    outcome = NumericProfiler().profile(all_null_df, ["float_col"])
    assert "float_col" in outcome.analysed_columns

    col_stats = outcome.columns["float_col"]
    assert isinstance(col_stats, NumericStats)
    for field_name in ("mean", "std", "min", "max"):
        assert getattr(col_stats, field_name) is None
71
+
72
+
73
+ # ---------------------------------------------------------------------------
74
+ # Single-value column
75
+ # ---------------------------------------------------------------------------
76
+
77
+
78
def test_single_value_std_and_skewness_zero(single_value_df):
    """A single-value column reports zero spread and zero skewness."""
    outcome = NumericProfiler().profile(single_value_df, ["score"])
    score_stats = outcome.columns["score"]
    assert score_stats.std == 0.0
    assert score_stats.skewness == 0.0
82
+
83
+
84
+ # ---------------------------------------------------------------------------
85
+ # ScaleAnomaly flag
86
+ # ---------------------------------------------------------------------------
87
+
88
+
89
def test_scale_anomaly_flag_set():
    """Values from 0.5 to 5000 (max/min ratio 10 000) raise ScaleAnomaly."""
    wide_range = [0.5, 1.0, 1.5, 2.0, 5000.0] * 12
    frame = pl.DataFrame({"v": pl.Series(wide_range, dtype=pl.Float64)})
    v_stats = NumericProfiler().profile(frame, ["v"]).columns["v"]
    assert NumericFlag.ScaleAnomaly in v_stats.flags
94
+
95
+
96
def test_scale_anomaly_flag_absent_normal_range():
    """Values from 10 to 50 (max/min ratio 5) do not raise ScaleAnomaly."""
    narrow_range = [10.0, 20.0, 30.0, 40.0, 50.0] * 12
    frame = pl.DataFrame({"v": pl.Series(narrow_range, dtype=pl.Float64)})
    v_stats = NumericProfiler().profile(frame, ["v"]).columns["v"]
    assert NumericFlag.ScaleAnomaly not in v_stats.flags
100
+
101
+
102
+ # ---------------------------------------------------------------------------
103
+ # NearConstant flag
104
+ # ---------------------------------------------------------------------------
105
+
106
+
107
def test_near_constant_flag_set():
    """A value filling 55 of 60 rows (about 0.917) raises NearConstant."""
    dominant = [5.0] * 55
    minority = [1.0, 2.0, 3.0, 4.0, 6.0]
    frame = pl.DataFrame({"v": pl.Series(dominant + minority, dtype=pl.Float64)})
    v_stats = NumericProfiler().profile(frame, ["v"]).columns["v"]
    assert NumericFlag.NearConstant in v_stats.flags
113
+
114
+
115
def test_near_constant_flag_absent():
    """Two values at 30 rows each (0.50 share) do not raise NearConstant."""
    balanced = [5.0] * 30 + [6.0] * 30
    frame = pl.DataFrame({"v": pl.Series(balanced, dtype=pl.Float64)})
    v_stats = NumericProfiler().profile(frame, ["v"]).columns["v"]
    assert NumericFlag.NearConstant not in v_stats.flags
121
+
122
+
123
+ # ---------------------------------------------------------------------------
124
+ # Skewness severity bands
125
+ # ---------------------------------------------------------------------------
126
+
127
+
128
def test_skewness_severity_normal():
    """The evenly spaced values 1..60 (symmetric) are classified as Normal."""
    evenly_spaced = [float(value) for value in range(1, 61)]
    frame = pl.DataFrame({"v": pl.Series(evenly_spaced, dtype=pl.Float64)})
    v_stats = NumericProfiler().profile(frame, ["v"]).columns["v"]
    assert v_stats.skewness_severity == SkewSeverity.Normal
134
+
135
+
136
def test_skewness_severity_severe():
    """57 near-zero values plus 3 extreme ones are classified as Severe."""
    bulk = [0.1] * 57
    extremes = [100.0, 200.0, 300.0]
    frame = pl.DataFrame({"v": pl.Series(bulk + extremes, dtype=pl.Float64)})
    v_stats = NumericProfiler().profile(frame, ["v"]).columns["v"]
    assert v_stats.skewness_severity == SkewSeverity.Severe
142
+
143
+
144
+ # ---------------------------------------------------------------------------
145
+ # Kurtosis tag bands
146
+ # ---------------------------------------------------------------------------
147
+
148
+
149
def test_kurtosis_tag_leptokurtic():
    """Mass piled at 5.0 with symmetric outliers is tagged Leptokurtic."""
    centre = [5.0] * 54
    outliers = [0.1, 0.1, 0.1, 9.9, 9.9, 9.9]
    frame = pl.DataFrame({"v": pl.Series(centre + outliers, dtype=pl.Float64)})
    v_stats = NumericProfiler().profile(frame, ["v"]).columns["v"]
    assert v_stats.kurtosis_tag == KurtosisTag.Leptokurtic
155
+
156
+
157
def test_kurtosis_tag_platykurtic():
    """Four equally weighted, equally spaced values are tagged Platykurtic."""
    flat = [1.0] * 15 + [4.0] * 15 + [7.0] * 15 + [10.0] * 15
    frame = pl.DataFrame({"v": pl.Series(flat, dtype=pl.Float64)})
    v_stats = NumericProfiler().profile(frame, ["v"]).columns["v"]
    assert v_stats.kurtosis_tag == KurtosisTag.Platykurtic
163
+
164
+
165
def test_kurtosis_tag_mesokurtic():
    """A symmetric bell-shaped discrete sample is tagged Mesokurtic."""
    bell = (
        [1.0] * 3
        + [2.0] * 7
        + [3.0] * 12
        + [4.0] * 16
        + [5.0] * 12
        + [6.0] * 7
        + [7.0] * 3
    )
    frame = pl.DataFrame({"v": pl.Series(bell, dtype=pl.Float64)})
    v_stats = NumericProfiler().profile(frame, ["v"]).columns["v"]
    assert v_stats.kurtosis_tag == KurtosisTag.Mesokurtic
171
+
172
+
173
+ # ---------------------------------------------------------------------------
174
+ # Percentiles
175
+ # ---------------------------------------------------------------------------
176
+
177
+
178
def test_percentiles_type_and_all_fields_present(normal_mixed_df):
    """``percentiles`` is a ``PercentileSnapshot`` with every field populated."""
    score_stats = NumericProfiler().profile(normal_mixed_df, ["score"]).columns["score"]
    snapshot = score_stats.percentiles
    assert isinstance(snapshot, PercentileSnapshot)

    levels = [
        snapshot.p1,
        snapshot.p5,
        snapshot.p25,
        snapshot.p50,
        snapshot.p75,
        snapshot.p95,
        snapshot.p99,
    ]
    for level in levels:
        assert level is not None
184
+
185
+
186
def test_percentiles_monotonically_non_decreasing(normal_mixed_df):
    """Percentiles p1..p99 are already in ascending order."""
    outcome = NumericProfiler().profile(normal_mixed_df, ["score"])
    snapshot = outcome.columns["score"].percentiles
    levels = [
        snapshot.p1,
        snapshot.p5,
        snapshot.p25,
        snapshot.p50,
        snapshot.p75,
        snapshot.p95,
        snapshot.p99,
    ]
    ascending = sorted(levels)
    assert levels == ascending
190
+
191
+
192
+ # ---------------------------------------------------------------------------
193
+ # Discrete vs continuous distribution representation
194
+ # ---------------------------------------------------------------------------
195
+
196
+
197
def test_integer_column_produces_top_values():
    """An Int64 column yields ``top_values`` and an empty ``histogram``."""
    cyclic = [idx % 5 for idx in range(60)]
    frame = pl.DataFrame({"v": pl.Series(cyclic, dtype=pl.Int64)})
    v_stats = NumericProfiler().profile(frame, ["v"]).columns["v"]
    assert len(v_stats.top_values) > 0
    assert len(v_stats.histogram) == 0
204
+
205
+
206
def test_continuous_float_produces_histogram():
    """60 distinct floats yield a ``histogram`` and empty ``top_values``."""
    distinct = [round(idx * 0.37, 4) for idx in range(60)]
    frame = pl.DataFrame({"v": pl.Series(distinct, dtype=pl.Float64)})
    v_stats = NumericProfiler().profile(frame, ["v"]).columns["v"]
    assert len(v_stats.histogram) > 0
    assert len(v_stats.top_values) == 0
@@ -0,0 +1,44 @@
1
+ import polars as pl
2
+ import pytest
3
+
4
+ from ....profiling._target_profiler import TargetProfiler
5
+ from ....profiling._target_config import TargetProblemType
6
+
7
+
8
+ # ---------------------------------------------------------------------------
9
+ # Regression: continuous float target
10
+ # ---------------------------------------------------------------------------
11
+
12
+
13
def test_regression_for_continuous_float_target():
    """A Float64 target with 50 distinct values is classified as Regression."""
    prices = pl.Series([float(idx) * 1.5 for idx in range(50)], dtype=pl.Float64)
    frame = pl.DataFrame({"price": prices})
    profiler = TargetProfiler(target_column="price")
    outcome = profiler.profile(frame)
    assert outcome.problem_type == TargetProblemType.Regression
19
+
20
+
21
+ # ---------------------------------------------------------------------------
22
+ # Classification: low-cardinality string target
23
+ # ---------------------------------------------------------------------------
24
+
25
+
26
def test_classification_for_low_cardinality_string_target():
    """A three-label string target resolves to a classification problem type."""
    labels = ["cat", "dog", "bird"] * 20
    frame = pl.DataFrame({"label": pl.Series(labels, dtype=pl.Utf8)})
    outcome = TargetProfiler(target_column="label").profile(frame)

    classification_types = (
        TargetProblemType.BinaryClassification,
        TargetProblemType.MulticlassClassification,
    )
    assert outcome.problem_type in classification_types
34
+
35
+
36
+ # ---------------------------------------------------------------------------
37
+ # Missing target column raises ValueError
38
+ # ---------------------------------------------------------------------------
39
+
40
+
41
def test_missing_target_column_raises_value_error():
    """Profiling a target column absent from the frame raises ValueError."""
    features_only = pl.Series([1, 2, 3], dtype=pl.Int64)
    frame = pl.DataFrame({"feature": features_only})
    # Construction stays inside the context: the error may come from either call.
    with pytest.raises(ValueError):
        TargetProfiler(target_column="nonexistent").profile(frame)
@@ -0,0 +1,61 @@
1
+ import polars as pl
2
+
3
+ from ....profiling._text_profiler import TextProfiler
4
+
5
+
6
+ # ---------------------------------------------------------------------------
7
+ # vocabulary_size > 0 for a column with distinct tokens
8
+ # ---------------------------------------------------------------------------
9
+
10
+
11
def test_vocabulary_size_positive_for_distinct_tokens():
    """A column with several distinct words reports a positive vocabulary size."""
    sentences = ["apple banana", "cherry", "date elderberry"]
    frame = pl.DataFrame({"text": pl.Series(sentences, dtype=pl.Utf8)})
    text_stats = TextProfiler().profile(frame, ["text"]).columns["text"]
    assert text_stats.vocabulary_size > 0
17
+
18
+
19
+ # ---------------------------------------------------------------------------
20
+ # char_length_max >= char_length_min for any non-empty text column
21
+ # ---------------------------------------------------------------------------
22
+
23
+
24
def test_char_length_max_gte_min():
    """The maximum character length is never below the minimum."""
    samples = ["hi", "hello world", "a"]
    frame = pl.DataFrame({"text": pl.Series(samples, dtype=pl.Utf8)})
    text_stats = TextProfiler().profile(frame, ["text"]).columns["text"]
    assert text_stats.char_length_max >= text_stats.char_length_min
30
+
31
+
32
+ # ---------------------------------------------------------------------------
33
+ # empty_ratio == 0.0 for no empty strings; == 1.0 for all-empty column
34
+ # ---------------------------------------------------------------------------
35
+
36
+
37
def test_empty_ratio_absent_and_full():
    """``empty_ratio`` is 0.0 without empty strings and 1.0 when all are empty."""
    scenarios = [
        (["hello", "world", "foo"], 0.0),
        (["", "", ""], 1.0),
    ]
    for strings, expected_ratio in scenarios:
        frame = pl.DataFrame({"text": pl.Series(strings, dtype=pl.Utf8)})
        text_stats = TextProfiler().profile(frame, ["text"]).columns["text"]
        assert text_stats.empty_ratio == expected_ratio
49
+
50
+
51
+ # ---------------------------------------------------------------------------
52
+ # avg_token_count > 0 for a column with multi-word entries
53
+ # ---------------------------------------------------------------------------
54
+
55
+
56
def test_avg_token_count_positive_for_multiword():
    """Multi-word entries produce a positive average token count."""
    phrases = ["the quick brown fox", "hello world", "one two three"]
    frame = pl.DataFrame({"text": pl.Series(phrases, dtype=pl.Utf8)})
    text_stats = TextProfiler().profile(frame, ["text"]).columns["text"]
    assert text_stats.avg_token_count > 0
@@ -0,0 +1,32 @@
1
+ import polars as pl
2
+
3
+ from ....profiling._type_detector import TypeDetector
4
+ from ....profiling.config import SemanticType
5
+
6
+
7
+ # ---------------------------------------------------------------------------
8
+ # Native pl.Boolean resolves to SemanticType.Boolean
9
+ # ---------------------------------------------------------------------------
10
+
11
+
12
def test_native_boolean_column_resolves_to_boolean():
    """A native ``pl.Boolean`` column is detected as ``SemanticType.Boolean``."""
    flags = pl.Series([True, False, True, True, False], dtype=pl.Boolean)
    frame = pl.DataFrame({"flag": flags})
    detected = TypeDetector(columns=["flag"]).detect(frame)
    assert detected["flag"].semantic_type == SemanticType.Boolean
18
+
19
+
20
+ # ---------------------------------------------------------------------------
21
+ # High-cardinality string column resolves to Categorical or Text (not Numeric)
22
+ # ---------------------------------------------------------------------------
23
+
24
+
25
def test_high_cardinality_string_not_numeric():
    """A string column with 40 distinct values over 80 rows is not Numeric.

    Each value occurs exactly twice, so half of the rows are unique values.
    """
    labels = ["item_" + str(idx % 40) for idx in range(80)]
    frame = pl.DataFrame({"name": pl.Series(labels, dtype=pl.Utf8)})
    detected = TypeDetector(columns=["name"]).detect(frame)["name"]

    string_like = (SemanticType.Categorical, SemanticType.Text)
    assert detected.semantic_type in string_like
    assert detected.semantic_type != SemanticType.Numeric
File without changes