dataforge-ml 0.1.0__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. {dataforge_ml-0.1.0 → dataforge_ml-0.3.0}/PKG-INFO +1 -1
  2. {dataforge_ml-0.1.0 → dataforge_ml-0.3.0}/pyproject.toml +2 -2
  3. dataforge_ml-0.3.0/src/dataforge_ml/__init__.py +21 -0
  4. {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/utils/data_loader.py +0 -5
  5. {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src}/dataforge_ml.egg-info/PKG-INFO +1 -1
  6. dataforge_ml-0.3.0/src/dataforge_ml.egg-info/SOURCES.txt +39 -0
  7. dataforge_ml-0.3.0/src/dataforge_ml.egg-info/top_level.txt +1 -0
  8. dataforge_ml-0.1.0/dataforge_ml.egg-info/SOURCES.txt +0 -57
  9. dataforge_ml-0.1.0/dataforge_ml.egg-info/top_level.txt +0 -6
  10. dataforge_ml-0.1.0/tests/conftest.py +0 -7
  11. dataforge_ml-0.1.0/tests/integration/__init__.py +0 -0
  12. dataforge_ml-0.1.0/tests/integration/conftest.py +0 -82
  13. dataforge_ml-0.1.0/tests/integration/test_structural_end_to_end.py +0 -219
  14. dataforge_ml-0.1.0/tests/unit/__init__.py +0 -0
  15. dataforge_ml-0.1.0/tests/unit/profiling/__init__.py +0 -0
  16. dataforge_ml-0.1.0/tests/unit/profiling/conftest.py +0 -81
  17. dataforge_ml-0.1.0/tests/unit/profiling/test_boolean_profiler.py +0 -91
  18. dataforge_ml-0.1.0/tests/unit/profiling/test_categorical_profiler.py +0 -182
  19. dataforge_ml-0.1.0/tests/unit/profiling/test_correlation_profiler.py +0 -124
  20. dataforge_ml-0.1.0/tests/unit/profiling/test_datetime_profiler.py +0 -133
  21. dataforge_ml-0.1.0/tests/unit/profiling/test_missingness_profiler.py +0 -51
  22. dataforge_ml-0.1.0/tests/unit/profiling/test_numeric_profiler.py +0 -212
  23. dataforge_ml-0.1.0/tests/unit/profiling/test_target_profiler.py +0 -44
  24. dataforge_ml-0.1.0/tests/unit/profiling/test_text_profiler.py +0 -61
  25. dataforge_ml-0.1.0/tests/unit/profiling/test_type_detector.py +0 -32
  26. dataforge_ml-0.1.0/tests/unit/splitting/__init__.py +0 -0
  27. dataforge_ml-0.1.0/tests/unit/splitting/test_data_splitter.py +0 -417
  28. dataforge_ml-0.1.0/utils/__init__.py +0 -0
  29. {dataforge_ml-0.1.0 → dataforge_ml-0.3.0}/LICENSE +0 -0
  30. {dataforge_ml-0.1.0 → dataforge_ml-0.3.0}/README.md +0 -0
  31. {dataforge_ml-0.1.0 → dataforge_ml-0.3.0}/setup.cfg +0 -0
  32. {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/models/__init__.py +0 -0
  33. {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/models/_data_structure.py +0 -0
  34. {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/models/_data_types.py +0 -0
  35. {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/profiling/__init__.py +0 -0
  36. {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/profiling/_base.py +0 -0
  37. {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/profiling/_boolean_config.py +0 -0
  38. {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/profiling/_boolean_profiler.py +0 -0
  39. {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/profiling/_categorical.py +0 -0
  40. {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/profiling/_categorical_config.py +0 -0
  41. {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/profiling/_correlation_config.py +0 -0
  42. {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/profiling/_correlation_profiler.py +0 -0
  43. {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/profiling/_datetime_config.py +0 -0
  44. {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/profiling/_datetime_profiler.py +0 -0
  45. {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/profiling/_missingness_config.py +0 -0
  46. {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/profiling/_missingness_profiler.py +0 -0
  47. {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/profiling/_numeric_config.py +0 -0
  48. {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/profiling/_numeric_profiler.py +0 -0
  49. {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/profiling/_tabular.py +0 -0
  50. {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/profiling/_target_config.py +0 -0
  51. {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/profiling/_target_profiler.py +0 -0
  52. {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/profiling/_text_config.py +0 -0
  53. {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/profiling/_text_profiler.py +0 -0
  54. {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/profiling/_type_detector.py +0 -0
  55. {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/profiling/config.py +0 -0
  56. {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/profiling/structural.py +0 -0
  57. {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/splitting/__init__.py +0 -0
  58. {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/splitting/_config.py +0 -0
  59. {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/splitting/_splitter.py +0 -0
  60. {dataforge_ml-0.1.0/tests → dataforge_ml-0.3.0/src/dataforge_ml/utils}/__init__.py +0 -0
  61. {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src}/dataforge_ml.egg-info/dependency_links.txt +0 -0
  62. {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src}/dataforge_ml.egg-info/requires.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dataforge-ml
3
- Version: 0.1.0
3
+ Version: 0.3.0
4
4
  Summary: A automated feature engineering and designing pipeline library
5
5
  License: MIT
6
6
  Classifier: License :: OSI Approved :: MIT License
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "dataforge-ml"
7
- version = "0.1.0"
7
+ version = "0.3.0"
8
8
  description = "A automated feature engineering and designing pipeline library"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -32,4 +32,4 @@ dev = [
32
32
  testpaths = ["tests"]
33
33
 
34
34
  [tool.setuptools.packages.find]
35
- where = ["."]
35
+ where = ["src"]
@@ -0,0 +1,21 @@
1
+ from .profiling.structural import StructuralProfiler
2
+ from .profiling.config import (
3
+ ProfileConfig,
4
+ SemanticType,
5
+ Modality,
6
+ StructuralProfileResult,
7
+ )
8
+ from .splitting import DataSplitter, SplitResult, FoldResult
9
+ from .utils.data_loader import DataLoader
10
+
11
+ __all__ = [
12
+ "StructuralProfiler",
13
+ "StructuralProfileResult",
14
+ "ProfileConfig",
15
+ "SemanticType",
16
+ "Modality",
17
+ "DataSplitter",
18
+ "SplitResult",
19
+ "FoldResult",
20
+ "DataLoader"
21
+ ]
@@ -103,8 +103,3 @@ class DataLoader:
103
103
 
104
104
  loader = _EXT_LOADERS[resolved_fmt]
105
105
  return loader(raw)
106
-
107
-
108
- def load(source: PathOrBuffer, fmt: str | None = None) -> pl.DataFrame:
109
- """Convenience wrapper — equivalent to ``DataLoader().load(source, fmt)``."""
110
- return DataLoader().load(source, fmt=fmt)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dataforge-ml
3
- Version: 0.1.0
3
+ Version: 0.3.0
4
4
  Summary: A automated feature engineering and designing pipeline library
5
5
  License: MIT
6
6
  Classifier: License :: OSI Approved :: MIT License
@@ -0,0 +1,39 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ src/dataforge_ml/__init__.py
5
+ src/dataforge_ml.egg-info/PKG-INFO
6
+ src/dataforge_ml.egg-info/SOURCES.txt
7
+ src/dataforge_ml.egg-info/dependency_links.txt
8
+ src/dataforge_ml.egg-info/requires.txt
9
+ src/dataforge_ml.egg-info/top_level.txt
10
+ src/dataforge_ml/models/__init__.py
11
+ src/dataforge_ml/models/_data_structure.py
12
+ src/dataforge_ml/models/_data_types.py
13
+ src/dataforge_ml/profiling/__init__.py
14
+ src/dataforge_ml/profiling/_base.py
15
+ src/dataforge_ml/profiling/_boolean_config.py
16
+ src/dataforge_ml/profiling/_boolean_profiler.py
17
+ src/dataforge_ml/profiling/_categorical.py
18
+ src/dataforge_ml/profiling/_categorical_config.py
19
+ src/dataforge_ml/profiling/_correlation_config.py
20
+ src/dataforge_ml/profiling/_correlation_profiler.py
21
+ src/dataforge_ml/profiling/_datetime_config.py
22
+ src/dataforge_ml/profiling/_datetime_profiler.py
23
+ src/dataforge_ml/profiling/_missingness_config.py
24
+ src/dataforge_ml/profiling/_missingness_profiler.py
25
+ src/dataforge_ml/profiling/_numeric_config.py
26
+ src/dataforge_ml/profiling/_numeric_profiler.py
27
+ src/dataforge_ml/profiling/_tabular.py
28
+ src/dataforge_ml/profiling/_target_config.py
29
+ src/dataforge_ml/profiling/_target_profiler.py
30
+ src/dataforge_ml/profiling/_text_config.py
31
+ src/dataforge_ml/profiling/_text_profiler.py
32
+ src/dataforge_ml/profiling/_type_detector.py
33
+ src/dataforge_ml/profiling/config.py
34
+ src/dataforge_ml/profiling/structural.py
35
+ src/dataforge_ml/splitting/__init__.py
36
+ src/dataforge_ml/splitting/_config.py
37
+ src/dataforge_ml/splitting/_splitter.py
38
+ src/dataforge_ml/utils/__init__.py
39
+ src/dataforge_ml/utils/data_loader.py
@@ -0,0 +1 @@
1
+ dataforge_ml
@@ -1,57 +0,0 @@
1
- LICENSE
2
- README.md
3
- pyproject.toml
4
- dataforge_ml.egg-info/PKG-INFO
5
- dataforge_ml.egg-info/SOURCES.txt
6
- dataforge_ml.egg-info/dependency_links.txt
7
- dataforge_ml.egg-info/requires.txt
8
- dataforge_ml.egg-info/top_level.txt
9
- models/__init__.py
10
- models/_data_structure.py
11
- models/_data_types.py
12
- profiling/__init__.py
13
- profiling/_base.py
14
- profiling/_boolean_config.py
15
- profiling/_boolean_profiler.py
16
- profiling/_categorical.py
17
- profiling/_categorical_config.py
18
- profiling/_correlation_config.py
19
- profiling/_correlation_profiler.py
20
- profiling/_datetime_config.py
21
- profiling/_datetime_profiler.py
22
- profiling/_missingness_config.py
23
- profiling/_missingness_profiler.py
24
- profiling/_numeric_config.py
25
- profiling/_numeric_profiler.py
26
- profiling/_tabular.py
27
- profiling/_target_config.py
28
- profiling/_target_profiler.py
29
- profiling/_text_config.py
30
- profiling/_text_profiler.py
31
- profiling/_type_detector.py
32
- profiling/config.py
33
- profiling/structural.py
34
- splitting/__init__.py
35
- splitting/_config.py
36
- splitting/_splitter.py
37
- tests/__init__.py
38
- tests/conftest.py
39
- tests/integration/__init__.py
40
- tests/integration/conftest.py
41
- tests/integration/test_structural_end_to_end.py
42
- tests/unit/__init__.py
43
- tests/unit/profiling/__init__.py
44
- tests/unit/profiling/conftest.py
45
- tests/unit/profiling/test_boolean_profiler.py
46
- tests/unit/profiling/test_categorical_profiler.py
47
- tests/unit/profiling/test_correlation_profiler.py
48
- tests/unit/profiling/test_datetime_profiler.py
49
- tests/unit/profiling/test_missingness_profiler.py
50
- tests/unit/profiling/test_numeric_profiler.py
51
- tests/unit/profiling/test_target_profiler.py
52
- tests/unit/profiling/test_text_profiler.py
53
- tests/unit/profiling/test_type_detector.py
54
- tests/unit/splitting/__init__.py
55
- tests/unit/splitting/test_data_splitter.py
56
- utils/__init__.py
57
- utils/data_loader.py
@@ -1,6 +0,0 @@
1
- dist
2
- models
3
- profiling
4
- splitting
5
- tests
6
- utils
@@ -1,7 +0,0 @@
1
- import numpy as np
2
- import pytest
3
-
4
-
5
- @pytest.fixture(scope="session")
6
- def rng():
7
- return np.random.default_rng(42)
File without changes
@@ -1,82 +0,0 @@
1
-
2
- import polars as pl
3
- import pytest
4
-
5
-
6
- @pytest.fixture(scope="session")
7
- def override_df():
8
- n = 60
9
- return pl.DataFrame(
10
- {
11
- "score": pl.Series([float(i) for i in range(n)], dtype=pl.Float64),
12
- "category": pl.Series(["A", "B", "C"] * (n // 3), dtype=pl.Utf8),
13
- }
14
- )
15
-
16
-
17
- @pytest.fixture(scope="session")
18
- def target_df(rng):
19
- n = 100
20
- features = rng.normal(0, 1, size=n).tolist()
21
- labels = ["pos", "neg"] * (n // 2)
22
- return pl.DataFrame(
23
- {
24
- "feature": pl.Series(features, dtype=pl.Float64),
25
- "label": pl.Series(labels, dtype=pl.Utf8),
26
- }
27
- )
28
-
29
-
30
- @pytest.fixture(scope="session")
31
- def empty_df():
32
- return pl.DataFrame(
33
- {
34
- "x": pl.Series([], dtype=pl.Float64),
35
- "y": pl.Series([], dtype=pl.Utf8),
36
- }
37
- )
38
-
39
-
40
- @pytest.fixture(scope="session")
41
- def text_df():
42
- n = 200
43
- topics = ["science", "art", "history", "technology", "nature", "music"]
44
- texts = [
45
- f"A detailed description covering the topic of {topics[i % len(topics)]} "
46
- f"with multiple words that comfortably exceed the free-text threshold in row {i}"
47
- for i in range(n)
48
- ]
49
- return pl.DataFrame({"review": pl.Series(texts, dtype=pl.Utf8)})
50
-
51
-
52
- @pytest.fixture(scope="session")
53
- def mixed_df(rng):
54
- n = 300
55
-
56
- age = rng.integers(18, 75, size=n)
57
- income = age * 1200 + rng.normal(0, 5000, size=n)
58
-
59
- salary = rng.normal(50_000, 15_000, size=n).tolist()
60
- null_mask = rng.random(n) < 0.10
61
- salary = [None if null_mask[i] else salary[i] for i in range(n)]
62
-
63
- country_choices = ["US", "UK", "CA", "AU", "DE"]
64
- country = [country_choices[i % len(country_choices)] for i in range(n)]
65
-
66
- names = [f"person_{i}" for i in range(n)]
67
-
68
- is_active = [bool(v) for v in rng.integers(0, 2, size=n)]
69
-
70
- from datetime import date, timedelta
71
- base = date(2020, 1, 1)
72
- joined = [base + timedelta(days=int(d)) for d in rng.integers(0, 1460, size=n)]
73
-
74
- return pl.DataFrame({
75
- "age": pl.Series(age.tolist(), dtype=pl.Int64),
76
- "income": pl.Series(income.tolist(), dtype=pl.Float64),
77
- "salary": pl.Series(salary, dtype=pl.Float64),
78
- "country": pl.Series(country, dtype=pl.Utf8),
79
- "name": pl.Series(names, dtype=pl.Utf8),
80
- "is_active": pl.Series(is_active, dtype=pl.Boolean),
81
- "joined": pl.Series(joined, dtype=pl.Date),
82
- })
@@ -1,219 +0,0 @@
1
- import pytest
2
- from ...profiling.structural import StructuralProfiler
3
- from ...profiling.config import (
4
- ProfileConfig,
5
- StructuralProfileResult,
6
- SemanticType,
7
- )
8
- from ...profiling._numeric_config import NumericStats
9
- from ...profiling._categorical_config import CategoricalStats
10
- from ...profiling._datetime_config import DatetimeStats
11
- from ...profiling._boolean_config import BooleanStats
12
- from ...profiling._text_config import TextStats
13
- from ...profiling._target_config import TargetProfileResult
14
-
15
-
16
- def test_happy_path(mixed_df):
17
- config = ProfileConfig(compute_correlation=True)
18
- result = StructuralProfiler(config).profile(mixed_df)
19
-
20
- assert isinstance(result, StructuralProfileResult)
21
- assert set(result.columns.keys()) == set(mixed_df.columns)
22
- for col_profile in result.columns.values():
23
- assert (
24
- col_profile.semantic_type is not None
25
- ), f"column '{col_profile.name}' has no semantic_type"
26
-
27
-
28
- assert result.dataset.row_count == mixed_df.height
29
- assert result.dataset.feature_correlation is not None
30
-
31
-
32
- def test_no_correlation(mixed_df):
33
- config = ProfileConfig(compute_correlation=False)
34
- result = StructuralProfiler(config).profile(mixed_df)
35
-
36
- assert result.dataset.feature_correlation is None
37
-
38
-
39
- def test_boolean_handoff(mixed_df):
40
- result = StructuralProfiler(ProfileConfig()).profile(mixed_df)
41
-
42
- cp = result.columns["is_active"]
43
- assert cp.semantic_type == SemanticType.Boolean
44
- assert cp.stats is not None
45
- assert isinstance(cp.stats, BooleanStats)
46
- assert cp.stats.mode in (True, False, None)
47
-
48
-
49
- def test_text_handoff(text_df):
50
- result = StructuralProfiler(ProfileConfig()).profile(text_df)
51
-
52
- cp = result.columns["review"]
53
- assert cp.semantic_type == SemanticType.Text
54
- assert cp.stats is not None
55
- assert isinstance(cp.stats, TextStats)
56
-
57
- assert cp.stats.vocabulary_size > 0
58
- assert cp.stats.char_length_max >= cp.stats.char_length_min
59
- assert cp.stats.avg_token_count > 0
60
- assert 0.0 <= cp.stats.empty_ratio <= 1.0
61
-
62
-
63
- def test_correlation_consistency(mixed_df):
64
- config = ProfileConfig(compute_correlation=True)
65
- result = StructuralProfiler(config).profile(mixed_df)
66
-
67
- fc = result.dataset.feature_correlation
68
- assert fc is not None
69
-
70
- # age and income are correlated by construction — forward invariant must not be vacuous
71
- assert len(fc.near_redundant_pairs) >= 1, (
72
- "expected at least one near-redundant pair (age/income are strongly correlated)"
73
- )
74
-
75
- # Forward invariant: every near_redundant pair must have both columns co-located
76
- # in the same NearRedundancyGroup
77
- for pair in fc.pairwise:
78
- if not pair.near_redundant:
79
- continue
80
- assert any(
81
- pair.col_a in group.columns and pair.col_b in group.columns
82
- for group in fc.near_redundancy_groups
83
- ), (
84
- f"near_redundant pair ({pair.col_a}, {pair.col_b}) "
85
- f"not co-located in any NearRedundancyGroup"
86
- )
87
-
88
- # Backward invariant: every column in a redundancy group must have at least
89
- # one near_redundant=True pair in pairwise
90
- for group in fc.near_redundancy_groups:
91
- for col in group.columns:
92
- assert any(
93
- (p.col_a == col or p.col_b == col) and p.near_redundant
94
- for p in fc.pairwise
95
- ), (
96
- f"column '{col}' is in a NearRedundancyGroup but has no "
97
- f"near_redundant=True pair in pairwise"
98
- )
99
-
100
- # Matrix symmetry — Pearson
101
- for col_a, row in fc.pearson_matrix.items():
102
- for col_b, val in row.items():
103
- mirror = fc.pearson_matrix.get(col_b, {}).get(col_a)
104
- assert mirror is not None and abs(val - mirror) < 1e-10, (
105
- f"Pearson matrix asymmetry: [{col_a}][{col_b}]={val} "
106
- f"vs [{col_b}][{col_a}]={mirror}"
107
- )
108
-
109
- # Matrix symmetry — Spearman
110
- for col_a, row in fc.spearman_matrix.items():
111
- for col_b, val in row.items():
112
- mirror = fc.spearman_matrix.get(col_b, {}).get(col_a)
113
- assert mirror is not None and abs(val - mirror) < 1e-10, (
114
- f"Spearman matrix asymmetry: [{col_a}][{col_b}]={val} "
115
- f"vs [{col_b}][{col_a}]={mirror}"
116
- )
117
-
118
- # Suggested drop is a strict subset of its group's columns
119
- for group in fc.near_redundancy_groups:
120
- group_cols = set(group.columns)
121
- drop_cols = set(group.suggested_drop)
122
- assert drop_cols < group_cols, (
123
- f"suggested_drop {drop_cols} is not a strict subset of "
124
- f"group columns {group_cols}"
125
- )
126
-
127
-
128
- def test_column_handoffs(mixed_df):
129
- result = StructuralProfiler(ProfileConfig()).profile(mixed_df)
130
-
131
- stats_type_for = {
132
- SemanticType.Numeric: NumericStats,
133
- SemanticType.Categorical: CategoricalStats,
134
- SemanticType.Datetime: DatetimeStats,
135
- SemanticType.Boolean: BooleanStats,
136
- }
137
-
138
- for name, cp in result.columns.items():
139
- expected_type = stats_type_for.get(cp.semantic_type)
140
- if expected_type is None:
141
- continue
142
-
143
- assert cp.stats is not None, (
144
- f"column '{name}' has semantic_type={cp.semantic_type} but stats is None"
145
- )
146
- assert isinstance(cp.stats, expected_type), (
147
- f"column '{name}' has semantic_type={cp.semantic_type} "
148
- f"but stats type is {type(cp.stats).__name__}, expected {expected_type.__name__}"
149
- )
150
-
151
-
152
- # ---------------------------------------------------------------------------
153
- # Override: numeric column forced to Categorical via column_overrides
154
- # ---------------------------------------------------------------------------
155
-
156
-
157
- def test_column_override_changes_stats_type(override_df):
158
- config = ProfileConfig(column_overrides={"score": SemanticType.Categorical})
159
- result = StructuralProfiler(config).profile(override_df)
160
- cp = result.columns["score"]
161
- assert isinstance(cp.stats, CategoricalStats)
162
-
163
-
164
- # ---------------------------------------------------------------------------
165
- # Target profiling integration
166
- # ---------------------------------------------------------------------------
167
-
168
-
169
- def test_target_profiling_integration(target_df):
170
- config = ProfileConfig(target_columns=["label"])
171
- result = StructuralProfiler(config).profile(target_df)
172
- assert "label" in result.targets
173
- assert isinstance(result.targets["label"], TargetProfileResult)
174
-
175
-
176
- # ---------------------------------------------------------------------------
177
- # Empty DataFrame does not crash
178
- # ---------------------------------------------------------------------------
179
-
180
-
181
- def test_empty_dataframe_does_not_crash(empty_df):
182
- result = StructuralProfiler(ProfileConfig()).profile(empty_df)
183
- assert isinstance(result, StructuralProfileResult)
184
-
185
-
186
- # ---------------------------------------------------------------------------
187
- # Numeric handoff: float column produces NumericStats on ColumnProfile
188
- # ---------------------------------------------------------------------------
189
-
190
-
191
- def test_numeric_handoff(mixed_df):
192
- result = StructuralProfiler(ProfileConfig()).profile(mixed_df)
193
- cp = result.columns["income"]
194
- assert cp.stats is not None
195
- assert isinstance(cp.stats, NumericStats)
196
-
197
-
198
- # ---------------------------------------------------------------------------
199
- # Datetime handoff: date column produces DatetimeStats on ColumnProfile
200
- # ---------------------------------------------------------------------------
201
-
202
-
203
- def test_datetime_handoff(mixed_df):
204
- result = StructuralProfiler(ProfileConfig()).profile(mixed_df)
205
- cp = result.columns["joined"]
206
- assert cp.stats is not None
207
- assert isinstance(cp.stats, DatetimeStats)
208
-
209
-
210
- # ---------------------------------------------------------------------------
211
- # Missingness surfaced at column level for columns with nulls
212
- # ---------------------------------------------------------------------------
213
-
214
-
215
- def test_missingness_surfaced(mixed_df):
216
- result = StructuralProfiler(ProfileConfig()).profile(mixed_df)
217
- cp = result.columns["salary"] # salary has ~10 % nulls by construction
218
- assert cp.missingness is not None
219
- assert cp.missingness.standard_null_count > 0
File without changes
File without changes
@@ -1,81 +0,0 @@
1
- from datetime import date, timedelta
2
-
3
- import polars as pl
4
- import pytest
5
-
6
- _BASE_DATE = date(2023, 1, 1)
7
- _N = 60
8
-
9
-
10
- @pytest.fixture(scope="session")
11
- def empty_df() -> pl.DataFrame:
12
- return pl.DataFrame(
13
- {
14
- "score": pl.Series([], dtype=pl.Float64),
15
- "count": pl.Series([], dtype=pl.Int64),
16
- "category": pl.Series([], dtype=pl.Utf8),
17
- "active": pl.Series([], dtype=pl.Boolean),
18
- "event_date": pl.Series([], dtype=pl.Date),
19
- }
20
- )
21
-
22
-
23
- @pytest.fixture(scope="session")
24
- def all_null_df() -> pl.DataFrame:
25
- nulls = [None] * _N
26
- return pl.DataFrame(
27
- {
28
- "float_col": pl.Series(nulls, dtype=pl.Float64),
29
- "int_col": pl.Series(nulls, dtype=pl.Int64),
30
- "str_col": pl.Series(nulls, dtype=pl.Utf8),
31
- "bool_col": pl.Series(nulls, dtype=pl.Boolean),
32
- }
33
- )
34
-
35
-
36
- @pytest.fixture(scope="session")
37
- def single_value_df() -> pl.DataFrame:
38
- return pl.DataFrame(
39
- {
40
- "score": pl.Series([5.0] * _N, dtype=pl.Float64),
41
- "count": pl.Series([1] * _N, dtype=pl.Int64),
42
- "category": pl.Series(["X"] * _N, dtype=pl.Utf8),
43
- "active": pl.Series([True] * _N, dtype=pl.Boolean),
44
- }
45
- )
46
-
47
-
48
- @pytest.fixture(scope="session")
49
- def single_row_df() -> pl.DataFrame:
50
- return pl.DataFrame(
51
- {
52
- "score": pl.Series([42.0], dtype=pl.Float64),
53
- "count": pl.Series([7], dtype=pl.Int64),
54
- "category": pl.Series(["A"], dtype=pl.Utf8),
55
- "active": pl.Series([True], dtype=pl.Boolean),
56
- "event_date": pl.Series([_BASE_DATE], dtype=pl.Date),
57
- }
58
- )
59
-
60
-
61
- @pytest.fixture(scope="session")
62
- def normal_mixed_df() -> pl.DataFrame:
63
- _CATEGORIES = ["A", "B", "C", "D", "E"]
64
-
65
- scores = [round(1.5 + i * 1.7 + (i % 7) * 0.3, 2) for i in range(_N)]
66
- counts = [i % 20 for i in range(_N)]
67
- categories = [_CATEGORIES[i % len(_CATEGORIES)] for i in range(_N)]
68
- active = [i % 2 == 0 for i in range(_N)]
69
- dates = [_BASE_DATE + timedelta(days=i) for i in range(_N)]
70
- salary = [None if i % 10 == 0 else round(30_000.0 + i * 500.0, 2) for i in range(_N)]
71
-
72
- return pl.DataFrame(
73
- {
74
- "score": pl.Series(scores, dtype=pl.Float64),
75
- "count": pl.Series(counts, dtype=pl.Int64),
76
- "category": pl.Series(categories, dtype=pl.Utf8),
77
- "active": pl.Series(active, dtype=pl.Boolean),
78
- "event_date": pl.Series(dates, dtype=pl.Date),
79
- "salary": pl.Series(salary, dtype=pl.Float64),
80
- }
81
- )
@@ -1,91 +0,0 @@
1
- import polars as pl
2
-
3
- from ....profiling._boolean_profiler import BooleanProfiler
4
- from ....profiling._boolean_config import BooleanProfileResult, BooleanStats
5
-
6
-
7
- # ---------------------------------------------------------------------------
8
- # Result type & analysed_columns
9
- # ---------------------------------------------------------------------------
10
-
11
-
12
- def test_result_type_and_analysed_columns():
13
- df = pl.DataFrame(
14
- {
15
- "flag": pl.Series([True, False, True], dtype=pl.Boolean),
16
- "score": pl.Series([1.0, 2.0, 3.0], dtype=pl.Float64),
17
- }
18
- )
19
- result = BooleanProfiler().profile(df, ["flag", "score"])
20
- assert isinstance(result, BooleanProfileResult)
21
- assert "flag" in result.analysed_columns
22
- assert "score" not in result.analysed_columns
23
-
24
-
25
- # ---------------------------------------------------------------------------
26
- # Counts
27
- # ---------------------------------------------------------------------------
28
-
29
-
30
- def test_true_false_count_sum_equals_non_null_count():
31
- values = [True, False, True, None, True, False, None]
32
- df = pl.DataFrame({"flag": pl.Series(values, dtype=pl.Boolean)})
33
- stats = BooleanProfiler().profile(df, ["flag"]).columns["flag"]
34
- non_null_count = df["flag"].drop_nulls().len()
35
- assert stats.true_count + stats.false_count == non_null_count
36
-
37
-
38
- # ---------------------------------------------------------------------------
39
- # Ratios
40
- # ---------------------------------------------------------------------------
41
-
42
-
43
- def test_true_ratio_plus_false_ratio_equals_one():
44
- values = [True, True, False, True, False, True]
45
- df = pl.DataFrame({"flag": pl.Series(values, dtype=pl.Boolean)})
46
- stats = BooleanProfiler().profile(df, ["flag"]).columns["flag"]
47
- assert abs(stats.true_ratio + stats.false_ratio - 1.0) < 1e-10
48
-
49
-
50
- # ---------------------------------------------------------------------------
51
- # Mode
52
- # ---------------------------------------------------------------------------
53
-
54
-
55
- def test_tied_column_mode_is_none():
56
- values = [True] * 5 + [False] * 5
57
- df = pl.DataFrame({"flag": pl.Series(values, dtype=pl.Boolean)})
58
- stats = BooleanProfiler().profile(df, ["flag"]).columns["flag"]
59
- assert stats.mode is None
60
-
61
-
62
- # ---------------------------------------------------------------------------
63
- # Integer {0, 1} columns
64
- # ---------------------------------------------------------------------------
65
-
66
-
67
- def test_integer_01_eligible_with_correct_counts_and_ratios():
68
- values = [1, 0, 1, 1, 0, None]
69
- df = pl.DataFrame({"bin": pl.Series(values, dtype=pl.Int64)})
70
- result = BooleanProfiler().profile(df, ["bin"])
71
- assert "bin" in result.analysed_columns
72
- stats = result.columns["bin"]
73
- non_null = [v for v in values if v is not None]
74
- expected_true = sum(non_null)
75
- expected_false = len(non_null) - expected_true
76
- assert stats.true_count == expected_true
77
- assert stats.false_count == expected_false
78
- assert abs(stats.true_ratio + stats.false_ratio - 1.0) < 1e-10
79
-
80
-
81
- # ---------------------------------------------------------------------------
82
- # All-null boolean column
83
- # ---------------------------------------------------------------------------
84
-
85
-
86
- def test_all_null_boolean_returns_default_stats_without_crashing():
87
- df = pl.DataFrame({"flag": pl.Series([None, None, None], dtype=pl.Boolean)})
88
- stats = BooleanProfiler().profile(df, ["flag"]).columns["flag"]
89
- assert isinstance(stats, BooleanStats)
90
- assert stats.true_count == 0
91
- assert stats.false_count == 0