dataforge-ml 0.1.0__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dataforge_ml-0.1.0 → dataforge_ml-0.3.0}/PKG-INFO +1 -1
- {dataforge_ml-0.1.0 → dataforge_ml-0.3.0}/pyproject.toml +2 -2
- dataforge_ml-0.3.0/src/dataforge_ml/__init__.py +21 -0
- {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/utils/data_loader.py +0 -5
- {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src}/dataforge_ml.egg-info/PKG-INFO +1 -1
- dataforge_ml-0.3.0/src/dataforge_ml.egg-info/SOURCES.txt +39 -0
- dataforge_ml-0.3.0/src/dataforge_ml.egg-info/top_level.txt +1 -0
- dataforge_ml-0.1.0/dataforge_ml.egg-info/SOURCES.txt +0 -57
- dataforge_ml-0.1.0/dataforge_ml.egg-info/top_level.txt +0 -6
- dataforge_ml-0.1.0/tests/conftest.py +0 -7
- dataforge_ml-0.1.0/tests/integration/__init__.py +0 -0
- dataforge_ml-0.1.0/tests/integration/conftest.py +0 -82
- dataforge_ml-0.1.0/tests/integration/test_structural_end_to_end.py +0 -219
- dataforge_ml-0.1.0/tests/unit/__init__.py +0 -0
- dataforge_ml-0.1.0/tests/unit/profiling/__init__.py +0 -0
- dataforge_ml-0.1.0/tests/unit/profiling/conftest.py +0 -81
- dataforge_ml-0.1.0/tests/unit/profiling/test_boolean_profiler.py +0 -91
- dataforge_ml-0.1.0/tests/unit/profiling/test_categorical_profiler.py +0 -182
- dataforge_ml-0.1.0/tests/unit/profiling/test_correlation_profiler.py +0 -124
- dataforge_ml-0.1.0/tests/unit/profiling/test_datetime_profiler.py +0 -133
- dataforge_ml-0.1.0/tests/unit/profiling/test_missingness_profiler.py +0 -51
- dataforge_ml-0.1.0/tests/unit/profiling/test_numeric_profiler.py +0 -212
- dataforge_ml-0.1.0/tests/unit/profiling/test_target_profiler.py +0 -44
- dataforge_ml-0.1.0/tests/unit/profiling/test_text_profiler.py +0 -61
- dataforge_ml-0.1.0/tests/unit/profiling/test_type_detector.py +0 -32
- dataforge_ml-0.1.0/tests/unit/splitting/__init__.py +0 -0
- dataforge_ml-0.1.0/tests/unit/splitting/test_data_splitter.py +0 -417
- dataforge_ml-0.1.0/utils/__init__.py +0 -0
- {dataforge_ml-0.1.0 → dataforge_ml-0.3.0}/LICENSE +0 -0
- {dataforge_ml-0.1.0 → dataforge_ml-0.3.0}/README.md +0 -0
- {dataforge_ml-0.1.0 → dataforge_ml-0.3.0}/setup.cfg +0 -0
- {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/models/__init__.py +0 -0
- {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/models/_data_structure.py +0 -0
- {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/models/_data_types.py +0 -0
- {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/profiling/__init__.py +0 -0
- {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/profiling/_base.py +0 -0
- {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/profiling/_boolean_config.py +0 -0
- {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/profiling/_boolean_profiler.py +0 -0
- {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/profiling/_categorical.py +0 -0
- {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/profiling/_categorical_config.py +0 -0
- {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/profiling/_correlation_config.py +0 -0
- {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/profiling/_correlation_profiler.py +0 -0
- {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/profiling/_datetime_config.py +0 -0
- {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/profiling/_datetime_profiler.py +0 -0
- {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/profiling/_missingness_config.py +0 -0
- {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/profiling/_missingness_profiler.py +0 -0
- {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/profiling/_numeric_config.py +0 -0
- {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/profiling/_numeric_profiler.py +0 -0
- {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/profiling/_tabular.py +0 -0
- {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/profiling/_target_config.py +0 -0
- {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/profiling/_target_profiler.py +0 -0
- {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/profiling/_text_config.py +0 -0
- {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/profiling/_text_profiler.py +0 -0
- {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/profiling/_type_detector.py +0 -0
- {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/profiling/config.py +0 -0
- {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/profiling/structural.py +0 -0
- {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/splitting/__init__.py +0 -0
- {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/splitting/_config.py +0 -0
- {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src/dataforge_ml}/splitting/_splitter.py +0 -0
- {dataforge_ml-0.1.0/tests → dataforge_ml-0.3.0/src/dataforge_ml/utils}/__init__.py +0 -0
- {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src}/dataforge_ml.egg-info/dependency_links.txt +0 -0
- {dataforge_ml-0.1.0 → dataforge_ml-0.3.0/src}/dataforge_ml.egg-info/requires.txt +0 -0
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "dataforge-ml"
|
|
7
|
-
version = "0.1.0"
|
|
7
|
+
version = "0.3.0"
|
|
8
8
|
description = "A automated feature engineering and designing pipeline library"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.10"
|
|
@@ -32,4 +32,4 @@ dev = [
|
|
|
32
32
|
testpaths = ["tests"]
|
|
33
33
|
|
|
34
34
|
[tool.setuptools.packages.find]
|
|
35
|
-
where = ["
|
|
35
|
+
where = ["src"]
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
from .profiling.structural import StructuralProfiler
|
|
2
|
+
from .profiling.config import (
|
|
3
|
+
ProfileConfig,
|
|
4
|
+
SemanticType,
|
|
5
|
+
Modality,
|
|
6
|
+
StructuralProfileResult,
|
|
7
|
+
)
|
|
8
|
+
from .splitting import DataSplitter, SplitResult, FoldResult
|
|
9
|
+
from .utils.data_loader import DataLoader
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
"StructuralProfiler",
|
|
13
|
+
"StructuralProfileResult",
|
|
14
|
+
"ProfileConfig",
|
|
15
|
+
"SemanticType",
|
|
16
|
+
"Modality",
|
|
17
|
+
"DataSplitter",
|
|
18
|
+
"SplitResult",
|
|
19
|
+
"FoldResult",
|
|
20
|
+
"DataLoader"
|
|
21
|
+
]
|
|
@@ -103,8 +103,3 @@ class DataLoader:
|
|
|
103
103
|
|
|
104
104
|
loader = _EXT_LOADERS[resolved_fmt]
|
|
105
105
|
return loader(raw)
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
def load(source: PathOrBuffer, fmt: str | None = None) -> pl.DataFrame:
|
|
109
|
-
"""Convenience wrapper — equivalent to ``DataLoader().load(source, fmt)``."""
|
|
110
|
-
return DataLoader().load(source, fmt=fmt)
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
src/dataforge_ml/__init__.py
|
|
5
|
+
src/dataforge_ml.egg-info/PKG-INFO
|
|
6
|
+
src/dataforge_ml.egg-info/SOURCES.txt
|
|
7
|
+
src/dataforge_ml.egg-info/dependency_links.txt
|
|
8
|
+
src/dataforge_ml.egg-info/requires.txt
|
|
9
|
+
src/dataforge_ml.egg-info/top_level.txt
|
|
10
|
+
src/dataforge_ml/models/__init__.py
|
|
11
|
+
src/dataforge_ml/models/_data_structure.py
|
|
12
|
+
src/dataforge_ml/models/_data_types.py
|
|
13
|
+
src/dataforge_ml/profiling/__init__.py
|
|
14
|
+
src/dataforge_ml/profiling/_base.py
|
|
15
|
+
src/dataforge_ml/profiling/_boolean_config.py
|
|
16
|
+
src/dataforge_ml/profiling/_boolean_profiler.py
|
|
17
|
+
src/dataforge_ml/profiling/_categorical.py
|
|
18
|
+
src/dataforge_ml/profiling/_categorical_config.py
|
|
19
|
+
src/dataforge_ml/profiling/_correlation_config.py
|
|
20
|
+
src/dataforge_ml/profiling/_correlation_profiler.py
|
|
21
|
+
src/dataforge_ml/profiling/_datetime_config.py
|
|
22
|
+
src/dataforge_ml/profiling/_datetime_profiler.py
|
|
23
|
+
src/dataforge_ml/profiling/_missingness_config.py
|
|
24
|
+
src/dataforge_ml/profiling/_missingness_profiler.py
|
|
25
|
+
src/dataforge_ml/profiling/_numeric_config.py
|
|
26
|
+
src/dataforge_ml/profiling/_numeric_profiler.py
|
|
27
|
+
src/dataforge_ml/profiling/_tabular.py
|
|
28
|
+
src/dataforge_ml/profiling/_target_config.py
|
|
29
|
+
src/dataforge_ml/profiling/_target_profiler.py
|
|
30
|
+
src/dataforge_ml/profiling/_text_config.py
|
|
31
|
+
src/dataforge_ml/profiling/_text_profiler.py
|
|
32
|
+
src/dataforge_ml/profiling/_type_detector.py
|
|
33
|
+
src/dataforge_ml/profiling/config.py
|
|
34
|
+
src/dataforge_ml/profiling/structural.py
|
|
35
|
+
src/dataforge_ml/splitting/__init__.py
|
|
36
|
+
src/dataforge_ml/splitting/_config.py
|
|
37
|
+
src/dataforge_ml/splitting/_splitter.py
|
|
38
|
+
src/dataforge_ml/utils/__init__.py
|
|
39
|
+
src/dataforge_ml/utils/data_loader.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
dataforge_ml
|
|
@@ -1,57 +0,0 @@
|
|
|
1
|
-
LICENSE
|
|
2
|
-
README.md
|
|
3
|
-
pyproject.toml
|
|
4
|
-
dataforge_ml.egg-info/PKG-INFO
|
|
5
|
-
dataforge_ml.egg-info/SOURCES.txt
|
|
6
|
-
dataforge_ml.egg-info/dependency_links.txt
|
|
7
|
-
dataforge_ml.egg-info/requires.txt
|
|
8
|
-
dataforge_ml.egg-info/top_level.txt
|
|
9
|
-
models/__init__.py
|
|
10
|
-
models/_data_structure.py
|
|
11
|
-
models/_data_types.py
|
|
12
|
-
profiling/__init__.py
|
|
13
|
-
profiling/_base.py
|
|
14
|
-
profiling/_boolean_config.py
|
|
15
|
-
profiling/_boolean_profiler.py
|
|
16
|
-
profiling/_categorical.py
|
|
17
|
-
profiling/_categorical_config.py
|
|
18
|
-
profiling/_correlation_config.py
|
|
19
|
-
profiling/_correlation_profiler.py
|
|
20
|
-
profiling/_datetime_config.py
|
|
21
|
-
profiling/_datetime_profiler.py
|
|
22
|
-
profiling/_missingness_config.py
|
|
23
|
-
profiling/_missingness_profiler.py
|
|
24
|
-
profiling/_numeric_config.py
|
|
25
|
-
profiling/_numeric_profiler.py
|
|
26
|
-
profiling/_tabular.py
|
|
27
|
-
profiling/_target_config.py
|
|
28
|
-
profiling/_target_profiler.py
|
|
29
|
-
profiling/_text_config.py
|
|
30
|
-
profiling/_text_profiler.py
|
|
31
|
-
profiling/_type_detector.py
|
|
32
|
-
profiling/config.py
|
|
33
|
-
profiling/structural.py
|
|
34
|
-
splitting/__init__.py
|
|
35
|
-
splitting/_config.py
|
|
36
|
-
splitting/_splitter.py
|
|
37
|
-
tests/__init__.py
|
|
38
|
-
tests/conftest.py
|
|
39
|
-
tests/integration/__init__.py
|
|
40
|
-
tests/integration/conftest.py
|
|
41
|
-
tests/integration/test_structural_end_to_end.py
|
|
42
|
-
tests/unit/__init__.py
|
|
43
|
-
tests/unit/profiling/__init__.py
|
|
44
|
-
tests/unit/profiling/conftest.py
|
|
45
|
-
tests/unit/profiling/test_boolean_profiler.py
|
|
46
|
-
tests/unit/profiling/test_categorical_profiler.py
|
|
47
|
-
tests/unit/profiling/test_correlation_profiler.py
|
|
48
|
-
tests/unit/profiling/test_datetime_profiler.py
|
|
49
|
-
tests/unit/profiling/test_missingness_profiler.py
|
|
50
|
-
tests/unit/profiling/test_numeric_profiler.py
|
|
51
|
-
tests/unit/profiling/test_target_profiler.py
|
|
52
|
-
tests/unit/profiling/test_text_profiler.py
|
|
53
|
-
tests/unit/profiling/test_type_detector.py
|
|
54
|
-
tests/unit/splitting/__init__.py
|
|
55
|
-
tests/unit/splitting/test_data_splitter.py
|
|
56
|
-
utils/__init__.py
|
|
57
|
-
utils/data_loader.py
|
|
File without changes
|
|
@@ -1,82 +0,0 @@
|
|
|
1
|
-
|
|
2
|
-
import polars as pl
|
|
3
|
-
import pytest
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
@pytest.fixture(scope="session")
|
|
7
|
-
def override_df():
|
|
8
|
-
n = 60
|
|
9
|
-
return pl.DataFrame(
|
|
10
|
-
{
|
|
11
|
-
"score": pl.Series([float(i) for i in range(n)], dtype=pl.Float64),
|
|
12
|
-
"category": pl.Series(["A", "B", "C"] * (n // 3), dtype=pl.Utf8),
|
|
13
|
-
}
|
|
14
|
-
)
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
@pytest.fixture(scope="session")
|
|
18
|
-
def target_df(rng):
|
|
19
|
-
n = 100
|
|
20
|
-
features = rng.normal(0, 1, size=n).tolist()
|
|
21
|
-
labels = ["pos", "neg"] * (n // 2)
|
|
22
|
-
return pl.DataFrame(
|
|
23
|
-
{
|
|
24
|
-
"feature": pl.Series(features, dtype=pl.Float64),
|
|
25
|
-
"label": pl.Series(labels, dtype=pl.Utf8),
|
|
26
|
-
}
|
|
27
|
-
)
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
@pytest.fixture(scope="session")
|
|
31
|
-
def empty_df():
|
|
32
|
-
return pl.DataFrame(
|
|
33
|
-
{
|
|
34
|
-
"x": pl.Series([], dtype=pl.Float64),
|
|
35
|
-
"y": pl.Series([], dtype=pl.Utf8),
|
|
36
|
-
}
|
|
37
|
-
)
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
@pytest.fixture(scope="session")
|
|
41
|
-
def text_df():
|
|
42
|
-
n = 200
|
|
43
|
-
topics = ["science", "art", "history", "technology", "nature", "music"]
|
|
44
|
-
texts = [
|
|
45
|
-
f"A detailed description covering the topic of {topics[i % len(topics)]} "
|
|
46
|
-
f"with multiple words that comfortably exceed the free-text threshold in row {i}"
|
|
47
|
-
for i in range(n)
|
|
48
|
-
]
|
|
49
|
-
return pl.DataFrame({"review": pl.Series(texts, dtype=pl.Utf8)})
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
@pytest.fixture(scope="session")
|
|
53
|
-
def mixed_df(rng):
|
|
54
|
-
n = 300
|
|
55
|
-
|
|
56
|
-
age = rng.integers(18, 75, size=n)
|
|
57
|
-
income = age * 1200 + rng.normal(0, 5000, size=n)
|
|
58
|
-
|
|
59
|
-
salary = rng.normal(50_000, 15_000, size=n).tolist()
|
|
60
|
-
null_mask = rng.random(n) < 0.10
|
|
61
|
-
salary = [None if null_mask[i] else salary[i] for i in range(n)]
|
|
62
|
-
|
|
63
|
-
country_choices = ["US", "UK", "CA", "AU", "DE"]
|
|
64
|
-
country = [country_choices[i % len(country_choices)] for i in range(n)]
|
|
65
|
-
|
|
66
|
-
names = [f"person_{i}" for i in range(n)]
|
|
67
|
-
|
|
68
|
-
is_active = [bool(v) for v in rng.integers(0, 2, size=n)]
|
|
69
|
-
|
|
70
|
-
from datetime import date, timedelta
|
|
71
|
-
base = date(2020, 1, 1)
|
|
72
|
-
joined = [base + timedelta(days=int(d)) for d in rng.integers(0, 1460, size=n)]
|
|
73
|
-
|
|
74
|
-
return pl.DataFrame({
|
|
75
|
-
"age": pl.Series(age.tolist(), dtype=pl.Int64),
|
|
76
|
-
"income": pl.Series(income.tolist(), dtype=pl.Float64),
|
|
77
|
-
"salary": pl.Series(salary, dtype=pl.Float64),
|
|
78
|
-
"country": pl.Series(country, dtype=pl.Utf8),
|
|
79
|
-
"name": pl.Series(names, dtype=pl.Utf8),
|
|
80
|
-
"is_active": pl.Series(is_active, dtype=pl.Boolean),
|
|
81
|
-
"joined": pl.Series(joined, dtype=pl.Date),
|
|
82
|
-
})
|
|
@@ -1,219 +0,0 @@
|
|
|
1
|
-
import pytest
|
|
2
|
-
from ...profiling.structural import StructuralProfiler
|
|
3
|
-
from ...profiling.config import (
|
|
4
|
-
ProfileConfig,
|
|
5
|
-
StructuralProfileResult,
|
|
6
|
-
SemanticType,
|
|
7
|
-
)
|
|
8
|
-
from ...profiling._numeric_config import NumericStats
|
|
9
|
-
from ...profiling._categorical_config import CategoricalStats
|
|
10
|
-
from ...profiling._datetime_config import DatetimeStats
|
|
11
|
-
from ...profiling._boolean_config import BooleanStats
|
|
12
|
-
from ...profiling._text_config import TextStats
|
|
13
|
-
from ...profiling._target_config import TargetProfileResult
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
def test_happy_path(mixed_df):
|
|
17
|
-
config = ProfileConfig(compute_correlation=True)
|
|
18
|
-
result = StructuralProfiler(config).profile(mixed_df)
|
|
19
|
-
|
|
20
|
-
assert isinstance(result, StructuralProfileResult)
|
|
21
|
-
assert set(result.columns.keys()) == set(mixed_df.columns)
|
|
22
|
-
for col_profile in result.columns.values():
|
|
23
|
-
assert (
|
|
24
|
-
col_profile.semantic_type is not None
|
|
25
|
-
), f"column '{col_profile.name}' has no semantic_type"
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
assert result.dataset.row_count == mixed_df.height
|
|
29
|
-
assert result.dataset.feature_correlation is not None
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
def test_no_correlation(mixed_df):
|
|
33
|
-
config = ProfileConfig(compute_correlation=False)
|
|
34
|
-
result = StructuralProfiler(config).profile(mixed_df)
|
|
35
|
-
|
|
36
|
-
assert result.dataset.feature_correlation is None
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
def test_boolean_handoff(mixed_df):
|
|
40
|
-
result = StructuralProfiler(ProfileConfig()).profile(mixed_df)
|
|
41
|
-
|
|
42
|
-
cp = result.columns["is_active"]
|
|
43
|
-
assert cp.semantic_type == SemanticType.Boolean
|
|
44
|
-
assert cp.stats is not None
|
|
45
|
-
assert isinstance(cp.stats, BooleanStats)
|
|
46
|
-
assert cp.stats.mode in (True, False, None)
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
def test_text_handoff(text_df):
|
|
50
|
-
result = StructuralProfiler(ProfileConfig()).profile(text_df)
|
|
51
|
-
|
|
52
|
-
cp = result.columns["review"]
|
|
53
|
-
assert cp.semantic_type == SemanticType.Text
|
|
54
|
-
assert cp.stats is not None
|
|
55
|
-
assert isinstance(cp.stats, TextStats)
|
|
56
|
-
|
|
57
|
-
assert cp.stats.vocabulary_size > 0
|
|
58
|
-
assert cp.stats.char_length_max >= cp.stats.char_length_min
|
|
59
|
-
assert cp.stats.avg_token_count > 0
|
|
60
|
-
assert 0.0 <= cp.stats.empty_ratio <= 1.0
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
def test_correlation_consistency(mixed_df):
|
|
64
|
-
config = ProfileConfig(compute_correlation=True)
|
|
65
|
-
result = StructuralProfiler(config).profile(mixed_df)
|
|
66
|
-
|
|
67
|
-
fc = result.dataset.feature_correlation
|
|
68
|
-
assert fc is not None
|
|
69
|
-
|
|
70
|
-
# age and income are correlated by construction — forward invariant must not be vacuous
|
|
71
|
-
assert len(fc.near_redundant_pairs) >= 1, (
|
|
72
|
-
"expected at least one near-redundant pair (age/income are strongly correlated)"
|
|
73
|
-
)
|
|
74
|
-
|
|
75
|
-
# Forward invariant: every near_redundant pair must have both columns co-located
|
|
76
|
-
# in the same NearRedundancyGroup
|
|
77
|
-
for pair in fc.pairwise:
|
|
78
|
-
if not pair.near_redundant:
|
|
79
|
-
continue
|
|
80
|
-
assert any(
|
|
81
|
-
pair.col_a in group.columns and pair.col_b in group.columns
|
|
82
|
-
for group in fc.near_redundancy_groups
|
|
83
|
-
), (
|
|
84
|
-
f"near_redundant pair ({pair.col_a}, {pair.col_b}) "
|
|
85
|
-
f"not co-located in any NearRedundancyGroup"
|
|
86
|
-
)
|
|
87
|
-
|
|
88
|
-
# Backward invariant: every column in a redundancy group must have at least
|
|
89
|
-
# one near_redundant=True pair in pairwise
|
|
90
|
-
for group in fc.near_redundancy_groups:
|
|
91
|
-
for col in group.columns:
|
|
92
|
-
assert any(
|
|
93
|
-
(p.col_a == col or p.col_b == col) and p.near_redundant
|
|
94
|
-
for p in fc.pairwise
|
|
95
|
-
), (
|
|
96
|
-
f"column '{col}' is in a NearRedundancyGroup but has no "
|
|
97
|
-
f"near_redundant=True pair in pairwise"
|
|
98
|
-
)
|
|
99
|
-
|
|
100
|
-
# Matrix symmetry — Pearson
|
|
101
|
-
for col_a, row in fc.pearson_matrix.items():
|
|
102
|
-
for col_b, val in row.items():
|
|
103
|
-
mirror = fc.pearson_matrix.get(col_b, {}).get(col_a)
|
|
104
|
-
assert mirror is not None and abs(val - mirror) < 1e-10, (
|
|
105
|
-
f"Pearson matrix asymmetry: [{col_a}][{col_b}]={val} "
|
|
106
|
-
f"vs [{col_b}][{col_a}]={mirror}"
|
|
107
|
-
)
|
|
108
|
-
|
|
109
|
-
# Matrix symmetry — Spearman
|
|
110
|
-
for col_a, row in fc.spearman_matrix.items():
|
|
111
|
-
for col_b, val in row.items():
|
|
112
|
-
mirror = fc.spearman_matrix.get(col_b, {}).get(col_a)
|
|
113
|
-
assert mirror is not None and abs(val - mirror) < 1e-10, (
|
|
114
|
-
f"Spearman matrix asymmetry: [{col_a}][{col_b}]={val} "
|
|
115
|
-
f"vs [{col_b}][{col_a}]={mirror}"
|
|
116
|
-
)
|
|
117
|
-
|
|
118
|
-
# Suggested drop is a strict subset of its group's columns
|
|
119
|
-
for group in fc.near_redundancy_groups:
|
|
120
|
-
group_cols = set(group.columns)
|
|
121
|
-
drop_cols = set(group.suggested_drop)
|
|
122
|
-
assert drop_cols < group_cols, (
|
|
123
|
-
f"suggested_drop {drop_cols} is not a strict subset of "
|
|
124
|
-
f"group columns {group_cols}"
|
|
125
|
-
)
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
def test_column_handoffs(mixed_df):
|
|
129
|
-
result = StructuralProfiler(ProfileConfig()).profile(mixed_df)
|
|
130
|
-
|
|
131
|
-
stats_type_for = {
|
|
132
|
-
SemanticType.Numeric: NumericStats,
|
|
133
|
-
SemanticType.Categorical: CategoricalStats,
|
|
134
|
-
SemanticType.Datetime: DatetimeStats,
|
|
135
|
-
SemanticType.Boolean: BooleanStats,
|
|
136
|
-
}
|
|
137
|
-
|
|
138
|
-
for name, cp in result.columns.items():
|
|
139
|
-
expected_type = stats_type_for.get(cp.semantic_type)
|
|
140
|
-
if expected_type is None:
|
|
141
|
-
continue
|
|
142
|
-
|
|
143
|
-
assert cp.stats is not None, (
|
|
144
|
-
f"column '{name}' has semantic_type={cp.semantic_type} but stats is None"
|
|
145
|
-
)
|
|
146
|
-
assert isinstance(cp.stats, expected_type), (
|
|
147
|
-
f"column '{name}' has semantic_type={cp.semantic_type} "
|
|
148
|
-
f"but stats type is {type(cp.stats).__name__}, expected {expected_type.__name__}"
|
|
149
|
-
)
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
# ---------------------------------------------------------------------------
|
|
153
|
-
# Override: numeric column forced to Categorical via column_overrides
|
|
154
|
-
# ---------------------------------------------------------------------------
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
def test_column_override_changes_stats_type(override_df):
|
|
158
|
-
config = ProfileConfig(column_overrides={"score": SemanticType.Categorical})
|
|
159
|
-
result = StructuralProfiler(config).profile(override_df)
|
|
160
|
-
cp = result.columns["score"]
|
|
161
|
-
assert isinstance(cp.stats, CategoricalStats)
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
# ---------------------------------------------------------------------------
|
|
165
|
-
# Target profiling integration
|
|
166
|
-
# ---------------------------------------------------------------------------
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
def test_target_profiling_integration(target_df):
|
|
170
|
-
config = ProfileConfig(target_columns=["label"])
|
|
171
|
-
result = StructuralProfiler(config).profile(target_df)
|
|
172
|
-
assert "label" in result.targets
|
|
173
|
-
assert isinstance(result.targets["label"], TargetProfileResult)
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
# ---------------------------------------------------------------------------
|
|
177
|
-
# Empty DataFrame does not crash
|
|
178
|
-
# ---------------------------------------------------------------------------
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
def test_empty_dataframe_does_not_crash(empty_df):
|
|
182
|
-
result = StructuralProfiler(ProfileConfig()).profile(empty_df)
|
|
183
|
-
assert isinstance(result, StructuralProfileResult)
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
# ---------------------------------------------------------------------------
|
|
187
|
-
# Numeric handoff: float column produces NumericStats on ColumnProfile
|
|
188
|
-
# ---------------------------------------------------------------------------
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
def test_numeric_handoff(mixed_df):
|
|
192
|
-
result = StructuralProfiler(ProfileConfig()).profile(mixed_df)
|
|
193
|
-
cp = result.columns["income"]
|
|
194
|
-
assert cp.stats is not None
|
|
195
|
-
assert isinstance(cp.stats, NumericStats)
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
# ---------------------------------------------------------------------------
|
|
199
|
-
# Datetime handoff: date column produces DatetimeStats on ColumnProfile
|
|
200
|
-
# ---------------------------------------------------------------------------
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
def test_datetime_handoff(mixed_df):
|
|
204
|
-
result = StructuralProfiler(ProfileConfig()).profile(mixed_df)
|
|
205
|
-
cp = result.columns["joined"]
|
|
206
|
-
assert cp.stats is not None
|
|
207
|
-
assert isinstance(cp.stats, DatetimeStats)
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
# ---------------------------------------------------------------------------
|
|
211
|
-
# Missingness surfaced at column level for columns with nulls
|
|
212
|
-
# ---------------------------------------------------------------------------
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
def test_missingness_surfaced(mixed_df):
|
|
216
|
-
result = StructuralProfiler(ProfileConfig()).profile(mixed_df)
|
|
217
|
-
cp = result.columns["salary"] # salary has ~10 % nulls by construction
|
|
218
|
-
assert cp.missingness is not None
|
|
219
|
-
assert cp.missingness.standard_null_count > 0
|
|
File without changes
|
|
File without changes
|
|
@@ -1,81 +0,0 @@
|
|
|
1
|
-
from datetime import date, timedelta
|
|
2
|
-
|
|
3
|
-
import polars as pl
|
|
4
|
-
import pytest
|
|
5
|
-
|
|
6
|
-
_BASE_DATE = date(2023, 1, 1)
|
|
7
|
-
_N = 60
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
@pytest.fixture(scope="session")
|
|
11
|
-
def empty_df() -> pl.DataFrame:
|
|
12
|
-
return pl.DataFrame(
|
|
13
|
-
{
|
|
14
|
-
"score": pl.Series([], dtype=pl.Float64),
|
|
15
|
-
"count": pl.Series([], dtype=pl.Int64),
|
|
16
|
-
"category": pl.Series([], dtype=pl.Utf8),
|
|
17
|
-
"active": pl.Series([], dtype=pl.Boolean),
|
|
18
|
-
"event_date": pl.Series([], dtype=pl.Date),
|
|
19
|
-
}
|
|
20
|
-
)
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
@pytest.fixture(scope="session")
|
|
24
|
-
def all_null_df() -> pl.DataFrame:
|
|
25
|
-
nulls = [None] * _N
|
|
26
|
-
return pl.DataFrame(
|
|
27
|
-
{
|
|
28
|
-
"float_col": pl.Series(nulls, dtype=pl.Float64),
|
|
29
|
-
"int_col": pl.Series(nulls, dtype=pl.Int64),
|
|
30
|
-
"str_col": pl.Series(nulls, dtype=pl.Utf8),
|
|
31
|
-
"bool_col": pl.Series(nulls, dtype=pl.Boolean),
|
|
32
|
-
}
|
|
33
|
-
)
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
@pytest.fixture(scope="session")
|
|
37
|
-
def single_value_df() -> pl.DataFrame:
|
|
38
|
-
return pl.DataFrame(
|
|
39
|
-
{
|
|
40
|
-
"score": pl.Series([5.0] * _N, dtype=pl.Float64),
|
|
41
|
-
"count": pl.Series([1] * _N, dtype=pl.Int64),
|
|
42
|
-
"category": pl.Series(["X"] * _N, dtype=pl.Utf8),
|
|
43
|
-
"active": pl.Series([True] * _N, dtype=pl.Boolean),
|
|
44
|
-
}
|
|
45
|
-
)
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
@pytest.fixture(scope="session")
|
|
49
|
-
def single_row_df() -> pl.DataFrame:
|
|
50
|
-
return pl.DataFrame(
|
|
51
|
-
{
|
|
52
|
-
"score": pl.Series([42.0], dtype=pl.Float64),
|
|
53
|
-
"count": pl.Series([7], dtype=pl.Int64),
|
|
54
|
-
"category": pl.Series(["A"], dtype=pl.Utf8),
|
|
55
|
-
"active": pl.Series([True], dtype=pl.Boolean),
|
|
56
|
-
"event_date": pl.Series([_BASE_DATE], dtype=pl.Date),
|
|
57
|
-
}
|
|
58
|
-
)
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
@pytest.fixture(scope="session")
|
|
62
|
-
def normal_mixed_df() -> pl.DataFrame:
|
|
63
|
-
_CATEGORIES = ["A", "B", "C", "D", "E"]
|
|
64
|
-
|
|
65
|
-
scores = [round(1.5 + i * 1.7 + (i % 7) * 0.3, 2) for i in range(_N)]
|
|
66
|
-
counts = [i % 20 for i in range(_N)]
|
|
67
|
-
categories = [_CATEGORIES[i % len(_CATEGORIES)] for i in range(_N)]
|
|
68
|
-
active = [i % 2 == 0 for i in range(_N)]
|
|
69
|
-
dates = [_BASE_DATE + timedelta(days=i) for i in range(_N)]
|
|
70
|
-
salary = [None if i % 10 == 0 else round(30_000.0 + i * 500.0, 2) for i in range(_N)]
|
|
71
|
-
|
|
72
|
-
return pl.DataFrame(
|
|
73
|
-
{
|
|
74
|
-
"score": pl.Series(scores, dtype=pl.Float64),
|
|
75
|
-
"count": pl.Series(counts, dtype=pl.Int64),
|
|
76
|
-
"category": pl.Series(categories, dtype=pl.Utf8),
|
|
77
|
-
"active": pl.Series(active, dtype=pl.Boolean),
|
|
78
|
-
"event_date": pl.Series(dates, dtype=pl.Date),
|
|
79
|
-
"salary": pl.Series(salary, dtype=pl.Float64),
|
|
80
|
-
}
|
|
81
|
-
)
|
|
@@ -1,91 +0,0 @@
|
|
|
1
|
-
import polars as pl
|
|
2
|
-
|
|
3
|
-
from ....profiling._boolean_profiler import BooleanProfiler
|
|
4
|
-
from ....profiling._boolean_config import BooleanProfileResult, BooleanStats
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
# ---------------------------------------------------------------------------
|
|
8
|
-
# Result type & analysed_columns
|
|
9
|
-
# ---------------------------------------------------------------------------
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
def test_result_type_and_analysed_columns():
|
|
13
|
-
df = pl.DataFrame(
|
|
14
|
-
{
|
|
15
|
-
"flag": pl.Series([True, False, True], dtype=pl.Boolean),
|
|
16
|
-
"score": pl.Series([1.0, 2.0, 3.0], dtype=pl.Float64),
|
|
17
|
-
}
|
|
18
|
-
)
|
|
19
|
-
result = BooleanProfiler().profile(df, ["flag", "score"])
|
|
20
|
-
assert isinstance(result, BooleanProfileResult)
|
|
21
|
-
assert "flag" in result.analysed_columns
|
|
22
|
-
assert "score" not in result.analysed_columns
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
# ---------------------------------------------------------------------------
|
|
26
|
-
# Counts
|
|
27
|
-
# ---------------------------------------------------------------------------
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
def test_true_false_count_sum_equals_non_null_count():
|
|
31
|
-
values = [True, False, True, None, True, False, None]
|
|
32
|
-
df = pl.DataFrame({"flag": pl.Series(values, dtype=pl.Boolean)})
|
|
33
|
-
stats = BooleanProfiler().profile(df, ["flag"]).columns["flag"]
|
|
34
|
-
non_null_count = df["flag"].drop_nulls().len()
|
|
35
|
-
assert stats.true_count + stats.false_count == non_null_count
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
# ---------------------------------------------------------------------------
|
|
39
|
-
# Ratios
|
|
40
|
-
# ---------------------------------------------------------------------------
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
def test_true_ratio_plus_false_ratio_equals_one():
|
|
44
|
-
values = [True, True, False, True, False, True]
|
|
45
|
-
df = pl.DataFrame({"flag": pl.Series(values, dtype=pl.Boolean)})
|
|
46
|
-
stats = BooleanProfiler().profile(df, ["flag"]).columns["flag"]
|
|
47
|
-
assert abs(stats.true_ratio + stats.false_ratio - 1.0) < 1e-10
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
# ---------------------------------------------------------------------------
|
|
51
|
-
# Mode
|
|
52
|
-
# ---------------------------------------------------------------------------
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
def test_tied_column_mode_is_none():
|
|
56
|
-
values = [True] * 5 + [False] * 5
|
|
57
|
-
df = pl.DataFrame({"flag": pl.Series(values, dtype=pl.Boolean)})
|
|
58
|
-
stats = BooleanProfiler().profile(df, ["flag"]).columns["flag"]
|
|
59
|
-
assert stats.mode is None
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
# ---------------------------------------------------------------------------
|
|
63
|
-
# Integer {0, 1} columns
|
|
64
|
-
# ---------------------------------------------------------------------------
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
def test_integer_01_eligible_with_correct_counts_and_ratios():
|
|
68
|
-
values = [1, 0, 1, 1, 0, None]
|
|
69
|
-
df = pl.DataFrame({"bin": pl.Series(values, dtype=pl.Int64)})
|
|
70
|
-
result = BooleanProfiler().profile(df, ["bin"])
|
|
71
|
-
assert "bin" in result.analysed_columns
|
|
72
|
-
stats = result.columns["bin"]
|
|
73
|
-
non_null = [v for v in values if v is not None]
|
|
74
|
-
expected_true = sum(non_null)
|
|
75
|
-
expected_false = len(non_null) - expected_true
|
|
76
|
-
assert stats.true_count == expected_true
|
|
77
|
-
assert stats.false_count == expected_false
|
|
78
|
-
assert abs(stats.true_ratio + stats.false_ratio - 1.0) < 1e-10
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
# ---------------------------------------------------------------------------
|
|
82
|
-
# All-null boolean column
|
|
83
|
-
# ---------------------------------------------------------------------------
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
def test_all_null_boolean_returns_default_stats_without_crashing():
|
|
87
|
-
df = pl.DataFrame({"flag": pl.Series([None, None, None], dtype=pl.Boolean)})
|
|
88
|
-
stats = BooleanProfiler().profile(df, ["flag"]).columns["flag"]
|
|
89
|
-
assert isinstance(stats, BooleanStats)
|
|
90
|
-
assert stats.true_count == 0
|
|
91
|
-
assert stats.false_count == 0
|