dataforge-ml 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataforge_ml-0.1.0.dist-info/METADATA +34 -0
- dataforge_ml-0.1.0.dist-info/RECORD +54 -0
- dataforge_ml-0.1.0.dist-info/WHEEL +5 -0
- dataforge_ml-0.1.0.dist-info/licenses/LICENSE +21 -0
- dataforge_ml-0.1.0.dist-info/top_level.txt +5 -0
- models/__init__.py +0 -0
- models/_data_structure.py +7 -0
- models/_data_types.py +12 -0
- profiling/__init__.py +35 -0
- profiling/_base.py +101 -0
- profiling/_boolean_config.py +37 -0
- profiling/_boolean_profiler.py +191 -0
- profiling/_categorical.py +315 -0
- profiling/_categorical_config.py +87 -0
- profiling/_correlation_config.py +225 -0
- profiling/_correlation_profiler.py +544 -0
- profiling/_datetime_config.py +98 -0
- profiling/_datetime_profiler.py +406 -0
- profiling/_missingness_config.py +137 -0
- profiling/_missingness_profiler.py +252 -0
- profiling/_numeric_config.py +116 -0
- profiling/_numeric_profiler.py +403 -0
- profiling/_tabular.py +249 -0
- profiling/_target_config.py +74 -0
- profiling/_target_profiler.py +156 -0
- profiling/_text_config.py +40 -0
- profiling/_text_profiler.py +194 -0
- profiling/_type_detector.py +463 -0
- profiling/config.py +236 -0
- profiling/structural.py +280 -0
- splitting/__init__.py +4 -0
- splitting/_config.py +56 -0
- splitting/_splitter.py +202 -0
- tests/__init__.py +0 -0
- tests/conftest.py +7 -0
- tests/integration/__init__.py +0 -0
- tests/integration/conftest.py +82 -0
- tests/integration/test_structural_end_to_end.py +219 -0
- tests/unit/__init__.py +0 -0
- tests/unit/profiling/__init__.py +0 -0
- tests/unit/profiling/conftest.py +81 -0
- tests/unit/profiling/test_boolean_profiler.py +91 -0
- tests/unit/profiling/test_categorical_profiler.py +182 -0
- tests/unit/profiling/test_correlation_profiler.py +124 -0
- tests/unit/profiling/test_datetime_profiler.py +133 -0
- tests/unit/profiling/test_missingness_profiler.py +51 -0
- tests/unit/profiling/test_numeric_profiler.py +212 -0
- tests/unit/profiling/test_target_profiler.py +44 -0
- tests/unit/profiling/test_text_profiler.py +61 -0
- tests/unit/profiling/test_type_detector.py +32 -0
- tests/unit/splitting/__init__.py +0 -0
- tests/unit/splitting/test_data_splitter.py +417 -0
- utils/__init__.py +0 -0
- utils/data_loader.py +110 -0
profiling/structural.py
ADDED
|
@@ -0,0 +1,280 @@
|
|
|
1
|
+
"""
|
|
2
|
+
StructuralProfiler – unified Phase 1 entry point.
|
|
3
|
+
|
|
4
|
+
Execution order inside profile(df):
|
|
5
|
+
1. ModalityProfiler → result.dataset (DatasetStats)
|
|
6
|
+
2. MissingnessProfiler → ColumnProfile.missingness + dataset.missingness_matrix
|
|
7
|
+
3. Row-missingness dist → dataset.row_distribution
|
|
8
|
+
4. TypeDetector → ColumnProfile.semantic_type / type_flags / dtypes
|
|
9
|
+
5. column_overrides → replace SemanticType on existing ColumnProfiles
|
|
10
|
+
6. ColumnTypeProfiler → route each column to its profiler by SemanticType;
|
|
11
|
+
Identifier columns: skip, stats stays None
|
|
12
|
+
7. target_columns → TargetProfiler; mark ColumnProfile.is_target=True
|
|
13
|
+
8. Correlation → if compute_correlation=True:
|
|
14
|
+
a. profile_features() → dataset.feature_correlation (computed once)
|
|
15
|
+
b. profile_target() → dataset.target_correlations[target]
|
|
16
|
+
(once per declared target column)
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
import math
|
|
22
|
+
from typing import Any
|
|
23
|
+
|
|
24
|
+
import polars as pl
|
|
25
|
+
|
|
26
|
+
from ._base import ModalityProfiler, ColumnBatchProfiler
|
|
27
|
+
from ._tabular import TabularProfiler
|
|
28
|
+
from ._categorical import CategoricalProfiler
|
|
29
|
+
from ._datetime_profiler import DatetimeProfiler
|
|
30
|
+
from ._numeric_profiler import NumericProfiler
|
|
31
|
+
from ._boolean_profiler import BooleanProfiler
|
|
32
|
+
from ._text_profiler import TextProfiler
|
|
33
|
+
from ._missingness_profiler import MissingnessProfiler
|
|
34
|
+
from ._target_profiler import TargetProfiler
|
|
35
|
+
from ._correlation_profiler import CorrelationProfiler
|
|
36
|
+
from ._type_detector import TypeDetector
|
|
37
|
+
from .config import (
|
|
38
|
+
ProfileConfig,
|
|
39
|
+
ColumnProfile,
|
|
40
|
+
StructuralProfileResult,
|
|
41
|
+
RowMissingnessDistribution,
|
|
42
|
+
SemanticType,
|
|
43
|
+
Modality,
|
|
44
|
+
)
|
|
45
|
+
|
|
46
|
+
# Fraction of the analysed columns used to derive the "drop candidate" cut-off
# in StructuralProfiler._compute_row_distribution: a row is counted once its
# number of missing cells reaches ceil(n_cols * _ROW_DROP_THRESHOLD).
_ROW_DROP_THRESHOLD = 0.50

# ---------------------------------------------------------------------------
# Registry: SemanticType → column profiler class
#
# StructuralProfiler.profile() instantiates the registered class once per
# SemanticType (with config=...) and calls it once with the DataFrame and the
# list of all columns of that type.
# SemanticType.Identifier is intentionally absent: identifier columns are
# filtered out before the lookup. Any other SemanticType without an entry
# here is skipped as well.
# NOTE(review): the original header stated that the profilers are stateless
# between calls — not verifiable from this module; confirm before reusing
# instances across runs.
# ---------------------------------------------------------------------------
_COLUMN_PROFILER_REGISTRY: dict[SemanticType, type[ColumnBatchProfiler]] = {  # type: ignore[type-arg]
    SemanticType.Numeric: NumericProfiler,
    SemanticType.Categorical: CategoricalProfiler,
    SemanticType.Datetime: DatetimeProfiler,
    SemanticType.Boolean: BooleanProfiler,
    SemanticType.Text: TextProfiler,
}
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class StructuralProfiler:
    """Run the structural profiling steps over a Polars DataFrame.

    ``profile`` executes, in order: the modality profiler, the missingness
    pre-pass, the row-missingness distribution, type detection,
    ``column_overrides``, per-type column profiling, target profiling and
    (when enabled) correlation analysis, collecting everything in a
    ``StructuralProfileResult``.

    Parameters
    ----------
    config : ProfileConfig, optional
        Profiling options. A default ``ProfileConfig()`` is used when omitted.

    Raises
    ------
    NotImplementedError
        If ``config.modality`` is anything other than ``Modality.Tabular``.
    """

    def __init__(self, config: ProfileConfig | None = None) -> None:
        self.config = config or ProfileConfig()

        if self.config.modality == Modality.Tabular:
            self.modality_profiler: ModalityProfiler = TabularProfiler(self.config)
        else:
            raise NotImplementedError(
                f"modality {self.config.modality} not supported yet"
            )

    # ------------------------------------------------------------------
    # Public entry point
    # ------------------------------------------------------------------

    def profile(self, data: Any) -> StructuralProfileResult:
        """Profile ``data`` and return the combined structural result.

        Columns listed in ``config.exclude_columns`` are left out of every
        per-column step. A failure inside one semantic-type profiler does not
        abort the run: it is logged and the remaining steps still execute.

        Parameters
        ----------
        data : pl.DataFrame
            The frame to profile.

        Returns
        -------
        StructuralProfileResult
            Dataset-level stats in ``result.dataset``, per-column profiles in
            ``result.columns`` and per-target analysis in ``result.targets``.

        Raises
        ------
        TypeError
            If ``data`` is not a Polars DataFrame.
        """
        # Local import: only needed for the best-effort failure report below.
        import logging

        if not isinstance(data, pl.DataFrame):
            raise TypeError(
                f"StructuralProfiler expects a Polars DataFrame, "
                f"got {type(data).__name__}."
            )

        result = StructuralProfileResult()

        active_cols = [c for c in data.columns if c not in self.config.exclude_columns]

        # ── 1. Modality profiler ─────────────────────────────────────────
        # Replaces default DatasetStats with the real one (row_count, memory,
        # duplicates, etc.). Must run before anything writes to result.dataset.
        result.dataset = self.modality_profiler.profile(data)

        # ── 2. Missingness pre-pass ──────────────────────────────────────
        # setdefault creates ColumnProfile entries; subsequent steps mutate
        # the same objects via the same setdefault pattern.
        missingness_result = MissingnessProfiler(config=self.config).profile(
            data, columns=active_cols
        )
        for col_name in missingness_result.analysed_columns:
            cp = result.columns.setdefault(col_name, ColumnProfile(name=col_name))
            cp.missingness = missingness_result.columns.get(col_name)

        if missingness_result.correlation_matrix:
            result.dataset.missingness_matrix = missingness_result.correlation_matrix

        # ── 3. Row-missingness distribution ─────────────────────────────
        result.dataset.row_distribution = self._compute_row_distribution(
            df=data,
            cols=active_cols,
            n_rows=data.height,
            overrides=self.config.column_overrides,
        )

        # ── 4. Type detection ────────────────────────────────────────────
        # setdefault returns the existing ColumnProfile from step 2, so
        # missingness and type info land on the same object.
        type_info = TypeDetector(columns=active_cols).detect(data)
        for col_name, info in type_info.items():
            cp = result.columns.setdefault(col_name, ColumnProfile(name=col_name))
            cp.semantic_type = info.semantic_type
            cp.type_flags = list(info.flags)
            cp.original_dtype = info.original_dtype
            cp.inferred_dtype = info.inferred_dtype

        # ── 5. Apply column_overrides ────────────────────────────────────
        # Overrides for columns that have no ColumnProfile (excluded or
        # non-existent columns) are silently ignored.
        for col_name, override_type in self.config.column_overrides.items():
            if col_name in result.columns:
                result.columns[col_name].semantic_type = override_type

        # ── 6. Per-column profiling routed by SemanticType ───────────────
        # Batch all columns of the same SemanticType together and call each
        # profiler once with (df, column_list) — matching the profiler API.
        type_to_cols: dict[SemanticType, list[str]] = {}
        for col_name in active_cols:
            cp = result.columns.get(col_name)
            if cp is None or cp.semantic_type is None:
                continue
            if cp.semantic_type == SemanticType.Identifier:
                # Identifier columns are never routed to a profiler.
                continue
            type_to_cols.setdefault(cp.semantic_type, []).append(col_name)

        for sem_type, cols in type_to_cols.items():
            profiler_cls = _COLUMN_PROFILER_REGISTRY.get(sem_type)  # type: ignore[arg-type]
            if profiler_cls is None:
                continue
            profiler = profiler_cls(config=self.config)
            try:
                batch = profiler.profile(data, columns=cols)
                for col_name in batch.analysed_columns:
                    if col_name in result.columns:
                        result.columns[col_name].stats = batch.columns.get(col_name)
            except Exception:
                # Best-effort by design: one failing profiler must not abort
                # the whole run. Report it instead of discarding the error so
                # missing stats can be traced back to their cause.
                logging.getLogger(__name__).warning(
                    "%s failed for %s columns %s; stats not assigned for "
                    "the affected columns",
                    profiler_cls.__name__,
                    sem_type,
                    cols,
                    exc_info=True,
                )

        # ── 7. Target columns ────────────────────────────────────────────
        # TargetProfiler produces target-specific analysis stored in
        # result.targets. cp.stats is NOT overwritten — step 6 already set it.
        if self.config.target_columns:
            for target in self.config.target_columns:
                if target not in data.columns:
                    continue
                target_result = TargetProfiler(
                    target_column=target,
                    config=self.config,
                ).profile(data)
                result.targets[target] = target_result

                # setdefault returns the existing ColumnProfile when there is
                # one, and creates a bare profile otherwise (e.g. the target
                # is listed in exclude_columns).
                cp = result.columns.setdefault(target, ColumnProfile(name=target))
                cp.is_target = True

        # ── 8. Correlation ───────────────────────────────────────────────
        if self.config.compute_correlation:
            # Resolve column lists by detected SemanticType (post-override).
            numeric_cols = [
                c
                for c in active_cols
                if result.columns.get(c)
                and result.columns[c].semantic_type == SemanticType.Numeric
            ]
            categorical_cols = [
                c
                for c in active_cols
                if result.columns.get(c)
                and result.columns[c].semantic_type == SemanticType.Categorical
            ]

            corr_profiler = CorrelationProfiler(
                numeric_columns=numeric_cols,
                categorical_columns=categorical_cols,
                config=self.config,
            )

            # 8a. Feature-feature matrices — computed ONCE, target-independent.
            feature_corr = corr_profiler.profile_features(
                data, numeric_cols
            )
            result.dataset.feature_correlation = feature_corr

            # 8b. Per-target analysis, reusing feature_corr from 8a.
            # `or ()` mirrors the falsy guard of step 7 so an unset
            # target_columns does not raise when correlation is enabled.
            for target in self.config.target_columns or ():
                if target not in data.columns:
                    continue
                result.dataset.target_correlations[target] = (
                    corr_profiler.profile_target(
                        data, feature_corr, numeric_cols, categorical_cols, target
                    )
                )

        return result

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _compute_row_distribution(
        df: pl.DataFrame,
        cols: list[str],
        n_rows: int,
        overrides: dict[str, SemanticType],
    ) -> RowMissingnessDistribution:
        """Summarise how many cells are missing per row across ``cols``.

        A cell counts as missing when it is null; for sentinel-eligible
        columns also when it is blank after stripping or its upper-cased
        value is one of ``_SENTINEL_STRINGS``; for inf-eligible columns also
        when it is NaN or infinite. Eligibility is decided by the helpers
        imported from ``_missingness_profiler``.

        Parameters
        ----------
        df : pl.DataFrame
            Frame containing every column named in ``cols``.
        cols : list[str]
            Columns to include in the per-row count.
        n_rows : int
            Row count used as the denominator of every fraction.
        overrides : dict[str, SemanticType]
            Per-column semantic-type overrides, forwarded to
            ``_sentinel_eligible``.

        Returns
        -------
        RowMissingnessDistribution
            Fractions of rows with 0, 1–2, 3–5 and more than 5 missing cells,
            plus the count and fraction of rows whose missing-cell count is at
            least ``ceil(len(cols) * _ROW_DROP_THRESHOLD)``. A default
            instance is returned when ``n_rows`` is 0 or ``cols`` is empty.
        """
        from ._missingness_profiler import (
            _sentinel_eligible,
            _inf_eligible,
            _SENTINEL_STRINGS,
        )

        dist = RowMissingnessDistribution()
        if n_rows == 0 or not cols:
            return dist

        n_cols = len(cols)
        # Loop-invariant: build the sentinel list once, not once per column.
        sentinel_values = list(_SENTINEL_STRINGS)
        per_col_exprs = []

        for col_name in cols:
            dtype = df[col_name].dtype
            override = overrides.get(col_name)
            null_e = pl.col(col_name).is_null()

            if _sentinel_eligible(dtype, override):
                eff = (
                    null_e
                    | (pl.col(col_name).str.strip_chars() == "")
                    | pl.col(col_name).str.to_uppercase().is_in(sentinel_values)
                )
            elif _inf_eligible(dtype):
                eff = (
                    null_e | pl.col(col_name).is_nan() | pl.col(col_name).is_infinite()
                )
            else:
                eff = null_e

            # 1 = missing, 0 = present, so a horizontal sum counts per row.
            per_col_exprs.append(eff.cast(pl.Int8).alias(col_name))

        row_missing: pl.Series = df.select(per_col_exprs).select(
            pl.sum_horizontal(pl.all()).alias("row_missing")
        )["row_missing"]

        half_threshold = math.ceil(n_cols * _ROW_DROP_THRESHOLD)

        dist.pct_zero_missing = float((row_missing == 0).sum()) / n_rows
        dist.pct_one_to_two = (
            float(((row_missing >= 1) & (row_missing <= 2)).sum()) / n_rows
        )
        dist.pct_three_to_five = (
            float(((row_missing >= 3) & (row_missing <= 5)).sum()) / n_rows
        )
        dist.pct_over_five = float((row_missing > 5).sum()) / n_rows
        # Note the inclusive comparison: rows exactly at the threshold count.
        dist.drop_candidate_row_count = int((row_missing >= half_threshold).sum())
        dist.pct_over_half_missing = dist.drop_candidate_row_count / n_rows

        return dist
|
splitting/__init__.py
ADDED
splitting/_config.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
|
|
5
|
+
import polars as pl
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclass
class SplitResult:
    """Outcome of a single train/test split.

    Attributes
    ----------
    train : pl.DataFrame
        Rows assigned to the training partition.
    test : pl.DataFrame
        Rows assigned to the test (hold-out) partition.
    train_size : int
        Row count of ``train``.
    test_size : int
        Row count of ``test``.
    train_ratio : float
        Share of all rows that ended up in ``train`` (0.0–1.0).
    test_ratio : float
        Share of all rows that ended up in ``test`` (0.0–1.0).
    """

    train: pl.DataFrame
    test: pl.DataFrame
    train_size: int
    test_size: int
    train_ratio: float
    test_ratio: float
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass
class FoldResult:
    """One train/validation pair of a cross-validation run.

    Attributes
    ----------
    train : pl.DataFrame
        Rows used for training in this fold.
    val : pl.DataFrame
        Rows held out for validation in this fold.
    fold_index : int
        Position of the fold within the CV run, counted from zero.
    train_size : int
        Row count of ``train``.
    val_size : int
        Row count of ``val``.
    """

    train: pl.DataFrame
    val: pl.DataFrame
    fold_index: int
    train_size: int
    val_size: int
|
splitting/_splitter.py
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
"""
|
|
2
|
+
DataSplitter: constructor, random_split, time_split, and kfold implementation.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import math
|
|
8
|
+
from typing import Any, List, Optional
|
|
9
|
+
|
|
10
|
+
import polars as pl
|
|
11
|
+
from sklearn.model_selection import KFold, ShuffleSplit, StratifiedKFold, StratifiedShuffleSplit
|
|
12
|
+
|
|
13
|
+
from ._config import FoldResult, SplitResult
|
|
14
|
+
|
|
15
|
+
# Sentinel default for the ``stratify`` parameters below: it is compared by
# identity (``is _UNSET``) so "argument not passed" can be told apart from an
# explicit True/False/None.
_UNSET = object()
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class DataSplitter:
    """
    Splits a Polars DataFrame into train/test or cross-validation folds.

    Parameters
    ----------
    df : pl.DataFrame
        Source data. Must be non-empty.
    target : str, optional
        Name of the target column. Required for stratified splits.
    random_seed : int, optional
        Seed forwarded to sklearn splitters for reproducibility.

    Raises
    ------
    TypeError
        If ``df`` is not a Polars DataFrame.
    ValueError
        If ``df`` is empty or ``target`` is not one of its columns.
    """

    def __init__(
        self,
        df: pl.DataFrame,
        target: Optional[str] = None,
        random_seed: Optional[int] = None,
    ) -> None:
        if not isinstance(df, pl.DataFrame):
            raise TypeError(f"df must be a polars DataFrame, got {type(df).__name__}")
        if df.is_empty():
            raise ValueError("df must not be empty")
        if target is not None and target not in df.columns:
            raise ValueError(f"target column '{target}' not found in df")

        self._df = df
        self._target = target
        self._random_seed = random_seed

    def _resolve_stratify(self, stratify: Any) -> bool:
        """Turn the ``stratify`` argument into a definite boolean.

        An omitted argument (``_UNSET``) means "stratify when a target column
        was configured". A truthy value without a target is rejected.
        """
        if stratify is _UNSET:
            return self._target is not None
        if stratify and self._target is None:
            raise ValueError(
                "stratify=True requires a target column; "
                "pass target= when constructing DataSplitter"
            )
        return bool(stratify)

    def random_split(self, test_size: float, stratify=_UNSET) -> SplitResult:
        """
        Return a single randomised train/test split.

        Parameters
        ----------
        test_size : float
            Fraction of rows to reserve for the test set (0 < test_size < 1).
            The value is forwarded unchanged to the sklearn splitter.
        stratify : bool, optional
            Whether to stratify on the target column.
            Defaults to True when a target was provided, False otherwise.

        Returns
        -------
        SplitResult

        Raises
        ------
        ValueError
            If ``stratify`` is truthy but no target column was configured.
        """
        if self._resolve_stratify(stratify):
            splitter = StratifiedShuffleSplit(
                n_splits=1, test_size=test_size, random_state=self._random_seed
            )
            y = self._df[self._target].to_numpy()
            train_idx, test_idx = next(splitter.split(self._df, y))
        else:
            splitter = ShuffleSplit(
                n_splits=1, test_size=test_size, random_state=self._random_seed
            )
            train_idx, test_idx = next(splitter.split(self._df))

        train_df = self._df[train_idx]
        test_df = self._df[test_idx]
        total = len(self._df)

        return SplitResult(
            train=train_df,
            test=test_df,
            train_size=len(train_df),
            test_size=len(test_df),
            train_ratio=len(train_df) / total,
            test_ratio=len(test_df) / total,
        )

    def time_split(
        self,
        time_column: str,
        test_size: Optional[float] = None,
        cutoff: Optional[Any] = None,
    ) -> SplitResult:
        """
        Return a chronological train/test split with no temporal leakage.

        The DataFrame is sorted ascending by ``time_column`` before splitting.
        ``cutoff`` takes priority over ``test_size`` when both are supplied.

        Parameters
        ----------
        time_column : str
            Column to sort by. Must exist in the DataFrame.
        test_size : float, optional
            Fraction of rows (from the end of the sorted series) to use as
            the test set. ``floor(len(df) * test_size)`` rows are taken.
            Must lie in the range 0–1 when it is used.
        cutoff : scalar, optional
            Threshold value. Rows where ``time_column >= cutoff`` go to
            test; all earlier rows go to train.

        Returns
        -------
        SplitResult
            Ratios are relative to the full row count of the source frame.

        Raises
        ------
        ValueError
            If ``time_column`` is missing, if neither ``test_size`` nor
            ``cutoff`` is given, or if ``test_size`` is used and lies
            outside the range 0–1.
        """
        if time_column not in self._df.columns:
            raise ValueError(f"time_column '{time_column}' not found in df")
        if test_size is None and cutoff is None:
            raise ValueError("Either test_size or cutoff must be provided")

        sorted_df = self._df.sort(time_column)
        total = len(sorted_df)

        if cutoff is not None:
            # NOTE(review): rows whose time value is null presumably satisfy
            # neither comparison and would land in neither partition —
            # confirm against Polars null semantics if nulls can occur.
            train_df = sorted_df.filter(pl.col(time_column) < cutoff)
            test_df = sorted_df.filter(pl.col(time_column) >= cutoff)
        else:
            # A fraction above 1 would make n_train negative, and negative
            # slice bounds count from the end, silently yielding partitions
            # that do not match the request. Reject such values up front
            # (this also rejects NaN, which fails both comparisons).
            if not 0 <= test_size <= 1:
                raise ValueError(
                    f"test_size must be between 0 and 1, got {test_size!r}"
                )
            n_test = math.floor(total * test_size)
            n_train = total - n_test
            train_df = sorted_df[:n_train]
            test_df = sorted_df[n_train:]

        return SplitResult(
            train=train_df,
            test=test_df,
            train_size=len(train_df),
            test_size=len(test_df),
            train_ratio=len(train_df) / total,
            test_ratio=len(test_df) / total,
        )

    def kfold(self, k: int, stratify=_UNSET) -> List[FoldResult]:
        """
        Return a list of ``k`` cross-validation folds.

        Folds are produced by a shuffling sklearn splitter seeded with
        ``random_seed``.

        Parameters
        ----------
        k : int
            Number of folds.
        stratify : bool, optional
            Whether to stratify on the target column.
            Defaults to True when a target was provided, False otherwise.

        Returns
        -------
        list[FoldResult]
            Exactly ``k`` folds with zero-based ``fold_index``.

        Raises
        ------
        ValueError
            If ``stratify`` is truthy but no target column was configured.
        """
        if self._resolve_stratify(stratify):
            folder = StratifiedKFold(
                n_splits=k, shuffle=True, random_state=self._random_seed
            )
            y = self._df[self._target].to_numpy()
            splits = folder.split(self._df, y)
        else:
            folder = KFold(
                n_splits=k, shuffle=True, random_state=self._random_seed
            )
            splits = folder.split(self._df)

        folds: List[FoldResult] = []
        for fold_index, (train_idx, val_idx) in enumerate(splits):
            train_df = self._df[train_idx]
            val_df = self._df[val_idx]
            folds.append(
                FoldResult(
                    train=train_df,
                    val=val_df,
                    fold_index=fold_index,
                    train_size=len(train_df),
                    val_size=len(val_df),
                )
            )

        return folds
|
tests/__init__.py
ADDED
|
File without changes
|
tests/conftest.py
ADDED
|
File without changes
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
|
|
2
|
+
import polars as pl
|
|
3
|
+
import pytest
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@pytest.fixture(scope="session")
def override_df():
    """Session-scoped frame: 60 rows of a float ``score`` and a cycling ``category``."""
    n = 60
    # 0.0, 1.0, ..., 59.0
    scores = list(map(float, range(n)))
    # "A", "B", "C" repeated until all n rows are filled.
    categories = ["A", "B", "C"] * (n // 3)
    columns = {
        "score": pl.Series(scores, dtype=pl.Float64),
        "category": pl.Series(categories, dtype=pl.Utf8),
    }
    return pl.DataFrame(columns)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@pytest.fixture(scope="session")
def target_df(rng):
    """Session-scoped frame: 100 rows with a random ``feature`` and an alternating ``label``.

    ``rng`` is another fixture; presumably a NumPy random generator — it is
    only required to provide ``normal(...)`` here.
    """
    n = 100
    feature_values = rng.normal(0, 1, size=n).tolist()
    # "pos", "neg" alternating over all n rows.
    label_values = ["pos", "neg"] * (n // 2)
    columns = {
        "feature": pl.Series(feature_values, dtype=pl.Float64),
        "label": pl.Series(label_values, dtype=pl.Utf8),
    }
    return pl.DataFrame(columns)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@pytest.fixture(scope="session")
def empty_df():
    """Session-scoped frame with zero rows: a Float64 ``x`` and a Utf8 ``y``."""
    x_col = pl.Series([], dtype=pl.Float64)
    y_col = pl.Series([], dtype=pl.Utf8)
    return pl.DataFrame({"x": x_col, "y": y_col})
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@pytest.fixture(scope="session")
def text_df():
    """Session-scoped frame: 200 rows of long sentences in a single ``review`` column."""
    n = 200
    topics = ["science", "art", "history", "technology", "nature", "music"]

    # One sentence per row; the topic cycles and the row index keeps each
    # sentence distinct.
    texts = []
    for i in range(n):
        texts.append(
            f"A detailed description covering the topic of {topics[i % len(topics)]} "
            f"with multiple words that comfortably exceed the free-text threshold in row {i}"
        )

    review = pl.Series(texts, dtype=pl.Utf8)
    return pl.DataFrame({"review": review})
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
@pytest.fixture(scope="session")
def mixed_df(rng):
    """Session-scoped 300-row frame mixing several column kinds.

    Columns: integer ``age``, float ``income`` derived from age plus noise,
    float ``salary`` with roughly 10 % of values set to null, a cycling
    ``country`` code, a unique ``name`` per row, boolean ``is_active`` and a
    ``joined`` date within 1460 days of 2020-01-01.

    ``rng`` is another fixture; presumably a NumPy random generator. The
    draws below are made in a fixed order so the data stays reproducible
    for a given generator state.
    """
    from datetime import date, timedelta

    n = 300

    # Random draws 1-2: age and the income derived from it.
    age = rng.integers(18, 75, size=n)
    income = age * 1200 + rng.normal(0, 5000, size=n)

    # Random draws 3-4: salary values, then the mask that blanks some out.
    raw_salary = rng.normal(50_000, 15_000, size=n).tolist()
    null_mask = rng.random(n) < 0.10
    salary = [
        None if masked else value for value, masked in zip(raw_salary, null_mask)
    ]

    # Deterministic columns: country codes cycle, names are unique per row.
    country_choices = ["US", "UK", "CA", "AU", "DE"]
    repeats = n // len(country_choices) + 1
    country = (country_choices * repeats)[:n]
    names = [f"person_{i}" for i in range(n)]

    # Random draw 5: activity flags.
    is_active = list(map(bool, rng.integers(0, 2, size=n)))

    # Random draw 6: join dates as day offsets from the base date.
    base = date(2020, 1, 1)
    day_offsets = rng.integers(0, 1460, size=n)
    joined = [base + timedelta(days=int(d)) for d in day_offsets]

    columns = {
        "age": pl.Series(age.tolist(), dtype=pl.Int64),
        "income": pl.Series(income.tolist(), dtype=pl.Float64),
        "salary": pl.Series(salary, dtype=pl.Float64),
        "country": pl.Series(country, dtype=pl.Utf8),
        "name": pl.Series(names, dtype=pl.Utf8),
        "is_active": pl.Series(is_active, dtype=pl.Boolean),
        "joined": pl.Series(joined, dtype=pl.Date),
    }
    return pl.DataFrame(columns)
|