dataforge-ml 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. dataforge_ml-0.1.0.dist-info/METADATA +34 -0
  2. dataforge_ml-0.1.0.dist-info/RECORD +54 -0
  3. dataforge_ml-0.1.0.dist-info/WHEEL +5 -0
  4. dataforge_ml-0.1.0.dist-info/licenses/LICENSE +21 -0
  5. dataforge_ml-0.1.0.dist-info/top_level.txt +5 -0
  6. models/__init__.py +0 -0
  7. models/_data_structure.py +7 -0
  8. models/_data_types.py +12 -0
  9. profiling/__init__.py +35 -0
  10. profiling/_base.py +101 -0
  11. profiling/_boolean_config.py +37 -0
  12. profiling/_boolean_profiler.py +191 -0
  13. profiling/_categorical.py +315 -0
  14. profiling/_categorical_config.py +87 -0
  15. profiling/_correlation_config.py +225 -0
  16. profiling/_correlation_profiler.py +544 -0
  17. profiling/_datetime_config.py +98 -0
  18. profiling/_datetime_profiler.py +406 -0
  19. profiling/_missingness_config.py +137 -0
  20. profiling/_missingness_profiler.py +252 -0
  21. profiling/_numeric_config.py +116 -0
  22. profiling/_numeric_profiler.py +403 -0
  23. profiling/_tabular.py +249 -0
  24. profiling/_target_config.py +74 -0
  25. profiling/_target_profiler.py +156 -0
  26. profiling/_text_config.py +40 -0
  27. profiling/_text_profiler.py +194 -0
  28. profiling/_type_detector.py +463 -0
  29. profiling/config.py +236 -0
  30. profiling/structural.py +280 -0
  31. splitting/__init__.py +4 -0
  32. splitting/_config.py +56 -0
  33. splitting/_splitter.py +202 -0
  34. tests/__init__.py +0 -0
  35. tests/conftest.py +7 -0
  36. tests/integration/__init__.py +0 -0
  37. tests/integration/conftest.py +82 -0
  38. tests/integration/test_structural_end_to_end.py +219 -0
  39. tests/unit/__init__.py +0 -0
  40. tests/unit/profiling/__init__.py +0 -0
  41. tests/unit/profiling/conftest.py +81 -0
  42. tests/unit/profiling/test_boolean_profiler.py +91 -0
  43. tests/unit/profiling/test_categorical_profiler.py +182 -0
  44. tests/unit/profiling/test_correlation_profiler.py +124 -0
  45. tests/unit/profiling/test_datetime_profiler.py +133 -0
  46. tests/unit/profiling/test_missingness_profiler.py +51 -0
  47. tests/unit/profiling/test_numeric_profiler.py +212 -0
  48. tests/unit/profiling/test_target_profiler.py +44 -0
  49. tests/unit/profiling/test_text_profiler.py +61 -0
  50. tests/unit/profiling/test_type_detector.py +32 -0
  51. tests/unit/splitting/__init__.py +0 -0
  52. tests/unit/splitting/test_data_splitter.py +417 -0
  53. utils/__init__.py +0 -0
  54. utils/data_loader.py +110 -0
@@ -0,0 +1,280 @@
1
+ """
2
+ StructuralProfiler – unified Phase 1 entry point.
3
+
4
+ Execution order inside profile(df):
5
+ 1. ModalityProfiler → result.dataset (DatasetStats)
6
+ 2. MissingnessProfiler → ColumnProfile.missingness + dataset.missingness_matrix
7
+ 3. Row-missingness dist → dataset.row_distribution
8
+ 4. TypeDetector → ColumnProfile.semantic_type / type_flags / dtypes
9
+ 5. column_overrides → replace SemanticType on existing ColumnProfiles
10
+ 6. ColumnTypeProfiler → route each column to its profiler by SemanticType;
11
+ Identifier columns: skip, stats stays None
12
+ 7. target_columns → TargetProfiler; mark ColumnProfile.is_target=True
13
+ 8. Correlation → if compute_correlation=True:
14
+ a. profile_features() → dataset.feature_correlation (computed once)
15
+ b. profile_target() → dataset.target_correlations[target]
16
+ (once per declared target column)
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import math
22
+ from typing import Any
23
+
24
+ import polars as pl
25
+
26
+ from ._base import ModalityProfiler, ColumnBatchProfiler
27
+ from ._tabular import TabularProfiler
28
+ from ._categorical import CategoricalProfiler
29
+ from ._datetime_profiler import DatetimeProfiler
30
+ from ._numeric_profiler import NumericProfiler
31
+ from ._boolean_profiler import BooleanProfiler
32
+ from ._text_profiler import TextProfiler
33
+ from ._missingness_profiler import MissingnessProfiler
34
+ from ._target_profiler import TargetProfiler
35
+ from ._correlation_profiler import CorrelationProfiler
36
+ from ._type_detector import TypeDetector
37
+ from .config import (
38
+ ProfileConfig,
39
+ ColumnProfile,
40
+ StructuralProfileResult,
41
+ RowMissingnessDistribution,
42
+ SemanticType,
43
+ Modality,
44
+ )
45
+
46
+ _ROW_DROP_THRESHOLD = 0.50
47
+
48
+ # ---------------------------------------------------------------------------
49
+ # Registry: SemanticType → ColumnBatchProfiler class
50
+ #
51
+ # Stateless between profile(series, df) calls, so one instance per
52
+ # SemanticType safely handles all columns of that type in one run.
53
+ # Register profilers for further SemanticTypes here; types without an entry are skipped.
54
+ # ---------------------------------------------------------------------------
55
+ _COLUMN_PROFILER_REGISTRY: dict[SemanticType, type[ColumnBatchProfiler]] = { # type: ignore[type-arg]
56
+ SemanticType.Numeric: NumericProfiler,
57
+ SemanticType.Categorical: CategoricalProfiler,
58
+ SemanticType.Datetime: DatetimeProfiler,
59
+ SemanticType.Boolean: BooleanProfiler,
60
+ SemanticType.Text: TextProfiler,
61
+ }
62
+
63
+
64
class StructuralProfiler:
    """Unified Phase 1 entry point that assembles a ``StructuralProfileResult``.

    ``profile`` runs the individual profilers in a fixed order: modality
    stats, missingness, row-missingness distribution, type detection,
    ``column_overrides``, per-type column profiling, target profiling and,
    when ``config.compute_correlation`` is truthy, correlation analysis.
    """

    def __init__(self, config: ProfileConfig | None = None) -> None:
        """Store the configuration and select the modality profiler.

        Args:
            config: Profiling configuration. A default ``ProfileConfig()`` is
                used when the argument is omitted (or otherwise falsy).

        Raises:
            NotImplementedError: If ``config.modality`` is anything other
                than ``Modality.Tabular``.
        """
        self.config = config or ProfileConfig()

        if self.config.modality == Modality.Tabular:
            self.modality_profiler: ModalityProfiler = TabularProfiler(self.config)
        else:
            raise NotImplementedError(
                f"modality {self.config.modality} not supported yet"
            )

    # ------------------------------------------------------------------
    # Public entry point
    # ------------------------------------------------------------------

    def profile(self, data: Any) -> StructuralProfileResult:
        """Run every profiling step on ``data`` and return the combined result.

        Args:
            data: The dataset to profile. Must be a ``polars.DataFrame``.

        Returns:
            A ``StructuralProfileResult`` whose ``dataset``, ``columns`` and
            ``targets`` members have been filled in by the steps below.

        Raises:
            TypeError: If ``data`` is not a Polars DataFrame.
        """
        if not isinstance(data, pl.DataFrame):
            raise TypeError(
                f"StructuralProfiler expects a Polars DataFrame, "
                f"got {type(data).__name__}."
            )

        result = StructuralProfileResult()

        active_cols = [c for c in data.columns if c not in self.config.exclude_columns]
        # Resolved once and shared by steps 7 and 8b. The ``or ()`` guard keeps
        # step 8b from iterating over an unset (None) target list.
        targets = [
            t for t in (self.config.target_columns or ()) if t in data.columns
        ]

        # ── 1. Modality profiler ─────────────────────────────────────────
        # Replaces the default result.dataset; must run before any later
        # step writes attributes onto result.dataset.
        result.dataset = self.modality_profiler.profile(data)

        # ── 2. Missingness pre-pass ──────────────────────────────────────
        self._attach_missingness(data, active_cols, result)

        # ── 3. Row-missingness distribution ─────────────────────────────
        result.dataset.row_distribution = self._compute_row_distribution(
            df=data,
            cols=active_cols,
            n_rows=data.height,
            overrides=self.config.column_overrides,
        )

        # ── 4. Type detection ────────────────────────────────────────────
        self._attach_types(data, active_cols, result)

        # ── 5. Apply column_overrides ────────────────────────────────────
        self._apply_overrides(result)

        # ── 6. Per-column profiling routed by SemanticType ───────────────
        self._profile_columns(data, active_cols, result)

        # ── 7. Target columns ────────────────────────────────────────────
        self._profile_targets(data, targets, result)

        # ── 8. Correlation ───────────────────────────────────────────────
        if self.config.compute_correlation:
            self._profile_correlations(data, active_cols, targets, result)

        return result

    # ------------------------------------------------------------------
    # Step helpers
    # ------------------------------------------------------------------

    def _attach_missingness(
        self,
        data: pl.DataFrame,
        active_cols: list[str],
        result: StructuralProfileResult,
    ) -> None:
        """Step 2: create ColumnProfile entries and attach missingness stats."""
        missingness_result = MissingnessProfiler(config=self.config).profile(
            data, columns=active_cols
        )
        for col_name in missingness_result.analysed_columns:
            # setdefault creates the entry here; later steps reuse the same
            # object through the same setdefault pattern.
            cp = result.columns.setdefault(col_name, ColumnProfile(name=col_name))
            cp.missingness = missingness_result.columns.get(col_name)

        if missingness_result.correlation_matrix:
            result.dataset.missingness_matrix = missingness_result.correlation_matrix

    def _attach_types(
        self,
        data: pl.DataFrame,
        active_cols: list[str],
        result: StructuralProfileResult,
    ) -> None:
        """Step 4: copy TypeDetector output onto each ColumnProfile."""
        type_info = TypeDetector(columns=active_cols).detect(data)
        for col_name, info in type_info.items():
            # Returns the ColumnProfile created in step 2 when one exists, so
            # missingness and type info land on the same object.
            cp = result.columns.setdefault(col_name, ColumnProfile(name=col_name))
            cp.semantic_type = info.semantic_type
            cp.type_flags = list(info.flags)
            cp.original_dtype = info.original_dtype
            cp.inferred_dtype = info.inferred_dtype

    def _apply_overrides(self, result: StructuralProfileResult) -> None:
        """Step 5: replace the detected SemanticType with configured overrides.

        Overrides naming a column that has no ColumnProfile (excluded or
        non-existent columns) are silently ignored.
        """
        for col_name, override_type in self.config.column_overrides.items():
            if col_name in result.columns:
                result.columns[col_name].semantic_type = override_type

    def _profile_columns(
        self,
        data: pl.DataFrame,
        active_cols: list[str],
        result: StructuralProfileResult,
    ) -> None:
        """Step 6: batch columns by SemanticType and run each registered profiler once.

        Identifier columns and columns without a semantic type are skipped, so
        their ``stats`` are left as they were. A failing profiler is logged and
        skipped rather than aborting the whole run.
        """
        import logging  # local import: the module's import block is left untouched

        logger = logging.getLogger(__name__)

        type_to_cols: dict[SemanticType, list[str]] = {}
        for col_name in active_cols:
            cp = result.columns.get(col_name)
            if cp is None or cp.semantic_type is None:
                continue
            if cp.semantic_type == SemanticType.Identifier:
                continue
            type_to_cols.setdefault(cp.semantic_type, []).append(col_name)

        for sem_type, cols in type_to_cols.items():
            profiler_cls = _COLUMN_PROFILER_REGISTRY.get(sem_type)  # type: ignore[arg-type]
            if profiler_cls is None:
                # No profiler registered for this SemanticType.
                continue
            profiler = profiler_cls(config=self.config)
            try:
                batch = profiler.profile(data, columns=cols)
                for col_name in batch.analysed_columns:
                    if col_name in result.columns:
                        result.columns[col_name].stats = batch.columns.get(col_name)
            except Exception:
                # Best-effort by design: one failing profiler must not abort
                # the run, but the failure is recorded instead of vanishing.
                logger.warning(
                    "%s failed for %s columns %s; stats for these columns may be missing",
                    profiler_cls.__name__,
                    sem_type,
                    cols,
                    exc_info=True,
                )

    def _profile_targets(
        self,
        data: pl.DataFrame,
        targets: list[str],
        result: StructuralProfileResult,
    ) -> None:
        """Step 7: run TargetProfiler per target and flag its ColumnProfile.

        The target analysis is stored in ``result.targets``; ``cp.stats`` is
        not touched here, so whatever step 6 stored is kept.
        """
        for target in targets:
            target_result = TargetProfiler(
                target_column=target,
                config=self.config,
            ).profile(data)
            result.targets[target] = target_result

            # Reuses the existing ColumnProfile, or creates one when the
            # target has no entry yet.
            cp = result.columns.setdefault(target, ColumnProfile(name=target))
            cp.is_target = True

    def _profile_correlations(
        self,
        data: pl.DataFrame,
        active_cols: list[str],
        targets: list[str],
        result: StructuralProfileResult,
    ) -> None:
        """Step 8: feature-feature correlation once, then one pass per target."""
        # Column lists are resolved from the post-override SemanticType.
        numeric_cols = self._columns_of_type(result, active_cols, SemanticType.Numeric)
        categorical_cols = self._columns_of_type(
            result, active_cols, SemanticType.Categorical
        )

        corr_profiler = CorrelationProfiler(
            numeric_columns=numeric_cols,
            categorical_columns=categorical_cols,
            config=self.config,
        )

        # 8a. Feature-feature matrices: computed once, independent of targets.
        feature_corr = corr_profiler.profile_features(data, numeric_cols)
        result.dataset.feature_correlation = feature_corr

        # 8b. Per-target analysis reuses feature_corr instead of recomputing it.
        # NOTE(review): the original comment says profile_target shallow-copies
        # feature_corr; that is not visible from this file, so confirm.
        for target in targets:
            result.dataset.target_correlations[target] = corr_profiler.profile_target(
                data, feature_corr, numeric_cols, categorical_cols, target
            )

    @staticmethod
    def _columns_of_type(
        result: StructuralProfileResult,
        active_cols: list[str],
        sem_type: SemanticType,
    ) -> list[str]:
        """Return the active columns whose ColumnProfile has ``sem_type``."""
        return [
            c
            for c in active_cols
            if result.columns.get(c) and result.columns[c].semantic_type == sem_type
        ]

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _compute_row_distribution(
        df: pl.DataFrame,
        cols: list[str],
        n_rows: int,
        overrides: dict[str, SemanticType],
    ) -> RowMissingnessDistribution:
        """Bucket rows by how many of ``cols`` are effectively missing.

        Args:
            df: Frame to inspect.
            cols: Columns that take part in the per-row count.
            n_rows: Row count used as the denominator of every fraction.
            overrides: Per-column SemanticType overrides, forwarded to
                ``_sentinel_eligible``.

        Returns:
            A ``RowMissingnessDistribution``. It is returned with its default
            values when ``n_rows`` is 0 or ``cols`` is empty.
        """
        from ._missingness_profiler import (
            _sentinel_eligible,
            _inf_eligible,
            _SENTINEL_STRINGS,
        )

        dist = RowMissingnessDistribution()
        if n_rows == 0 or not cols:
            return dist

        n_cols = len(cols)
        per_col_exprs = []

        for col_name in cols:
            dtype = df[col_name].dtype
            override = overrides.get(col_name)
            null_e = pl.col(col_name).is_null()

            if _sentinel_eligible(dtype, override):
                # String-like treatment: null, blank after stripping, or an
                # upper-cased match against the sentinel strings.
                eff = (
                    null_e
                    | (pl.col(col_name).str.strip_chars() == "")
                    | pl.col(col_name).str.to_uppercase().is_in(list(_SENTINEL_STRINGS))
                )
            elif _inf_eligible(dtype):
                # Float-like treatment: NaN and +/-inf count as missing too.
                eff = (
                    null_e | pl.col(col_name).is_nan() | pl.col(col_name).is_infinite()
                )
            else:
                eff = null_e

            per_col_exprs.append(eff.cast(pl.Int8).alias(col_name))

        # Sum the 0/1 indicators across columns: one missing-count per row.
        row_missing: pl.Series = df.select(per_col_exprs).select(
            pl.sum_horizontal(pl.all()).alias("row_missing")
        )["row_missing"]

        # Rows missing at least this many columns are drop candidates.
        half_threshold = math.ceil(n_cols * _ROW_DROP_THRESHOLD)

        dist.pct_zero_missing = float((row_missing == 0).sum()) / n_rows
        dist.pct_one_to_two = (
            float(((row_missing >= 1) & (row_missing <= 2)).sum()) / n_rows
        )
        dist.pct_three_to_five = (
            float(((row_missing >= 3) & (row_missing <= 5)).sum()) / n_rows
        )
        dist.pct_over_five = float((row_missing > 5).sum()) / n_rows
        dist.drop_candidate_row_count = int((row_missing >= half_threshold).sum())
        dist.pct_over_half_missing = dist.drop_candidate_row_count / n_rows

        return dist
splitting/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ from ._config import FoldResult, SplitResult
2
+ from ._splitter import DataSplitter
3
+
4
+ __all__ = ["DataSplitter", "SplitResult", "FoldResult"]
splitting/_config.py ADDED
@@ -0,0 +1,56 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+
5
+ import polars as pl
6
+
7
+
8
@dataclass
class SplitResult:
    """
    Outcome of a single train/test split.

    Attributes
    ----------
    train : pl.DataFrame
        Rows assigned to training.
    test : pl.DataFrame
        Rows held out for testing.
    train_size : int
        Row count of ``train``.
    test_size : int
        Row count of ``test``.
    train_ratio : float
        Share of all rows that went to training (0.0–1.0).
    test_ratio : float
        Share of all rows that went to testing (0.0–1.0).
    """

    # Partitions
    train: pl.DataFrame
    test: pl.DataFrame

    # Row counts
    train_size: int
    test_size: int

    # Fractions of the source frame
    train_ratio: float
    test_ratio: float
33
+
34
+
35
@dataclass
class FoldResult:
    """
    One fold of a cross-validation run.

    Attributes
    ----------
    train : pl.DataFrame
        Rows used for training in this fold.
    val : pl.DataFrame
        Rows used for validation in this fold.
    fold_index : int
        Position of this fold within the CV run, counted from zero.
    train_size : int
        Row count of ``train``.
    val_size : int
        Row count of ``val``.
    """

    # Partitions
    train: pl.DataFrame
    val: pl.DataFrame

    # Bookkeeping
    fold_index: int
    train_size: int
    val_size: int
splitting/_splitter.py ADDED
@@ -0,0 +1,202 @@
1
+ """
2
+ DataSplitter: constructor, random_split, time_split, and kfold implementation.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import math
8
+ from typing import Any, List, Optional
9
+
10
+ import polars as pl
11
+ from sklearn.model_selection import KFold, ShuffleSplit, StratifiedKFold, StratifiedShuffleSplit
12
+
13
+ from ._config import FoldResult, SplitResult
14
+
15
# Sentinel that tells "stratify was not passed" apart from an explicit False/None.
_UNSET = object()


class DataSplitter:
    """
    Splits a Polars DataFrame into train/test or cross-validation folds.

    Parameters
    ----------
    df : pl.DataFrame
        Source data. Must be non-empty.
    target : str, optional
        Name of the target column. Required for stratified splits.
    random_seed : int, optional
        Seed forwarded to sklearn splitters for reproducibility.

    Raises
    ------
    TypeError
        If ``df`` is not a polars DataFrame.
    ValueError
        If ``df`` is empty, or ``target`` is given but is not a column of ``df``.
    """

    def __init__(
        self,
        df: pl.DataFrame,
        target: Optional[str] = None,
        random_seed: Optional[int] = None,
    ) -> None:
        if not isinstance(df, pl.DataFrame):
            raise TypeError(f"df must be a polars DataFrame, got {type(df).__name__}")
        if df.is_empty():
            raise ValueError("df must not be empty")
        if target is not None and target not in df.columns:
            raise ValueError(f"target column '{target}' not found in df")

        self._df = df
        self._target = target
        self._random_seed = random_seed

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _resolve_stratify(self, stratify: Any) -> bool:
        """Turn the ``stratify`` argument into a concrete bool.

        An omitted argument (the ``_UNSET`` sentinel) defaults to True when a
        target column was configured and False otherwise.

        Raises
        ------
        ValueError
            If stratification is requested but no target column was given.
        """
        if stratify is _UNSET:
            return self._target is not None
        if stratify and self._target is None:
            raise ValueError(
                "stratify=True requires a target column; "
                "pass target= when constructing DataSplitter"
            )
        return bool(stratify)

    @staticmethod
    def _build_split_result(
        train_df: pl.DataFrame, test_df: pl.DataFrame, total: int
    ) -> SplitResult:
        """Wrap two partitions in a SplitResult; ratios are relative to ``total``."""
        n_train = len(train_df)
        n_test = len(test_df)
        return SplitResult(
            train=train_df,
            test=test_df,
            train_size=n_train,
            test_size=n_test,
            train_ratio=n_train / total,
            test_ratio=n_test / total,
        )

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def random_split(self, test_size: float, stratify=_UNSET) -> SplitResult:
        """
        Return a single randomised train/test split.

        Parameters
        ----------
        test_size : float
            Fraction of rows to reserve for the test set (0 < test_size < 1).
            The value is forwarded unchanged to the sklearn splitter.
        stratify : bool, optional
            Whether to stratify on the target column.
            Defaults to True when a target was provided, False otherwise.

        Returns
        -------
        SplitResult

        Raises
        ------
        ValueError
            If ``stratify`` is truthy but no target column was configured.
        """
        if self._resolve_stratify(stratify):
            splitter = StratifiedShuffleSplit(
                n_splits=1, test_size=test_size, random_state=self._random_seed
            )
            y = self._df[self._target].to_numpy()
            train_idx, test_idx = next(splitter.split(self._df, y))
        else:
            splitter = ShuffleSplit(
                n_splits=1, test_size=test_size, random_state=self._random_seed
            )
            train_idx, test_idx = next(splitter.split(self._df))

        return self._build_split_result(
            self._df[train_idx], self._df[test_idx], len(self._df)
        )

    def time_split(
        self,
        time_column: str,
        test_size: Optional[float] = None,
        cutoff: Optional[Any] = None,
    ) -> SplitResult:
        """
        Return a chronological train/test split with no temporal leakage.

        The DataFrame is sorted ascending by ``time_column`` before splitting.
        ``cutoff`` takes priority over ``test_size`` when both are supplied.

        Parameters
        ----------
        time_column : str
            Column to sort by. Must exist in the DataFrame.
        test_size : float, optional
            Fraction of rows (from the end of the sorted series) to use as
            the test set. ``floor(len(df) * test_size)`` rows are taken.
            Must lie in ``[0, 1]`` when it is the value actually used.
        cutoff : scalar, optional
            Threshold value. Rows where ``time_column >= cutoff`` go to
            test; all earlier rows go to train.

        Returns
        -------
        SplitResult

        Raises
        ------
        ValueError
            If ``time_column`` is missing, if neither ``test_size`` nor
            ``cutoff`` is given, or if ``test_size`` is used and falls
            outside ``[0, 1]``.
        """
        if time_column not in self._df.columns:
            raise ValueError(f"time_column '{time_column}' not found in df")
        if test_size is None and cutoff is None:
            raise ValueError("Either test_size or cutoff must be provided")
        # Outside [0, 1] the slice bound below goes negative or past the end,
        # which would silently yield wrong partitions, so reject it up front.
        if cutoff is None and not 0 <= test_size <= 1:
            raise ValueError(
                f"test_size must be between 0 and 1 (inclusive), got {test_size!r}"
            )

        sorted_df = self._df.sort(time_column)
        total = len(sorted_df)

        if cutoff is not None:
            # NOTE(review): rows with a null time_column presumably match
            # neither filter and so land in neither partition. Confirm against
            # Polars filter semantics.
            train_df = sorted_df.filter(pl.col(time_column) < cutoff)
            test_df = sorted_df.filter(pl.col(time_column) >= cutoff)
        else:
            n_test = math.floor(total * test_size)
            n_train = total - n_test
            train_df = sorted_df[:n_train]
            test_df = sorted_df[n_train:]

        return self._build_split_result(train_df, test_df, total)

    def kfold(self, k: int, stratify=_UNSET) -> List[FoldResult]:
        """
        Return a list of ``k`` cross-validation folds.

        Parameters
        ----------
        k : int
            Number of folds, forwarded to the sklearn splitter as ``n_splits``.
        stratify : bool, optional
            Whether to stratify on the target column.
            Defaults to True when a target was provided, False otherwise.

        Returns
        -------
        list[FoldResult]
            Exactly ``k`` folds with zero-based ``fold_index``.

        Raises
        ------
        ValueError
            If ``stratify`` is truthy but no target column was configured.
        """
        if self._resolve_stratify(stratify):
            folder = StratifiedKFold(
                n_splits=k, shuffle=True, random_state=self._random_seed
            )
            y = self._df[self._target].to_numpy()
            splits = folder.split(self._df, y)
        else:
            folder = KFold(
                n_splits=k, shuffle=True, random_state=self._random_seed
            )
            splits = folder.split(self._df)

        folds: List[FoldResult] = []
        for fold_index, (train_idx, val_idx) in enumerate(splits):
            train_df = self._df[train_idx]
            val_df = self._df[val_idx]
            folds.append(
                FoldResult(
                    train=train_df,
                    val=val_df,
                    fold_index=fold_index,
                    train_size=len(train_df),
                    val_size=len(val_df),
                )
            )

        return folds
tests/__init__.py ADDED
File without changes
tests/conftest.py ADDED
@@ -0,0 +1,7 @@
1
+ import numpy as np
2
+ import pytest
3
+
4
+
5
@pytest.fixture(scope="session")
def rng():
    """Session-scoped NumPy random generator created with seed 42."""
    seed = 42
    return np.random.default_rng(seed)
File without changes
@@ -0,0 +1,82 @@
1
+
2
+ import polars as pl
3
+ import pytest
4
+
5
+
6
@pytest.fixture(scope="session")
def override_df():
    """60-row frame: Float64 ``score`` (0.0 .. 59.0) and Utf8 ``category`` cycling A/B/C."""
    n_rows = 60
    scores = [float(i) for i in range(n_rows)]
    categories = ["A", "B", "C"] * (n_rows // 3)

    columns = {
        "score": pl.Series(scores, dtype=pl.Float64),
        "category": pl.Series(categories, dtype=pl.Utf8),
    }
    return pl.DataFrame(columns)
15
+
16
+
17
@pytest.fixture(scope="session")
def target_df(rng):
    """100-row frame: a normally distributed ``feature`` and an alternating pos/neg ``label``."""
    n_rows = 100
    feature_values = rng.normal(0, 1, size=n_rows).tolist()
    label_values = ["pos", "neg"] * (n_rows // 2)

    columns = {
        "feature": pl.Series(feature_values, dtype=pl.Float64),
        "label": pl.Series(label_values, dtype=pl.Utf8),
    }
    return pl.DataFrame(columns)
28
+
29
+
30
@pytest.fixture(scope="session")
def empty_df():
    """Zero-row frame with a Float64 column ``x`` and a Utf8 column ``y``."""
    x_col = pl.Series([], dtype=pl.Float64)
    y_col = pl.Series([], dtype=pl.Utf8)
    return pl.DataFrame({"x": x_col, "y": y_col})
38
+
39
+
40
@pytest.fixture(scope="session")
def text_df():
    """200-row frame with one Utf8 ``review`` column of long, multi-word sentences."""
    n_rows = 200
    topics = ["science", "art", "history", "technology", "nature", "music"]

    texts = []
    for row in range(n_rows):
        # Cycle through the topics so each row mentions one of them.
        topic = topics[row % len(topics)]
        texts.append(
            f"A detailed description covering the topic of {topic} "
            f"with multiple words that comfortably exceed the free-text threshold in row {row}"
        )

    review = pl.Series(texts, dtype=pl.Utf8)
    return pl.DataFrame({"review": review})
50
+
51
+
52
@pytest.fixture(scope="session")
def mixed_df(rng):
    """300-row frame mixing integer, float, nullable float, string, boolean and date columns.

    The random draws happen in a fixed order (age, income noise, salary,
    null mask, is_active, joined offsets) so the data stays the same for a
    given generator state.
    """
    from datetime import date, timedelta

    n_rows = 300

    # Numeric columns: income is a noisy linear function of age.
    age = rng.integers(18, 75, size=n_rows)
    income = age * 1200 + rng.normal(0, 5000, size=n_rows)

    # Salary: rows where the uniform draw falls below 0.10 are set to None.
    raw_salary = rng.normal(50_000, 15_000, size=n_rows).tolist()
    null_mask = rng.random(n_rows) < 0.10
    salary = [
        None if is_null else value
        for value, is_null in zip(raw_salary, null_mask)
    ]

    # String columns: a 5-value cycling country and a unique name per row.
    country_choices = ["US", "UK", "CA", "AU", "DE"]
    n_choices = len(country_choices)
    country = [country_choices[row % n_choices] for row in range(n_rows)]
    names = [f"person_{row}" for row in range(n_rows)]

    is_active = [bool(flag) for flag in rng.integers(0, 2, size=n_rows)]

    # Dates spread over the 1460 days starting at 2020-01-01.
    base = date(2020, 1, 1)
    day_offsets = rng.integers(0, 1460, size=n_rows)
    joined = [base + timedelta(days=int(offset)) for offset in day_offsets]

    columns = {
        "age": pl.Series(age.tolist(), dtype=pl.Int64),
        "income": pl.Series(income.tolist(), dtype=pl.Float64),
        "salary": pl.Series(salary, dtype=pl.Float64),
        "country": pl.Series(country, dtype=pl.Utf8),
        "name": pl.Series(names, dtype=pl.Utf8),
        "is_active": pl.Series(is_active, dtype=pl.Boolean),
        "joined": pl.Series(joined, dtype=pl.Date),
    }
    return pl.DataFrame(columns)