dataforge-ml 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataforge_ml-0.1.0.dist-info/METADATA +34 -0
- dataforge_ml-0.1.0.dist-info/RECORD +54 -0
- dataforge_ml-0.1.0.dist-info/WHEEL +5 -0
- dataforge_ml-0.1.0.dist-info/licenses/LICENSE +21 -0
- dataforge_ml-0.1.0.dist-info/top_level.txt +5 -0
- models/__init__.py +0 -0
- models/_data_structure.py +7 -0
- models/_data_types.py +12 -0
- profiling/__init__.py +35 -0
- profiling/_base.py +101 -0
- profiling/_boolean_config.py +37 -0
- profiling/_boolean_profiler.py +191 -0
- profiling/_categorical.py +315 -0
- profiling/_categorical_config.py +87 -0
- profiling/_correlation_config.py +225 -0
- profiling/_correlation_profiler.py +544 -0
- profiling/_datetime_config.py +98 -0
- profiling/_datetime_profiler.py +406 -0
- profiling/_missingness_config.py +137 -0
- profiling/_missingness_profiler.py +252 -0
- profiling/_numeric_config.py +116 -0
- profiling/_numeric_profiler.py +403 -0
- profiling/_tabular.py +249 -0
- profiling/_target_config.py +74 -0
- profiling/_target_profiler.py +156 -0
- profiling/_text_config.py +40 -0
- profiling/_text_profiler.py +194 -0
- profiling/_type_detector.py +463 -0
- profiling/config.py +236 -0
- profiling/structural.py +280 -0
- splitting/__init__.py +4 -0
- splitting/_config.py +56 -0
- splitting/_splitter.py +202 -0
- tests/__init__.py +0 -0
- tests/conftest.py +7 -0
- tests/integration/__init__.py +0 -0
- tests/integration/conftest.py +82 -0
- tests/integration/test_structural_end_to_end.py +219 -0
- tests/unit/__init__.py +0 -0
- tests/unit/profiling/__init__.py +0 -0
- tests/unit/profiling/conftest.py +81 -0
- tests/unit/profiling/test_boolean_profiler.py +91 -0
- tests/unit/profiling/test_categorical_profiler.py +182 -0
- tests/unit/profiling/test_correlation_profiler.py +124 -0
- tests/unit/profiling/test_datetime_profiler.py +133 -0
- tests/unit/profiling/test_missingness_profiler.py +51 -0
- tests/unit/profiling/test_numeric_profiler.py +212 -0
- tests/unit/profiling/test_target_profiler.py +44 -0
- tests/unit/profiling/test_text_profiler.py +61 -0
- tests/unit/profiling/test_type_detector.py +32 -0
- tests/unit/splitting/__init__.py +0 -0
- tests/unit/splitting/test_data_splitter.py +417 -0
- utils/__init__.py +0 -0
- utils/data_loader.py +110 -0
|
@@ -0,0 +1,463 @@
|
|
|
1
|
+
"""
|
|
2
|
+
TypeDetector – selective data-type detection for Polars DataFrames.
|
|
3
|
+
|
|
4
|
+
Detection is opt-in: only columns listed in ProfileConfig.type_detection_columns
|
|
5
|
+
are examined. The detector never mutates the original frame.
|
|
6
|
+
|
|
7
|
+
Detection pipeline (in order, applied per column):
|
|
8
|
+
1. Numeric coercion – object/Utf8 columns → try cast to Float64
|
|
9
|
+
2. Datetime coercion – object/Utf8 columns with date-like names/values
|
|
10
|
+
3. Boolean candidate – int {0,1} or string {"true","false","yes","no",…}
|
|
11
|
+
4. Encoded category – int with low cardinality (<15 unique values)
|
|
12
|
+
5. Identifier column – unique ratio > 99 %
|
|
13
|
+
6. Sequential index – integer column == range(0,n) or range(1,n+1)
|
|
14
|
+
7. Numeric kind – continuous vs discrete for confirmed numeric cols
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
from typing import TYPE_CHECKING
|
|
20
|
+
|
|
21
|
+
import polars as pl
|
|
22
|
+
|
|
23
|
+
from .config import ColumnTypeInfo, NumericKind, TypeFlag, SemanticType
|
|
24
|
+
from ..models._data_types import _INT_DTYPES, _NUMERIC_DTYPES
|
|
25
|
+
|
|
26
|
+
if TYPE_CHECKING:
|
|
27
|
+
pass
|
|
28
|
+
|
|
29
|
+
# ---------------------------------------------------------------------------
# Threshold constants (tuning knobs for the detection pipeline below)
# ---------------------------------------------------------------------------
_NUMERIC_COERCE_THRESHOLD: float = 0.95  # ≥95 % of values survive cast to Float64 → reclassify as numeric
_DATETIME_COERCE_THRESHOLD: float = 0.80  # ≥80 % of values parse as datetimes → reclassify as datetime
_ENCODED_CATEGORY_MAX_UNIQUE: int = 15  # int column with fewer unique values → label-encoded category
_ENCODED_CATEGORY_MAX_RATIO: float = 0.05  # unique/valid ratio below this → label-encoded category
_IDENTIFIER_UNIQUE_RATIO: float = 0.99  # >99 % unique values → identifier column
_IDENTIFIER_MAX_MEDIAN_LENGTH: int = 40  # string columns with longer median length are not treated as identifiers
_DISCRETE_NUNIQUE_THRESHOLD: int = 20  # float column with <20 unique values → discrete numeric

# Free-text detection thresholds (see _check_free_text)
_FREE_TEXT_AVG_WORDS: int = 5  # median word count above which → Text
_FREE_TEXT_MEDIAN_CHARS: int = 35  # median character length above which → Text
_FREE_TEXT_P90_CHARS: int = 60  # 90th-percentile length used with the unique-ratio test
_FREE_TEXT_MIN_UNIQUE_RATIO: float = 0.40  # minimum unique ratio for the p90-length test


# Common boolean string values (lowercased) used by the boolean-candidate check
_BOOL_STRING_SET = {"true", "false", "yes", "no", "t", "f", "0", "1"}
|
|
46
|
+
|
|
47
|
+
class TypeDetector:
    """
    Run selective type-detection on a Polars DataFrame.

    Each configured column is pushed through a fixed pipeline:

    1. Numeric coercion  – string columns → try cast to Float64
    2. Datetime coercion – string columns → try parse to Datetime
    3. Boolean candidate – int {0,1} or common true/false strings
    4. Encoded category  – low-cardinality integers
    5. Identifier        – unique ratio above ``_IDENTIFIER_UNIQUE_RATIO``
    6. Sequential index  – identifier equal to range(0, n) or range(1, n + 1)
    7. Numeric kind      – continuous vs discrete for confirmed numeric cols

    The input frame is never mutated; coerced values live only in local
    working copies.

    Parameters
    ----------
    columns : list[str]
        The columns to inspect (already validated against the frame).
    """

    def __init__(self, columns: list[str]) -> None:
        self._columns = columns

    # ------------------------------------------------------------------
    # Public
    # ------------------------------------------------------------------

    def detect(self, df: pl.DataFrame) -> dict[str, ColumnTypeInfo]:
        """
        Return a mapping of column name → ColumnTypeInfo for every
        column in ``self._columns``.
        """
        results: dict[str, ColumnTypeInfo] = {}
        n_rows = df.height

        for col_name in self._columns:
            series = df[col_name]
            original_dtype = str(series.dtype)
            info = ColumnTypeInfo(
                column=col_name,
                original_dtype=original_dtype,
                inferred_dtype=original_dtype,
            )

            # Work with a copy that may be re-assigned after coercion
            working = series

            # Steps 1 & 2: coercion applies only to string columns.
            if series.dtype == pl.Utf8 or series.dtype == pl.String:
                coerced, flag = self._try_numeric_coerce(series, n_rows)
                if coerced is not None:
                    info.inferred_dtype = str(coerced.dtype)
                    info.flags.append(flag)  # type: ignore[arg-type]
                    working = coerced

                    # Strings that parsed as whole numbers may actually be
                    # label-encoded categories ("1", "2", "3", …).
                    self._check_coerced_encoded_category(working, info, n_rows)
                else:
                    coerced_dt, flag_dt = self._try_datetime_coerce(
                        series, col_name, n_rows
                    )
                    if coerced_dt is not None:
                        info.inferred_dtype = str(coerced_dt.dtype)
                        info.flags.append(flag_dt)  # type: ignore[arg-type]
                        working = coerced_dt

                        # Datetime columns take no further checks.
                        info.semantic_type = SemanticType.Datetime
                        results[col_name] = info
                        continue

            # Step 3: Boolean candidate
            self._check_boolean_candidate(working, info)

            # Work only on numeric-ish columns for the remaining checks
            if working.dtype in _NUMERIC_DTYPES:
                # Steps 4 & 5: encoded category and identifier — integers only.
                # Continuous floats have high cardinality by nature and are never
                # identifiers; restricting these checks prevents false Identifier
                # classification of genuine numeric features.
                if working.dtype in _INT_DTYPES:
                    self._check_encoded_category(working, info, n_rows)
                    self._check_identifier(working, info, n_rows)

                # Step 6: Sequential index (ints, plus floats that may hold
                # whole-number indices such as 0.0, 1.0, 2.0, …)
                if working.dtype in _INT_DTYPES or working.dtype in (
                    pl.Float32,
                    pl.Float64,
                ):
                    self._check_sequential_index(working, info, n_rows)

                # Step 7: Numeric kind (skip for identifiers / sequential indices)
                if not any(
                    info.has_flag(f)
                    for f in (
                        TypeFlag.IdentifierColumn,
                        TypeFlag.SequentialIndex,
                        TypeFlag.FloatSequentialIndex,
                    )
                ):
                    self._classify_numeric_kind(working, info)

            elif working.dtype == pl.Utf8 or working.dtype == pl.String:
                # String identifier check
                self._check_identifier(working, info, n_rows)

                self._check_free_text(working, info, n_rows)

            info.semantic_type = self._derive_semantic_type(
                info,
                working,
                n_rows,
            )

            results[col_name] = info

        return results

    @staticmethod
    def _derive_semantic_type(
        info: ColumnTypeInfo,
        working: pl.Series,
        n_rows: int,
    ) -> SemanticType:
        """
        Collapse the collected flags (plus the working dtype) into a single
        SemanticType. Precedence: Identifier > Boolean > Datetime > Encoded
        category > Text > Categorical-string > Numeric > Categorical.
        """
        if TypeFlag.IdentifierColumn in info.flags:
            return SemanticType.Identifier

        if TypeFlag.BooleanCandidate in info.flags:
            return SemanticType.Boolean

        # isinstance fallback covers parametrised Datetime dtypes (time unit /
        # time zone) that may not compare equal to the bare pl.Datetime class.
        is_native_datetime = working.dtype in (
            pl.Date,
            pl.Datetime,
            pl.Duration,
            pl.Time,
        ) or (hasattr(pl, "Datetime") and isinstance(working.dtype, pl.Datetime))

        if is_native_datetime or TypeFlag.DatetimeCoerced in info.flags:
            return SemanticType.Datetime

        if TypeFlag.EncodedCategory in info.flags:
            return SemanticType.Categorical

        if working.dtype in (pl.Utf8, pl.String):
            if TypeFlag.FreeTextCandidate in info.flags:
                return SemanticType.Text

            return SemanticType.Categorical

        if working.dtype in _NUMERIC_DTYPES:
            return SemanticType.Numeric

        # Fallback for anything unrecognised (e.g. nested or binary dtypes).
        return SemanticType.Categorical

    # ------------------------------------------------------------------
    # Step 1: Numeric coercion
    # ------------------------------------------------------------------

    @staticmethod
    def _try_numeric_coerce(
        series: pl.Series, n_rows: int
    ) -> tuple[pl.Series, TypeFlag] | tuple[None, None]:
        """
        Attempt to cast a Utf8 series to Float64.
        Returns the cast series + flag if success rate ≥ threshold, else (None, None).
        """
        if n_rows == 0:
            return None, None
        try:
            cast = series.cast(pl.Float64, strict=False)
        except Exception:
            # strict=False should not raise; this is a defensive net for
            # unexpected dtype states.
            return None, None

        non_null = cast.drop_nulls().len()
        # Compare against original non-null count to avoid penalising
        # columns that were already sparse
        original_non_null = series.drop_nulls().len()
        denom = original_non_null if original_non_null > 0 else n_rows
        success_rate = non_null / denom
        if success_rate >= _NUMERIC_COERCE_THRESHOLD:
            return cast, TypeFlag.NumericCoerced
        return None, None

    # ------------------------------------------------------------------
    # Step 2: Datetime coercion
    # ------------------------------------------------------------------

    @staticmethod
    def _try_datetime_coerce(
        series: pl.Series, col_name: str, n_rows: int
    ) -> tuple[pl.Series, TypeFlag] | tuple[None, None]:
        """
        Attempt datetime coercion on a string series.
        Returns the parsed series + flag if success rate ≥ threshold.

        Note: ``col_name`` is currently unused — parsing is attempted for
        every string column regardless of its name; the parameter is kept
        for future name-based heuristics.
        """
        if n_rows == 0:
            return None, None

        try:
            cast = series.str.to_datetime(strict=False)
        except Exception:
            # to_datetime can raise on inference failure even with strict=False.
            return None, None

        original_non_null = series.drop_nulls().len()
        denom = original_non_null if original_non_null > 0 else n_rows
        non_null = cast.drop_nulls().len()
        if denom > 0 and non_null / denom >= _DATETIME_COERCE_THRESHOLD:
            return cast, TypeFlag.DatetimeCoerced
        return None, None

    # ------------------------------------------------------------------
    # Step 3: Boolean candidate
    # ------------------------------------------------------------------

    @staticmethod
    def _check_boolean_candidate(series: pl.Series, info: ColumnTypeInfo) -> None:
        """Flag native booleans, int columns ⊆ {0, 1}, and string columns
        whose lowercased unique values all look boolean."""
        if series.dtype == pl.Boolean:
            info.flags.append(TypeFlag.BooleanCandidate)
            return

        if series.dtype in _INT_DTYPES:
            unique_vals = set(series.drop_nulls().unique().to_list())
            if unique_vals <= {0, 1}:
                info.flags.append(TypeFlag.BooleanCandidate)
        elif series.dtype in (pl.Utf8, pl.String):
            unique_vals_lower = {
                str(v).lower() for v in series.drop_nulls().unique().to_list()
            }
            # Empty set would trivially be a subset — require at least one value.
            if unique_vals_lower and unique_vals_lower <= _BOOL_STRING_SET:
                info.flags.append(TypeFlag.BooleanCandidate)

    # ------------------------------------------------------------------
    # Step 4: Encoded category
    # ------------------------------------------------------------------

    @staticmethod
    def _check_coerced_encoded_category(
        series: pl.Series, info: ColumnTypeInfo, n_rows: int
    ) -> None:
        """
        Post-coercion low-cardinality check for Float64 series that originated
        as strings. Sets EncodedCategory only when:
        1. All non-null values are whole numbers (the strings were integer-like)
        2. Cardinality passes the same absolute + ratio thresholds as the
           native-integer encoded-category check.

        This distinguishes "1","2","3" (label-encoded → Categorical) from
        "1.5","2.7","3.1" (genuine floats → Numeric).
        """
        if TypeFlag.BooleanCandidate in info.flags:
            return

        valid = series.drop_nulls()
        n_valid = valid.len()
        if n_valid == 0:
            return

        # Whole-number check: reject true floats like 1.5, 2.7
        try:
            as_int = valid.cast(pl.Int64, strict=False)
        except Exception:
            return
        if not (valid == as_int.cast(pl.Float64, strict=False)).all():
            return

        # Cardinality thresholds (same logic as _check_encoded_category)
        n_unique = valid.n_unique()
        min_val = int(valid.min())
        max_val = int(valid.max())
        range_span = (max_val - min_val) + 1
        # A "tight" sequence (e.g. codes 1..k with no gaps) gets a looser
        # absolute limit, since gap-free integer codes strongly suggest labels.
        is_tight_sequence = range_span == n_unique
        absolute_limit = 50 if is_tight_sequence else _ENCODED_CATEGORY_MAX_UNIQUE
        absolute_ok = 0 < n_unique < absolute_limit
        ratio_ok = (n_unique / n_valid) < _ENCODED_CATEGORY_MAX_RATIO

        if (absolute_ok and ratio_ok) or (is_tight_sequence and absolute_ok):
            info.flags.append(TypeFlag.EncodedCategory)

    @staticmethod
    def _check_encoded_category(
        series: pl.Series, info: ColumnTypeInfo, n_rows: int
    ) -> None:
        """Flag native integer columns whose cardinality is low enough to be
        a label-encoded categorical feature."""
        # Skip if already flagged as boolean candidate (subset of {0,1})
        if TypeFlag.BooleanCandidate in info.flags:
            return

        if not series.dtype.is_integer():
            return

        valid_series = series.drop_nulls()
        n_valid = valid_series.len()

        if n_valid == 0:
            return

        n_unique = valid_series.n_unique()

        min_val = valid_series.min()
        max_val = valid_series.max()
        range_span = (max_val - min_val) + 1

        # Gap-free code range (e.g. 0..k-1) is strong evidence of label encoding.
        is_tight_sequence = range_span == n_unique

        absolute_limit = 50 if is_tight_sequence else _ENCODED_CATEGORY_MAX_UNIQUE

        absolute_ok = 0 < n_unique < absolute_limit
        ratio_ok = (n_unique / n_valid) < _ENCODED_CATEGORY_MAX_RATIO

        if (absolute_ok and ratio_ok) or (is_tight_sequence and absolute_ok):
            info.flags.append(TypeFlag.EncodedCategory)

    # ------------------------------------------------------------------
    # Step 5: Identifier column
    # ------------------------------------------------------------------

    @staticmethod
    def _check_identifier(series: pl.Series, info: ColumnTypeInfo, n_rows: int) -> None:
        """Flag a column whose unique ratio exceeds _IDENTIFIER_UNIQUE_RATIO.

        String columns additionally require a median length of at most
        _IDENTIFIER_MAX_MEDIAN_LENGTH, so long free-text values with high
        uniqueness are not misclassified as identifiers.
        """
        if n_rows == 0:
            return

        n_unique = series.n_unique()
        if n_unique / n_rows <= _IDENTIFIER_UNIQUE_RATIO:
            return

        if series.dtype in (pl.Utf8, pl.String):
            lengths = series.drop_nulls().str.len_chars()
            if lengths.len() == 0:
                return

            median_length = lengths.median()

            if (
                median_length is not None
                and median_length > _IDENTIFIER_MAX_MEDIAN_LENGTH
            ):
                return

        info.flags.append(TypeFlag.IdentifierColumn)

    # ------------------------------------------------------------------
    # Step 6: Sequential index
    # ------------------------------------------------------------------

    @staticmethod
    def _check_sequential_index(
        series: pl.Series, info: ColumnTypeInfo, n_rows: int
    ) -> None:
        """Flag an identifier column whose values are exactly range(0, n)
        or range(1, n + 1) — i.e. a row index that leaked into the data.

        Float columns are accepted only when every value is a whole number;
        they get the distinct FloatSequentialIndex flag.
        """
        if n_rows == 0 or TypeFlag.IdentifierColumn not in info.flags:
            # Only bother if already flagged as identifier
            return

        is_float = series.dtype in (pl.Float32, pl.Float64)
        is_int = series.dtype in _INT_DTYPES

        if not (is_float or is_int):
            return

        s_min = series.min()
        s_max = series.max()

        # BUG FIX: the original condition mixed `and`/`or`
        # — `(s_min != 0 and s_max != n_rows - 1) or (s_min != 1 or s_max != n_rows)` —
        # so a 0-based index (min=0, max=n-1) always bailed out on the
        # `s_min != 1` clause and SequentialIndex was never flagged for
        # range(0, n). Accept exactly the two documented layouts instead.
        is_zero_based = s_min == 0 and s_max == n_rows - 1
        is_one_based = s_min == 1 and s_max == n_rows
        if not (is_zero_based or is_one_based):
            return

        if is_float:
            series_int = series.cast(pl.Int64)
            if not (series == series_int).all():
                # Fractional values present → not a whole-number index.
                return
            series_to_check = series_int
        else:
            series_to_check = series

        # min/max bounds plus full uniqueness ⇒ the values are exactly the range.
        if series_to_check.n_unique() == n_rows:
            flag = (
                TypeFlag.FloatSequentialIndex if is_float else TypeFlag.SequentialIndex
            )
            info.flags.append(flag)

    # ------------------------------------------------------------------
    # Step 7: Numeric kind
    # ------------------------------------------------------------------

    @staticmethod
    def _classify_numeric_kind(series: pl.Series, info: ColumnTypeInfo) -> None:
        """Classify a numeric column as Discrete (all ints, or low-cardinality
        floats) or Continuous."""
        # Skip if it's an encoded category (treat as categorical, not numeric)
        if TypeFlag.EncodedCategory in info.flags:
            return

        n_unique = series.drop_nulls().n_unique()

        if series.dtype in _INT_DTYPES:
            info.numeric_kind = NumericKind.Discrete
        elif n_unique < _DISCRETE_NUNIQUE_THRESHOLD:
            info.numeric_kind = NumericKind.Discrete
        else:
            info.numeric_kind = NumericKind.Continuous

    @staticmethod
    def _check_free_text(
        series: pl.Series,
        info: ColumnTypeInfo,
        n_rows: int,
    ) -> None:
        """Flag string columns that look like free text rather than categories.

        Three escalating tests, any one of which is sufficient:
        1. median character length > _FREE_TEXT_MEDIAN_CHARS
        2. median word count > _FREE_TEXT_AVG_WORDS
        3. p90 character length > _FREE_TEXT_P90_CHARS combined with a
           unique ratio above _FREE_TEXT_MIN_UNIQUE_RATIO
        """
        non_null = series.drop_nulls()
        if non_null.len() == 0:
            return

        char_lengths = non_null.str.len_chars()
        median_chars = float(char_lengths.median() or 0.0)

        if median_chars > _FREE_TEXT_MEDIAN_CHARS:
            info.flags.append(TypeFlag.FreeTextCandidate)
            return

        # Word count ≈ number of whitespace runs + 1.
        space_counts = non_null.str.count_matches(r"\s+")
        median_words = float(space_counts.median() or 0.0) + 1.0

        if median_words > _FREE_TEXT_AVG_WORDS:
            info.flags.append(TypeFlag.FreeTextCandidate)
            return

        p90_chars = float(char_lengths.quantile(0.9) or 0.0)

        unique_ratio = series.n_unique() / n_rows if n_rows > 0 else 0.0

        if (
            p90_chars > _FREE_TEXT_P90_CHARS
            and unique_ratio > _FREE_TEXT_MIN_UNIQUE_RATIO
        ):
            info.flags.append(TypeFlag.FreeTextCandidate)
profiling/config.py
ADDED
|
@@ -0,0 +1,236 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Configuration and result dataclasses for the profiling phase — Phase 1 redesign.
|
|
3
|
+
|
|
4
|
+
ProfileConfig controls the structural profiler's behaviour.
|
|
5
|
+
Stats dataclasses hold per-column and dataset-level profiling results.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import json
|
|
11
|
+
from dataclasses import dataclass, field
|
|
12
|
+
from enum import StrEnum
|
|
13
|
+
from typing import Optional, Union
|
|
14
|
+
|
|
15
|
+
from ._missingness_config import (
|
|
16
|
+
ColumnMissingnessProfile,
|
|
17
|
+
)
|
|
18
|
+
from ._correlation_config import (
|
|
19
|
+
CorrelationProfileResult,
|
|
20
|
+
)
|
|
21
|
+
from ._categorical_config import (
|
|
22
|
+
CategoricalStats,
|
|
23
|
+
)
|
|
24
|
+
from ._numeric_config import (
|
|
25
|
+
NumericStats,
|
|
26
|
+
)
|
|
27
|
+
from ._datetime_config import (
|
|
28
|
+
DatetimeStats,
|
|
29
|
+
)
|
|
30
|
+
from ._boolean_config import BooleanStats
|
|
31
|
+
from ._text_config import TextStats
|
|
32
|
+
from ._target_config import TargetProfileResult
|
|
33
|
+
|
|
34
|
+
# ---------------------------------------------------------------------------
|
|
35
|
+
# Core enums
|
|
36
|
+
# ---------------------------------------------------------------------------
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class SemanticType(StrEnum):
|
|
40
|
+
Numeric = "numeric"
|
|
41
|
+
Categorical = "categorical"
|
|
42
|
+
Datetime = "datetime"
|
|
43
|
+
Boolean = "boolean"
|
|
44
|
+
Text = "text"
|
|
45
|
+
Identifier = "identifier"
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class Modality(StrEnum):
|
|
49
|
+
Tabular = "tabular"
|
|
50
|
+
# Placeholder slots for future modalities — no implementation yet.
|
|
51
|
+
# Image = "image"
|
|
52
|
+
# TimeSeries = "time_series"
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
# ---------------------------------------------------------------------------
|
|
56
|
+
# Type-detection enums — kept for TypeDetector compatibility
|
|
57
|
+
# ---------------------------------------------------------------------------
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class NumericKind(StrEnum):
|
|
61
|
+
Continuous = "continuous"
|
|
62
|
+
Discrete = "discrete"
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
class TypeFlag(StrEnum):
|
|
66
|
+
NumericCoerced = "numeric_coerced"
|
|
67
|
+
DatetimeCoerced = "datetime_coerced"
|
|
68
|
+
BooleanCandidate = "boolean_candidate"
|
|
69
|
+
EncodedCategory = "encoded_category"
|
|
70
|
+
IdentifierColumn = "identifier_column"
|
|
71
|
+
SequentialIndex = "sequential_index"
|
|
72
|
+
FloatSequentialIndex = "float_sequential_index"
|
|
73
|
+
FreeTextCandidate = "free_text_candidate"
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
# ---------------------------------------------------------------------------
|
|
77
|
+
# Column and dataset result containers
|
|
78
|
+
# ---------------------------------------------------------------------------
|
|
79
|
+
|
|
80
|
+
# Union of every per-type stats payload; ColumnProfile.stats holds one of these.
AnyStats = Union[NumericStats, CategoricalStats, DatetimeStats, BooleanStats, TextStats]
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
@dataclass
class ColumnProfile:
    """Aggregated profiling output for one column."""

    # Column name in the source frame.
    name: str = ""
    # Final semantic classification, if one was derived.
    semantic_type: Optional[SemanticType] = None
    # Flags raised by type detection.
    type_flags: list[TypeFlag] = field(default_factory=list)
    # Dtype as seen in the original frame (string form).
    original_dtype: str = ""
    # Dtype after any coercion (string form).
    inferred_dtype: str = ""
    # Per-column missingness summary, when computed.
    missingness: Optional[ColumnMissingnessProfile] = None
    # Whether this column is one of the configured targets.
    is_target: bool = False
    # Type-specific statistics payload (one of the AnyStats variants).
    stats: Optional[AnyStats] = None
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
@dataclass
class RowMissingnessDistribution:
    """
    Dataset-level summary of per-row missing-value counts.
    Computed by StructuralProfiler over the full active column set.
    """

    # Share of rows with no missing values at all.
    pct_zero_missing: float = 0.0
    # Share of rows with 1–2 missing values.
    pct_one_to_two: float = 0.0
    # Share of rows with 3–5 missing values.
    pct_three_to_five: float = 0.0
    # Share of rows with more than 5 missing values.
    pct_over_five: float = 0.0
    # Share of rows missing over half their values.
    pct_over_half_missing: float = 0.0
    # Number of rows flagged as drop candidates.
    drop_candidate_row_count: int = 0
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
@dataclass
class MemoryBreakdown:
    """Per-column memory usage in bytes, with ranking helpers."""

    # Mapping of column name → estimated bytes consumed.
    column_bytes: dict[str, int] = field(default_factory=dict)

    @property
    def sorted_by_usage(self) -> list[tuple[str, int]]:
        """All columns as (name, bytes) pairs, heaviest first."""
        pairs = list(self.column_bytes.items())
        pairs.sort(key=lambda pair: pair[1], reverse=True)
        return pairs

    def top_consumers(self, n: int = 10) -> list[tuple[str, int]]:
        """The *n* heaviest columns (default 10), heaviest first."""
        return self.sorted_by_usage[:n]
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
@dataclass
class DatasetStats:
    """Dataset-level profiling results (shape, memory, duplicates, missingness,
    and optional correlation payloads)."""

    # Data modality; only Tabular is produced today.
    modality: Modality = Modality.Tabular
    # Number of rows / columns in the profiled frame.
    row_count: int = 0
    column_count: int = 0
    # Estimated total memory footprint in bytes.
    memory_bytes: int = 0
    # Per-column memory breakdown, when computed.
    memory_breakdown: Optional[MemoryBreakdown] = None
    # Duplicate-row count and its ratio over row_count.
    duplicate_count: int = 0
    duplicate_ratio: float = 0.0
    # Fraction of all cells that are missing.
    overall_sparsity: float = 0.0
    # True when chunked processing was used.
    was_chunked: bool = False
    # Pairwise column missingness co-occurrence, when computed.
    missingness_matrix: Optional[dict[str, dict[str, float]]] = None
    # Distribution of per-row missing counts.
    row_distribution: RowMissingnessDistribution = field(
        default_factory=RowMissingnessDistribution
    )

    # Feature-feature correlation result, when compute_correlation is on.
    feature_correlation: Optional[CorrelationProfileResult] = None

    # Feature-target correlation results keyed by target column name.
    target_correlations: dict[str, CorrelationProfileResult] = field(
        default_factory=dict,
    )
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
@dataclass
class StructuralProfileResult:
    """Top-level container returned by the structural profiler."""

    # Per-column profiles keyed by column name.
    columns: dict[str, ColumnProfile] = field(default_factory=dict)
    # Dataset-level statistics.
    dataset: DatasetStats = field(default_factory=DatasetStats)
    # Target-specific profiles keyed by target column name.
    targets: dict[str, TargetProfileResult] = field(default_factory=dict)
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
# ---------------------------------------------------------------------------
|
|
153
|
+
# ProfileConfig — clean break from per-profiler column lists
|
|
154
|
+
# ---------------------------------------------------------------------------
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
@dataclass
class ProfileConfig:
    """
    Controls the structural profiler's behaviour.

    Parameters
    ----------
    modality : Modality
        Data modality. Currently only Tabular is implemented.
    target_columns : list[str]
        Names of the label/target columns, if any.
    column_overrides : dict[str, SemanticType]
        Explicit semantic type assignments that override auto-detection.
    exclude_columns : list[str]
        Columns to skip entirely during profiling.
    compute_correlation : bool
        Whether to compute the feature-feature correlation matrix.
    correlation_target_column : Optional[str]
        Column used for feature-target correlation metrics.
    memory_threshold_mb : float
        Memory (MB) above which chunked processing activates.
    chunk_size : int
        Rows per chunk when chunked processing is active.
    """

    modality: Modality = Modality.Tabular
    target_columns: list[str] = field(default_factory=list)
    column_overrides: dict[str, SemanticType] = field(default_factory=dict)
    exclude_columns: list[str] = field(default_factory=list)
    compute_correlation: bool = False
    correlation_target_column: Optional[str] = None
    memory_threshold_mb: float = 500.0
    chunk_size: int = 100_000

    def to_dict(self) -> dict:
        """Serialize to a JSON-compatible dict (enum values become plain strings)."""
        return {
            "modality": str(self.modality),
            "target_columns": list(self.target_columns),
            "column_overrides": {k: str(v) for k, v in self.column_overrides.items()},
            "exclude_columns": list(self.exclude_columns),
            "compute_correlation": self.compute_correlation,
            "correlation_target_column": self.correlation_target_column,
            "memory_threshold_mb": self.memory_threshold_mb,
            "chunk_size": self.chunk_size,
        }

    @classmethod
    def from_dict(cls, data: dict) -> ProfileConfig:
        """
        Inverse of to_dict(); missing keys fall back to field defaults.

        Bug fix: this previously passed ``target_column=`` (singular), which
        is not a field of this dataclass — every call raised TypeError — and
        the ``target_columns`` list written by to_dict() was silently dropped.
        """
        return cls(
            modality=Modality(data.get("modality", Modality.Tabular)),
            target_columns=list(data.get("target_columns", [])),
            column_overrides={
                k: SemanticType(v) for k, v in data.get("column_overrides", {}).items()
            },
            exclude_columns=list(data.get("exclude_columns", [])),
            compute_correlation=bool(data.get("compute_correlation", False)),
            correlation_target_column=data.get("correlation_target_column"),
            memory_threshold_mb=float(data.get("memory_threshold_mb", 500.0)),
            chunk_size=int(data.get("chunk_size", 100_000)),
        )

    def to_json(self) -> str:
        """JSON string form of to_dict()."""
        return json.dumps(self.to_dict())

    @classmethod
    def from_json(cls, json_str: str) -> ProfileConfig:
        """Parse a JSON string produced by to_json()."""
        return cls.from_dict(json.loads(json_str))
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
@dataclass
class ColumnTypeInfo:
    """Type-detection verdict for a single column."""

    # Column name in the source frame.
    column: str
    # Dtype as seen in the original frame (string form).
    original_dtype: str
    # Dtype after any successful coercion; equals original_dtype otherwise.
    inferred_dtype: str
    # Continuous/discrete classification for numeric columns, when computed.
    numeric_kind: Optional[NumericKind] = None
    # Detection flags raised for this column, in pipeline order.
    flags: list[TypeFlag] = field(default_factory=list)
    # Final semantic classification, when derived.
    semantic_type: Optional[SemanticType] = None

    def has_flag(self, flag: TypeFlag) -> bool:
        """Return True when *flag* was raised for this column."""
        return flag in self.flags
|