dataforge-ml 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. dataforge_ml-0.1.0.dist-info/METADATA +34 -0
  2. dataforge_ml-0.1.0.dist-info/RECORD +54 -0
  3. dataforge_ml-0.1.0.dist-info/WHEEL +5 -0
  4. dataforge_ml-0.1.0.dist-info/licenses/LICENSE +21 -0
  5. dataforge_ml-0.1.0.dist-info/top_level.txt +5 -0
  6. models/__init__.py +0 -0
  7. models/_data_structure.py +7 -0
  8. models/_data_types.py +12 -0
  9. profiling/__init__.py +35 -0
  10. profiling/_base.py +101 -0
  11. profiling/_boolean_config.py +37 -0
  12. profiling/_boolean_profiler.py +191 -0
  13. profiling/_categorical.py +315 -0
  14. profiling/_categorical_config.py +87 -0
  15. profiling/_correlation_config.py +225 -0
  16. profiling/_correlation_profiler.py +544 -0
  17. profiling/_datetime_config.py +98 -0
  18. profiling/_datetime_profiler.py +406 -0
  19. profiling/_missingness_config.py +137 -0
  20. profiling/_missingness_profiler.py +252 -0
  21. profiling/_numeric_config.py +116 -0
  22. profiling/_numeric_profiler.py +403 -0
  23. profiling/_tabular.py +249 -0
  24. profiling/_target_config.py +74 -0
  25. profiling/_target_profiler.py +156 -0
  26. profiling/_text_config.py +40 -0
  27. profiling/_text_profiler.py +194 -0
  28. profiling/_type_detector.py +463 -0
  29. profiling/config.py +236 -0
  30. profiling/structural.py +280 -0
  31. splitting/__init__.py +4 -0
  32. splitting/_config.py +56 -0
  33. splitting/_splitter.py +202 -0
  34. tests/__init__.py +0 -0
  35. tests/conftest.py +7 -0
  36. tests/integration/__init__.py +0 -0
  37. tests/integration/conftest.py +82 -0
  38. tests/integration/test_structural_end_to_end.py +219 -0
  39. tests/unit/__init__.py +0 -0
  40. tests/unit/profiling/__init__.py +0 -0
  41. tests/unit/profiling/conftest.py +81 -0
  42. tests/unit/profiling/test_boolean_profiler.py +91 -0
  43. tests/unit/profiling/test_categorical_profiler.py +182 -0
  44. tests/unit/profiling/test_correlation_profiler.py +124 -0
  45. tests/unit/profiling/test_datetime_profiler.py +133 -0
  46. tests/unit/profiling/test_missingness_profiler.py +51 -0
  47. tests/unit/profiling/test_numeric_profiler.py +212 -0
  48. tests/unit/profiling/test_target_profiler.py +44 -0
  49. tests/unit/profiling/test_text_profiler.py +61 -0
  50. tests/unit/profiling/test_type_detector.py +32 -0
  51. tests/unit/splitting/__init__.py +0 -0
  52. tests/unit/splitting/test_data_splitter.py +417 -0
  53. utils/__init__.py +0 -0
  54. utils/data_loader.py +110 -0
@@ -0,0 +1,463 @@
1
+ """
2
+ TypeDetector – selective data-type detection for Polars DataFrames.
3
+
4
+ Detection is opt-in: only columns listed in ProfileConfig.type_detection_columns
5
+ are examined. The detector never mutates the original frame.
6
+
7
+ Detection pipeline (in order, applied per column):
8
+ 1. Numeric coercion – object/Utf8 columns → try cast to Float64
9
+ 2. Datetime coercion – object/Utf8 columns with date-like names/values
10
+ 3. Boolean candidate – int {0,1} or string {"true","false","yes","no",…}
11
+ 4. Encoded category – int with low cardinality (<15 unique values)
12
+ 5. Identifier column – unique ratio > 99 %
13
+ 6. Sequential index – integer column == range(0,n) or range(1,n+1)
14
+ 7. Numeric kind – continuous vs discrete for confirmed numeric cols
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ from typing import TYPE_CHECKING
20
+
21
+ import polars as pl
22
+
23
+ from .config import ColumnTypeInfo, NumericKind, TypeFlag, SemanticType
24
+ from ..models._data_types import _INT_DTYPES, _NUMERIC_DTYPES
25
+
26
+ if TYPE_CHECKING:
27
+ pass
28
+
29
# Threshold constants
_NUMERIC_COERCE_THRESHOLD = 0.95  # ≥95 % non-null after cast → reclassify
_DATETIME_COERCE_THRESHOLD = 0.80  # ≥80 % non-null after cast → reclassify
_ENCODED_CATEGORY_MAX_UNIQUE = 15  # int with fewer unique values → label-encoded
_ENCODED_CATEGORY_MAX_RATIO = 0.05  # unique/non-null ratio below this → label-encoded
_IDENTIFIER_UNIQUE_RATIO = 0.99  # >99 % unique → identifier
_IDENTIFIER_MAX_MEDIAN_LENGTH = 40  # string cols with median length above this are not flagged as identifiers
_DISCRETE_NUNIQUE_THRESHOLD = 20  # numeric with <20 unique values → discrete

# Free-text heuristics (see _check_free_text)
_FREE_TEXT_AVG_WORDS: int = 5  # median word count above which → Text (name says "avg", code uses median)
_FREE_TEXT_MEDIAN_CHARS: int = 35  # median char length above which → Text
_FREE_TEXT_P90_CHARS: int = 60  # 90th-percentile char length, combined with unique ratio
_FREE_TEXT_MIN_UNIQUE_RATIO: float = 0.40  # minimum unique ratio for the p90 rule


# Common boolean string values (lowercased)
_BOOL_STRING_SET = {"true", "false", "yes", "no", "t", "f", "0", "1"}
46
+
47
class TypeDetector:
    """
    Run selective type-detection on a Polars DataFrame.

    Each configured column is passed through the pipeline described in
    the module docstring. The original frame is never mutated: coerced
    series are held only in local variables during detection.

    Parameters
    ----------
    columns : list[str]
        The columns to inspect (already validated against the frame).
    """

    def __init__(self, columns: list[str]) -> None:
        # Detection is opt-in: only these columns are examined by detect().
        self._columns = columns
59
+
60
+ # ------------------------------------------------------------------
61
+ # Public
62
+ # ------------------------------------------------------------------
63
+
64
    def detect(self, df: pl.DataFrame) -> dict[str, ColumnTypeInfo]:
        """
        Return a mapping of column name → ColumnTypeInfo for every
        column in self._columns.

        Pipeline per column: string columns first get a numeric-coercion
        attempt, falling back to a datetime-coercion attempt (a successful
        datetime parse short-circuits the remaining checks). Then boolean,
        encoded-category, identifier, sequential-index and numeric-kind
        checks run on the (possibly coerced) working series, and a final
        semantic type is derived from the accumulated flags.
        """
        results: dict[str, ColumnTypeInfo] = {}
        n_rows = df.height

        for col_name in self._columns:
            series = df[col_name]
            original_dtype = str(series.dtype)
            info = ColumnTypeInfo(
                column=col_name,
                original_dtype=original_dtype,
                inferred_dtype=original_dtype,
            )

            # Work with a copy that may be re-assigned after coercion
            working = series

            # 1 & 2: Coercion for string columns
            if series.dtype == pl.Utf8 or series.dtype == pl.String:
                coerced, flag = self._try_numeric_coerce(series, n_rows)
                if coerced is not None:
                    info.inferred_dtype = str(coerced.dtype)
                    info.flags.append(flag)  # type: ignore[arg-type]
                    working = coerced

                    # Strings like "1","2","3" may be label-encoded categories.
                    self._check_coerced_encoded_category(working, info, n_rows)
                else:
                    # Numeric coercion failed — try datetime parsing instead.
                    coerced_dt, flag_dt = self._try_datetime_coerce(
                        series, col_name, n_rows
                    )
                    if coerced_dt is not None:
                        info.inferred_dtype = str(coerced_dt.dtype)
                        info.flags.append(flag_dt)  # type: ignore[arg-type]
                        working = coerced_dt

                        # A parsed datetime column needs no further checks.
                        info.semantic_type = SemanticType.Datetime
                        results[col_name] = info
                        continue

            # 3: Boolean candidate
            self._check_boolean_candidate(working, info)

            # Work only on numeric-ish columns for the remaining checks
            if working.dtype in _NUMERIC_DTYPES:
                # 4 & 5: Encoded category and identifier checks — integers only.
                # Continuous floats have high cardinality by nature and are never
                # identifiers; restricting these checks prevents false Identifier
                # classification of genuine numeric features.
                if working.dtype in _INT_DTYPES:
                    self._check_encoded_category(working, info, n_rows)
                    self._check_identifier(working, info, n_rows)

                # 6: Sequential index (ints, or floats holding whole numbers)
                if working.dtype in _INT_DTYPES or working.dtype in (
                    pl.Float32,
                    pl.Float64,
                ):
                    self._check_sequential_index(working, info, n_rows)

                # 7: Numeric kind (skip for identifiers / sequential indices)
                if not any(
                    info.has_flag(f)
                    for f in (
                        TypeFlag.IdentifierColumn,
                        TypeFlag.SequentialIndex,
                        TypeFlag.FloatSequentialIndex,
                    )
                ):
                    self._classify_numeric_kind(working, info)

            elif working.dtype == pl.Utf8 or working.dtype == pl.String:
                # String identifier check
                self._check_identifier(working, info, n_rows)

                self._check_free_text(working, info, n_rows)

            info.semantic_type = self._derive_semantic_type(
                info,
                working,
                n_rows,
            )

            results[col_name] = info

        return results
152
+
153
+ @staticmethod
154
+ def _derive_semantic_type(
155
+ info: ColumnTypeInfo,
156
+ working: pl.Series,
157
+ n_rows: int,
158
+ ) -> SemanticType:
159
+ if TypeFlag.IdentifierColumn in info.flags:
160
+ return SemanticType.Identifier
161
+
162
+ if TypeFlag.BooleanCandidate in info.flags:
163
+ return SemanticType.Boolean
164
+
165
+ is_native_datetime = working.dtype in (
166
+ pl.Date,
167
+ pl.Datetime,
168
+ pl.Duration,
169
+ pl.Time,
170
+ ) or (hasattr(pl, "Datetime") and isinstance(working.dtype, pl.Datetime))
171
+
172
+ if is_native_datetime or TypeFlag.DatetimeCoerced in info.flags:
173
+ return SemanticType.Datetime
174
+
175
+ if TypeFlag.EncodedCategory in info.flags:
176
+ return SemanticType.Categorical
177
+
178
+ if working.dtype in (pl.Utf8, pl.String):
179
+ if TypeFlag.FreeTextCandidate in info.flags:
180
+ return SemanticType.Text
181
+
182
+ return SemanticType.Categorical
183
+
184
+ if working.dtype in _NUMERIC_DTYPES:
185
+ return SemanticType.Numeric
186
+
187
+ return SemanticType.Categorical
188
+
189
+ # ------------------------------------------------------------------
190
+ # Step 1: Numeric coercion
191
+ # ------------------------------------------------------------------
192
+
193
+ @staticmethod
194
+ def _try_numeric_coerce(
195
+ series: pl.Series, n_rows: int
196
+ ) -> tuple[pl.Series, TypeFlag] | tuple[None, None]:
197
+ """
198
+ Attempt to cast a Utf8 series to Float64.
199
+ Returns the cast series + flag if success rate ≥ threshold, else (None, None).
200
+ """
201
+ if n_rows == 0:
202
+ return None, None
203
+ try:
204
+ cast = series.cast(pl.Float64, strict=False)
205
+ except Exception:
206
+ return None, None
207
+
208
+ non_null = cast.drop_nulls().len()
209
+ # Compare against original non-null count to avoid penalising
210
+ # columns that were already sparse
211
+ original_non_null = series.drop_nulls().len()
212
+ denom = original_non_null if original_non_null > 0 else n_rows
213
+ success_rate = non_null / denom
214
+ if success_rate >= _NUMERIC_COERCE_THRESHOLD:
215
+ return cast, TypeFlag.NumericCoerced
216
+ return None, None
217
+
218
+ # ------------------------------------------------------------------
219
+ # Step 2: Datetime coercion
220
+ # ------------------------------------------------------------------
221
+
222
    @staticmethod
    def _try_datetime_coerce(
        series: pl.Series, col_name: str, n_rows: int
    ) -> tuple[pl.Series, TypeFlag] | tuple[None, None]:
        """
        Attempt to parse a Utf8/String series as datetimes.

        Returns the parsed series + DatetimeCoerced flag when the parse
        success rate (measured against the original non-null count, or
        n_rows for an all-null column) meets _DATETIME_COERCE_THRESHOLD;
        otherwise (None, None).

        NOTE(review): `col_name` is accepted but unused — a previous
        docstring claimed name-based ("date-like name") gating that is
        not implemented here. Confirm whether name heuristics should
        apply before relying on them.
        """
        if n_rows == 0:
            return None, None

        try:
            # strict=False turns unparseable entries into nulls instead of raising
            cast = series.str.to_datetime(strict=False)
        except Exception:
            return None, None

        original_non_null = series.drop_nulls().len()
        denom = original_non_null if original_non_null > 0 else n_rows
        non_null = cast.drop_nulls().len()
        if denom > 0 and non_null / denom >= _DATETIME_COERCE_THRESHOLD:
            return cast, TypeFlag.DatetimeCoerced
        return None, None
244
+
245
+ # ------------------------------------------------------------------
246
+ # Step 3: Boolean candidate
247
+ # ------------------------------------------------------------------
248
+
249
+ @staticmethod
250
+ def _check_boolean_candidate(series: pl.Series, info: ColumnTypeInfo) -> None:
251
+ if series.dtype == pl.Boolean:
252
+ info.flags.append(TypeFlag.BooleanCandidate)
253
+ return
254
+
255
+ if series.dtype in _INT_DTYPES:
256
+ unique_vals = set(series.drop_nulls().unique().to_list())
257
+ if unique_vals <= {0, 1}:
258
+ info.flags.append(TypeFlag.BooleanCandidate)
259
+ elif series.dtype in (pl.Utf8, pl.String):
260
+ unique_vals_lower = {
261
+ str(v).lower() for v in series.drop_nulls().unique().to_list()
262
+ }
263
+ if unique_vals_lower and unique_vals_lower <= _BOOL_STRING_SET:
264
+ info.flags.append(TypeFlag.BooleanCandidate)
265
+
266
+ # ------------------------------------------------------------------
267
+ # Step 4: Encoded category
268
+ # ------------------------------------------------------------------
269
+
270
    @staticmethod
    def _check_coerced_encoded_category(
        series: pl.Series, info: ColumnTypeInfo, n_rows: int
    ) -> None:
        """
        Post-coercion low-cardinality check for Float64 series that originated
        as strings. Sets EncodedCategory only when:
        1. All non-null values are whole numbers (the strings were integer-like)
        2. Cardinality passes the same absolute + ratio thresholds as the
           native-integer encoded-category check.

        This distinguishes "1","2","3" (label-encoded → Categorical) from
        "1.5","2.7","3.1" (genuine floats → Numeric).

        NOTE: the cardinality logic below intentionally mirrors
        _check_encoded_category; keep the two in sync when tuning thresholds.
        """
        # Boolean candidates ({0,1}) are a special case handled elsewhere.
        if TypeFlag.BooleanCandidate in info.flags:
            return

        valid = series.drop_nulls()
        n_valid = valid.len()
        if n_valid == 0:
            return

        # Whole-number check: reject true floats like 1.5, 2.7
        try:
            as_int = valid.cast(pl.Int64, strict=False)
        except Exception:
            return
        # Round-trip through Int64 → Float64 must reproduce every value exactly.
        if not (valid == as_int.cast(pl.Float64, strict=False)).all():
            return

        # Cardinality thresholds (same logic as _check_encoded_category)
        n_unique = valid.n_unique()
        min_val = int(valid.min())
        max_val = int(valid.max())
        range_span = (max_val - min_val) + 1
        # "Tight" means the distinct values exactly fill [min, max].
        is_tight_sequence = range_span == n_unique
        absolute_limit = 50 if is_tight_sequence else _ENCODED_CATEGORY_MAX_UNIQUE
        absolute_ok = 0 < n_unique < absolute_limit
        ratio_ok = (n_unique / n_valid) < _ENCODED_CATEGORY_MAX_RATIO

        if (absolute_ok and ratio_ok) or (is_tight_sequence and absolute_ok):
            info.flags.append(TypeFlag.EncodedCategory)
312
+
313
+ @staticmethod
314
+ def _check_encoded_category(
315
+ series: pl.Series, info: ColumnTypeInfo, n_rows: int
316
+ ) -> None:
317
+ # Skip if already flagged as boolean candidate (subset of {0,1})
318
+ if TypeFlag.BooleanCandidate in info.flags:
319
+ return
320
+
321
+ if not series.dtype.is_integer():
322
+ return
323
+
324
+ valid_series = series.drop_nulls()
325
+ n_valid = valid_series.len()
326
+
327
+ if n_valid == 0:
328
+ return
329
+
330
+ n_unique = valid_series.n_unique()
331
+
332
+ min_val = valid_series.min()
333
+ max_val = valid_series.max()
334
+ range_span = (max_val - min_val) + 1
335
+
336
+ is_tight_sequence = range_span == n_unique
337
+
338
+ absolute_limit = 50 if is_tight_sequence else _ENCODED_CATEGORY_MAX_UNIQUE
339
+
340
+ absolute_ok = 0 < n_unique < absolute_limit
341
+ ratio_ok = (n_unique / n_valid) < _ENCODED_CATEGORY_MAX_RATIO
342
+
343
+ if (absolute_ok and ratio_ok) or (is_tight_sequence and absolute_ok):
344
+ info.flags.append(TypeFlag.EncodedCategory)
345
+
346
+ # ------------------------------------------------------------------
347
+ # Step 5: Identifier column
348
+ # ------------------------------------------------------------------
349
+
350
+ @staticmethod
351
+ def _check_identifier(series: pl.Series, info: ColumnTypeInfo, n_rows: int) -> None:
352
+ if n_rows == 0:
353
+ return
354
+
355
+ n_unique = series.n_unique()
356
+ if n_unique / n_rows <= _IDENTIFIER_UNIQUE_RATIO:
357
+ return
358
+
359
+ if series.dtype in (pl.Utf8, pl.String):
360
+ lengths = series.drop_nulls().str.len_chars()
361
+ if lengths.len() == 0:
362
+ return
363
+
364
+ median_length = lengths.median()
365
+
366
+ if (
367
+ median_length is not None
368
+ and median_length > _IDENTIFIER_MAX_MEDIAN_LENGTH
369
+ ):
370
+ return
371
+
372
+ info.flags.append(TypeFlag.IdentifierColumn)
373
+
374
+ # ------------------------------------------------------------------
375
+ # Step 6: Sequential index
376
+ # ------------------------------------------------------------------
377
+
378
+ @staticmethod
379
+ def _check_sequential_index(
380
+ series: pl.Series, info: ColumnTypeInfo, n_rows: int
381
+ ) -> None:
382
+ if n_rows == 0 or TypeFlag.IdentifierColumn not in info.flags:
383
+ # Only bother if already flagged as identifier
384
+ return
385
+
386
+ is_float = series.dtype in (pl.Float32, pl.Float64)
387
+ is_int = series.dtype in _INT_DTYPES
388
+
389
+ if not (is_float or is_int):
390
+ return
391
+
392
+ s_min = series.min()
393
+ s_max = series.max()
394
+
395
+ if (s_min != 0 and s_max != n_rows - 1) or (s_min != 1 or s_max != n_rows):
396
+ return
397
+
398
+ if is_float:
399
+ series_int = series.cast(pl.Int64)
400
+ if not (series == series_int).all():
401
+ return
402
+ series_to_check = series_int
403
+ else:
404
+ series_to_check = series
405
+
406
+ if series_to_check.n_unique() == n_rows:
407
+ flag = (
408
+ TypeFlag.FloatSequentialIndex if is_float else TypeFlag.SequentialIndex
409
+ )
410
+ info.flags.append(flag)
411
+
412
+ # ------------------------------------------------------------------
413
+ # Step 7: Numeric kind
414
+ # ------------------------------------------------------------------
415
+
416
+ @staticmethod
417
+ def _classify_numeric_kind(series: pl.Series, info: ColumnTypeInfo) -> None:
418
+ # Skip if it's an encoded category (treat as categorical, not numeric)
419
+ if TypeFlag.EncodedCategory in info.flags:
420
+ return
421
+
422
+ n_unique = series.drop_nulls().n_unique()
423
+
424
+ if series.dtype in _INT_DTYPES:
425
+ info.numeric_kind = NumericKind.Discrete
426
+ elif n_unique < _DISCRETE_NUNIQUE_THRESHOLD:
427
+ info.numeric_kind = NumericKind.Discrete
428
+ else:
429
+ info.numeric_kind = NumericKind.Continuous
430
+
431
+ @staticmethod
432
+ def _check_free_text(
433
+ series: pl.Series,
434
+ info: ColumnTypeInfo,
435
+ n_rows: int,
436
+ ) -> None:
437
+ non_null = series.drop_nulls()
438
+ if non_null.len() == 0:
439
+ return
440
+
441
+ char_lengths = non_null.str.len_chars()
442
+ median_chars = float(char_lengths.median() or 0.0)
443
+
444
+ if median_chars > _FREE_TEXT_MEDIAN_CHARS:
445
+ info.flags.append(TypeFlag.FreeTextCandidate)
446
+ return
447
+
448
+ space_counts = non_null.str.count_matches(r"\s+")
449
+ median_words = float(space_counts.median() or 0.0) + 1.0
450
+
451
+ if median_words > _FREE_TEXT_AVG_WORDS:
452
+ info.flags.append(TypeFlag.FreeTextCandidate)
453
+ return
454
+
455
+ p90_chars = float(char_lengths.quantile(0.9) or 0.0)
456
+
457
+ unique_ratio = series.n_unique() / n_rows if n_rows > 0 else 0.0
458
+
459
+ if (
460
+ p90_chars > _FREE_TEXT_P90_CHARS
461
+ and unique_ratio > _FREE_TEXT_MIN_UNIQUE_RATIO
462
+ ):
463
+ info.flags.append(TypeFlag.FreeTextCandidate)
profiling/config.py ADDED
@@ -0,0 +1,236 @@
1
+ """
2
+ Configuration and result dataclasses for the profiling phase — Phase 1 redesign.
3
+
4
+ ProfileConfig controls the structural profiler's behaviour.
5
+ Stats dataclasses hold per-column and dataset-level profiling results.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import json
11
+ from dataclasses import dataclass, field
12
+ from enum import StrEnum
13
+ from typing import Optional, Union
14
+
15
+ from ._missingness_config import (
16
+ ColumnMissingnessProfile,
17
+ )
18
+ from ._correlation_config import (
19
+ CorrelationProfileResult,
20
+ )
21
+ from ._categorical_config import (
22
+ CategoricalStats,
23
+ )
24
+ from ._numeric_config import (
25
+ NumericStats,
26
+ )
27
+ from ._datetime_config import (
28
+ DatetimeStats,
29
+ )
30
+ from ._boolean_config import BooleanStats
31
+ from ._text_config import TextStats
32
+ from ._target_config import TargetProfileResult
33
+
34
+ # ---------------------------------------------------------------------------
35
+ # Core enums
36
+ # ---------------------------------------------------------------------------
37
+
38
+
39
class SemanticType(StrEnum):
    """High-level column role assigned by type detection or user overrides."""

    Numeric = "numeric"
    Categorical = "categorical"
    Datetime = "datetime"
    Boolean = "boolean"
    Text = "text"
    Identifier = "identifier"
46
+
47
+
48
class Modality(StrEnum):
    """Dataset modality; only tabular data is implemented today."""

    Tabular = "tabular"
    # Placeholder slots for future modalities — no implementation yet.
    # Image = "image"
    # TimeSeries = "time_series"
53
+
54
+
55
+ # ---------------------------------------------------------------------------
56
+ # Type-detection enums — kept for TypeDetector compatibility
57
+ # ---------------------------------------------------------------------------
58
+
59
+
60
class NumericKind(StrEnum):
    """Whether a numeric column is continuous or takes few distinct values."""

    Continuous = "continuous"
    Discrete = "discrete"
63
+
64
+
65
class TypeFlag(StrEnum):
    """Observations recorded by TypeDetector's per-column pipeline."""

    NumericCoerced = "numeric_coerced"  # string column successfully cast to Float64
    DatetimeCoerced = "datetime_coerced"  # string column successfully parsed as datetime
    BooleanCandidate = "boolean_candidate"  # values look boolean ({0,1} or true/false strings)
    EncodedCategory = "encoded_category"  # low-cardinality integers → label-encoded category
    IdentifierColumn = "identifier_column"  # near-fully-unique column
    SequentialIndex = "sequential_index"  # integer row index (0..n-1 or 1..n)
    FloatSequentialIndex = "float_sequential_index"  # whole-valued float row index
    FreeTextCandidate = "free_text_candidate"  # string column that reads like free text
74
+
75
+
76
+ # ---------------------------------------------------------------------------
77
+ # Column and dataset result containers
78
+ # ---------------------------------------------------------------------------
79
+
80
+ AnyStats = Union[NumericStats, CategoricalStats, DatetimeStats, BooleanStats, TextStats]
81
+
82
+
83
@dataclass
class ColumnProfile:
    """Per-column profiling result assembled by the structural profiler."""

    name: str = ""
    # Detected or overridden role; None until detection has run.
    semantic_type: Optional[SemanticType] = None
    # Raw observations from the type-detection pipeline.
    type_flags: list[TypeFlag] = field(default_factory=list)
    original_dtype: str = ""
    # Dtype after any successful coercion; equals original_dtype otherwise.
    inferred_dtype: str = ""
    missingness: Optional[ColumnMissingnessProfile] = None
    is_target: bool = False
    # Type-specific statistics — one of the AnyStats variants.
    stats: Optional[AnyStats] = None
93
+
94
+
95
@dataclass
class RowMissingnessDistribution:
    """
    Dataset-level summary of per-row missing-value counts.
    Computed by StructuralProfiler over the full active column set.
    """

    # Percentage buckets over rows, keyed by how many values each row is missing.
    pct_zero_missing: float = 0.0
    pct_one_to_two: float = 0.0
    pct_three_to_five: float = 0.0
    pct_over_five: float = 0.0
    # Share of rows missing more than half of their columns.
    pct_over_half_missing: float = 0.0
    # NOTE(review): presumably rows sparse enough to be drop candidates —
    # confirm the threshold in StructuralProfiler.
    drop_candidate_row_count: int = 0
108
+
109
+
110
@dataclass
class MemoryBreakdown:
    """Per-column memory usage (in bytes) for a profiled frame."""

    # Maps column name → estimated byte footprint.
    column_bytes: dict[str, int] = field(default_factory=dict)

    @property
    def sorted_by_usage(self) -> list[tuple[str, int]]:
        """All (column, bytes) pairs ordered heaviest-first."""
        ranked = list(self.column_bytes.items())
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        return ranked

    def top_consumers(self, n: int = 10) -> list[tuple[str, int]]:
        """Return the *n* columns consuming the most memory."""
        return self.sorted_by_usage[:n]
120
+
121
+
122
@dataclass
class DatasetStats:
    """Dataset-level (as opposed to per-column) profiling results."""

    modality: Modality = Modality.Tabular
    row_count: int = 0
    column_count: int = 0
    # Total estimated in-memory size; per-column detail in memory_breakdown.
    memory_bytes: int = 0
    memory_breakdown: Optional[MemoryBreakdown] = None
    duplicate_count: int = 0
    duplicate_ratio: float = 0.0
    # NOTE(review): presumably the fraction of missing cells across the
    # active columns — confirm in StructuralProfiler.
    overall_sparsity: float = 0.0
    # True when chunked processing was triggered (see ProfileConfig).
    was_chunked: bool = False
    # Pairwise column-missingness values; None when not computed.
    missingness_matrix: Optional[dict[str, dict[str, float]]] = None
    row_distribution: RowMissingnessDistribution = field(
        default_factory=RowMissingnessDistribution
    )

    # Feature-feature correlation result; None unless computed.
    feature_correlation: Optional[CorrelationProfileResult] = None

    # Feature-target correlation results, keyed by target column name.
    target_correlations: dict[str, CorrelationProfileResult] = field(
        default_factory=dict,
    )
143
+
144
+
145
@dataclass
class StructuralProfileResult:
    """Top-level profiler output: per-column profiles, dataset-wide stats,
    and per-target profiling results keyed by target column name."""

    columns: dict[str, ColumnProfile] = field(default_factory=dict)
    dataset: DatasetStats = field(default_factory=DatasetStats)
    targets: dict[str, TargetProfileResult] = field(default_factory=dict)
150
+
151
+
152
+ # ---------------------------------------------------------------------------
153
+ # ProfileConfig — clean break from per-profiler column lists
154
+ # ---------------------------------------------------------------------------
155
+
156
+
157
@dataclass
class ProfileConfig:
    """
    Controls the structural profiler's behaviour.

    Parameters
    ----------
    modality : Modality
        Data modality. Currently only Tabular is implemented.
    target_columns : list[str]
        Names of the label/target columns, if any.
    column_overrides : dict[str, SemanticType]
        Explicit semantic type assignments that override auto-detection.
    exclude_columns : list[str]
        Columns to skip entirely during profiling.
    compute_correlation : bool
        Whether to compute the feature-feature correlation matrix.
    correlation_target_column : Optional[str]
        Column used for feature-target correlation metrics.
    memory_threshold_mb : float
        Memory (MB) above which chunked processing activates.
    chunk_size : int
        Rows per chunk when chunked processing is active.
    """

    modality: Modality = Modality.Tabular
    target_columns: list[str] = field(default_factory=list)
    column_overrides: dict[str, SemanticType] = field(default_factory=dict)
    exclude_columns: list[str] = field(default_factory=list)
    compute_correlation: bool = False
    correlation_target_column: Optional[str] = None
    memory_threshold_mb: float = 500.0
    chunk_size: int = 100_000

    def to_dict(self) -> dict:
        """Serialize to JSON-compatible plain types (enums become strings)."""
        return {
            "modality": str(self.modality),
            "target_columns": list(self.target_columns),
            "column_overrides": {k: str(v) for k, v in self.column_overrides.items()},
            "exclude_columns": list(self.exclude_columns),
            "compute_correlation": self.compute_correlation,
            "correlation_target_column": self.correlation_target_column,
            "memory_threshold_mb": self.memory_threshold_mb,
            "chunk_size": self.chunk_size,
        }

    @classmethod
    def from_dict(cls, data: dict) -> ProfileConfig:
        """
        Build a ProfileConfig from a dict in the shape produced by to_dict().

        Missing keys fall back to the dataclass defaults.

        BUG FIX: this previously passed the nonexistent keyword
        ``target_column`` (singular) to the constructor — raising
        TypeError on every call — and never restored the
        ``target_columns`` list that to_dict() emits. It now round-trips
        ``target_columns`` correctly.
        """
        return cls(
            modality=Modality(data.get("modality", Modality.Tabular)),
            target_columns=list(data.get("target_columns", [])),
            column_overrides={
                k: SemanticType(v) for k, v in data.get("column_overrides", {}).items()
            },
            exclude_columns=list(data.get("exclude_columns", [])),
            compute_correlation=bool(data.get("compute_correlation", False)),
            correlation_target_column=data.get("correlation_target_column"),
            memory_threshold_mb=float(data.get("memory_threshold_mb", 500.0)),
            chunk_size=int(data.get("chunk_size", 100_000)),
        )

    def to_json(self) -> str:
        """Serialize this config to a JSON string."""
        return json.dumps(self.to_dict())

    @classmethod
    def from_json(cls, json_str: str) -> ProfileConfig:
        """Inverse of to_json(); see from_dict() for key handling."""
        return cls.from_dict(json.loads(json_str))
224
+
225
+
226
@dataclass
class ColumnTypeInfo:
    """Result of TypeDetector for a single column."""

    column: str
    original_dtype: str
    # Dtype after any successful coercion (string → Float64 / Datetime);
    # equals original_dtype when no coercion applied.
    inferred_dtype: str
    numeric_kind: Optional[NumericKind] = None
    # Observations appended as the detection pipeline runs.
    flags: list[TypeFlag] = field(default_factory=list)
    semantic_type: Optional[SemanticType] = None

    def has_flag(self, flag: TypeFlag) -> bool:
        """Return True if *flag* was recorded for this column."""
        return flag in self.flags