dataforge-ml 0.5.0__tar.gz → 0.7.0__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (41)
  1. {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/PKG-INFO +1 -1
  2. {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/pyproject.toml +1 -1
  3. {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_boolean_profiler.py +1 -13
  4. {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_categorical.py +1 -2
  5. {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_correlation_config.py +134 -35
  6. {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_correlation_profiler.py +174 -3
  7. {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_missingness_profiler.py +2 -1
  8. {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_numeric_profiler.py +73 -118
  9. {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_target_profiler.py +4 -2
  10. {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_type_detector.py +78 -102
  11. {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/structural.py +1 -1
  12. {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml.egg-info/PKG-INFO +1 -1
  13. {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/LICENSE +0 -0
  14. {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/README.md +0 -0
  15. {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/setup.cfg +0 -0
  16. {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/__init__.py +0 -0
  17. {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/models/__init__.py +0 -0
  18. {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/models/_data_structure.py +0 -0
  19. {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/models/_data_types.py +0 -0
  20. {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/__init__.py +0 -0
  21. {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_base.py +0 -0
  22. {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_boolean_config.py +0 -0
  23. {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_categorical_config.py +0 -0
  24. {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_datetime_config.py +0 -0
  25. {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_datetime_profiler.py +0 -0
  26. {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_missingness_config.py +0 -0
  27. {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_numeric_config.py +0 -0
  28. {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_tabular.py +0 -0
  29. {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_target_config.py +0 -0
  30. {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_text_config.py +0 -0
  31. {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_text_profiler.py +0 -0
  32. {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/config.py +0 -0
  33. {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/splitting/__init__.py +0 -0
  34. {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/splitting/_config.py +0 -0
  35. {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/splitting/_splitter.py +0 -0
  36. {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/utils/__init__.py +0 -0
  37. {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml/utils/data_loader.py +0 -0
  38. {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml.egg-info/SOURCES.txt +0 -0
  39. {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml.egg-info/dependency_links.txt +0 -0
  40. {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml.egg-info/requires.txt +0 -0
  41. {dataforge_ml-0.5.0 → dataforge_ml-0.7.0}/src/dataforge_ml.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dataforge-ml
3
- Version: 0.5.0
3
+ Version: 0.7.0
4
4
  Summary: A automated feature engineering and designing pipeline library
5
5
  License: MIT
6
6
  Classifier: License :: OSI Approved :: MIT License
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "dataforge-ml"
7
- version = "0.5.0"
7
+ version = "0.7.0"
8
8
  description = "A automated feature engineering and designing pipeline library"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -84,19 +84,7 @@ class BooleanProfiler(ColumnBatchProfiler[BooleanProfileResult]):
84
84
  if override is not None:
85
85
  return False
86
86
 
87
- # Native boolean dtype
88
- if series.dtype == pl.Boolean:
89
- return True
90
-
91
- # Integer {0, 1} column — check after dropping nulls
92
- if series.dtype in _INT_DTYPES:
93
- clean = series.drop_nulls()
94
- if clean.len() == 0:
95
- return False
96
- unique_vals = set(clean.unique().to_list())
97
- return unique_vals <= {0, 1}
98
-
99
- return False
87
+ return True
100
88
 
101
89
  # ------------------------------------------------------------------
102
90
  # Orchestration
@@ -49,7 +49,6 @@ from .config import (
49
49
  ProfileConfig,
50
50
  SemanticType,
51
51
  )
52
- from ..models._data_types import _CAT_DTYPES
53
52
 
54
53
  # ---------------------------------------------------------------------------
55
54
  # Module-level thresholds (documented so callers can see what drives flags)
@@ -115,7 +114,7 @@ class CategoricalProfiler(ColumnBatchProfiler[CategoricalProfileResult]):
115
114
  if override is not None:
116
115
  return False
117
116
 
118
- return series.dtype in _CAT_DTYPES
117
+ return True
119
118
 
120
119
  def _run(
121
120
  self,
@@ -7,41 +7,41 @@ numeric/categorical column lists that are already resolved upstream).
7
7
 
8
8
  Design notes
9
9
  ------------
10
- - Pearson matrix : linear relationships between numeric columns.
11
- - Spearman matrix : monotonic (rank-based) relationships; robust to
12
- outliers and non-linearity.
13
- - Near-redundancy : any pair with |r| > 0.95 flagged identical signal,
14
- one should be dropped before modelling.
15
- - Feature–target : Pearson for numeric target, ANOVA F / eta² for
16
- categorical target. Top-10 reported.
17
- - Mutual information: MI for all features vs target (classif or regression).
18
- Captures non-linear dependencies correlation misses.
10
+ - Pearson / Spearman : linear / monotonic relationships between numeric columns.
11
+ - Cramér's V : association between categorical column pairs [0, 1].
12
+ - Eta-squared : numeric-categorical association via ANOVA [0, 1].
13
+ - Near-redundancy : Pearson/Spearman |r| > 0.95, Cramér's V > 0.80,
14
+ or eta² > 0.50 flagged — near-identical signal.
15
+ - Feature–target : Pearson (numeric target), ANOVA/eta² (categorical target).
16
+ - Mutual information : MI for all features vs target (classif or regression).
19
17
  """
18
+
20
19
  from __future__ import annotations
21
20
 
22
21
  from dataclasses import dataclass, field
23
22
  from enum import StrEnum
24
23
  from typing import Optional
25
24
 
26
-
27
25
  # ---------------------------------------------------------------------------
28
26
  # Enums
29
27
  # ---------------------------------------------------------------------------
30
28
 
29
+
31
30
  class CorrelationMethod(StrEnum):
32
- Pearson = "pearson"
31
+ Pearson = "pearson"
33
32
  Spearman = "spearman"
34
33
 
35
34
 
36
35
  class TargetType(StrEnum):
37
- Numeric = "numeric" # numeric target → Pearson + MI regression
38
- Categorical = "categorical" # categorical target → ANOVA/eta² + MI classif
36
+ Numeric = "numeric" # numeric target → Pearson + MI regression
37
+ Categorical = "categorical" # categorical target → ANOVA/eta² + MI classif
39
38
 
40
39
 
41
40
  # ---------------------------------------------------------------------------
42
41
  # Pairwise correlation result
43
42
  # ---------------------------------------------------------------------------
44
43
 
44
+
45
45
  @dataclass
46
46
  class CorrelationPair:
47
47
  """
@@ -62,14 +62,74 @@ class CorrelationPair:
62
62
 
63
63
  col_a: str
64
64
  col_b: str
65
- pearson_r: Optional[float] = None
65
+ pearson_r: Optional[float] = None
66
66
  spearman_r: Optional[float] = None
67
67
  near_redundant: bool = False
68
68
 
69
69
  def to_dict(self) -> dict:
70
70
  return {
71
- "col_a": self.col_a, "col_b": self.col_b,
72
- "pearson_r": self.pearson_r, "spearman_r": self.spearman_r,
71
+ "col_a": self.col_a,
72
+ "col_b": self.col_b,
73
+ "pearson_r": self.pearson_r,
74
+ "spearman_r": self.spearman_r,
75
+ "near_redundant": self.near_redundant,
76
+ }
77
+
78
+
79
+ @dataclass
80
+ class CramerVPair:
81
+ """
82
+ Cramér's V association between two categorical columns.
83
+
84
+ Attributes
85
+ ----------
86
+ col_a, col_b : str
87
+ cramer_v : float | None
88
+ Cramér's V in [0, 1]. None when computation fails or sample too small.
89
+ near_redundant : bool
90
+ True when cramer_v exceeds the near-redundancy threshold (default 0.80).
91
+ """
92
+
93
+ col_a: str = ""
94
+ col_b: str = ""
95
+ cramer_v: Optional[float] = None
96
+ near_redundant: bool = False
97
+
98
+ def to_dict(self) -> dict:
99
+ return {
100
+ "col_a": self.col_a,
101
+ "col_b": self.col_b,
102
+ "cramer_v": self.cramer_v,
103
+ "near_redundant": self.near_redundant,
104
+ }
105
+
106
+
107
+ @dataclass
108
+ class EtaSquaredPair:
109
+ """
110
+ Eta-squared (η²) association between a numeric and a categorical column.
111
+
112
+ Attributes
113
+ ----------
114
+ numeric_col : str
115
+ categorical_col : str
116
+ eta_squared : float | None
117
+ Effect size in [0, 1]. None when computation fails.
118
+ Rule of thumb: 0.01 small, 0.06 medium, 0.14 large.
119
+ near_redundant : bool
120
+ True when eta_squared exceeds the near-redundancy threshold (default 0.50).
121
+ """
122
+
123
+ numeric_col: str = ""
124
+ categorical_col: str = ""
125
+ eta_squared: Optional[float] = None
126
+ near_redundant: bool = False
127
+
128
+ def to_dict(self) -> dict:
129
+ return {
130
+ "numeric_col": self.numeric_col,
131
+ "categorical_col": self.categorical_col,
132
+ "eta_squared": self.eta_squared,
73
133
  "near_redundant": self.near_redundant,
74
134
  }
75
135
 
@@ -78,6 +138,7 @@ class CorrelationPair:
78
138
  # Feature–target entries
79
139
  # ---------------------------------------------------------------------------
80
140
 
141
+
81
142
  @dataclass
82
143
  class NumericTargetCorrelation:
83
144
  """
@@ -88,7 +149,8 @@ class NumericTargetCorrelation:
88
149
  feature : str
89
150
  pearson_r : float | None
90
151
  """
91
- feature: str
152
+
153
+ feature: str
92
154
  pearson_r: Optional[float] = None
93
155
 
94
156
  def to_dict(self) -> dict:
@@ -113,15 +175,18 @@ class CategoricalTargetCorrelation:
113
175
  Effect size: SS_between / SS_total. Ranges [0, 1].
114
176
  Rule of thumb: 0.01 small, 0.06 medium, 0.14 large.
115
177
  """
116
- feature: str
178
+
179
+ feature: str
117
180
  f_statistic: Optional[float] = None
118
- p_value: Optional[float] = None
181
+ p_value: Optional[float] = None
119
182
  eta_squared: Optional[float] = None
120
183
 
121
184
  def to_dict(self) -> dict:
122
185
  return {
123
- "feature": self.feature, "f_statistic": self.f_statistic,
124
- "p_value": self.p_value, "eta_squared": self.eta_squared,
186
+ "feature": self.feature,
187
+ "f_statistic": self.f_statistic,
188
+ "p_value": self.p_value,
189
+ "eta_squared": self.eta_squared,
125
190
  }
126
191
 
127
192
 
@@ -129,6 +194,7 @@ class CategoricalTargetCorrelation:
129
194
  # Mutual information
130
195
  # ---------------------------------------------------------------------------
131
196
 
197
+
132
198
  @dataclass
133
199
  class MutualInformationEntry:
134
200
  """
@@ -143,9 +209,10 @@ class MutualInformationEntry:
143
209
  rank : int
144
210
  1 = highest MI (most informative).
145
211
  """
146
- feature: str
212
+
213
+ feature: str
147
214
  mi_score: float = 0.0
148
- rank: int = 0
215
+ rank: int = 0
149
216
 
150
217
  def to_dict(self) -> dict:
151
218
  return {"feature": self.feature, "mi_score": self.mi_score, "rank": self.rank}
@@ -155,6 +222,7 @@ class MutualInformationEntry:
155
222
  # Near-redundancy summary
156
223
  # ---------------------------------------------------------------------------
157
224
 
225
+
158
226
  @dataclass
159
227
  class NearRedundancyGroup:
160
228
  """
@@ -164,17 +232,22 @@ class NearRedundancyGroup:
164
232
  The suggested_drop list contains every column except the first
165
233
  alphabetically — a simple, deterministic heuristic.
166
234
  """
167
- columns: list[str] = field(default_factory=list)
235
+
236
+ columns: list[str] = field(default_factory=list)
168
237
  suggested_drop: list[str] = field(default_factory=list)
169
238
 
170
239
  def to_dict(self) -> dict:
171
- return {"columns": list(self.columns), "suggested_drop": list(self.suggested_drop)}
240
+ return {
241
+ "columns": list(self.columns),
242
+ "suggested_drop": list(self.suggested_drop),
243
+ }
172
244
 
173
245
 
174
246
  # ---------------------------------------------------------------------------
175
247
  # Top-level result
176
248
  # ---------------------------------------------------------------------------
177
249
 
250
+
178
251
  @dataclass
179
252
  class CorrelationProfileResult:
180
253
  """
@@ -211,23 +284,34 @@ class CorrelationProfileResult:
211
284
 
212
285
  # Column scope
213
286
  analysed_numeric_columns: list[str] = field(default_factory=list)
287
+ analysed_categorical_columns: list[str] = field(default_factory=list)
214
288
 
215
289
  # Pairwise matrices
216
- pearson_matrix: dict[str, dict[str, float]] = field(default_factory=dict)
290
+ pearson_matrix: dict[str, dict[str, float]] = field(default_factory=dict)
217
291
  spearman_matrix: dict[str, dict[str, float]] = field(default_factory=dict)
218
292
 
219
- # Pairwise summaries
220
- pairwise: list[CorrelationPair] = field(default_factory=list)
293
+ # Pairwise summaries — numeric ↔ numeric
294
+ pairwise: list[CorrelationPair] = field(default_factory=list)
221
295
  near_redundant_pairs: list[CorrelationPair] = field(default_factory=list)
222
296
  near_redundancy_groups: list[NearRedundancyGroup] = field(default_factory=list)
223
297
 
298
+ # Pairwise summaries — categorical ↔ categorical (Cramér's V)
299
+ cramer_v_pairs: list[CramerVPair] = field(default_factory=list)
300
+ near_redundant_cramer_v_pairs: list[CramerVPair] = field(default_factory=list)
301
+
302
+ # Pairwise summaries — numeric ↔ categorical (eta-squared)
303
+ eta_squared_pairs: list[EtaSquaredPair] = field(default_factory=list)
304
+ near_redundant_eta_squared_pairs: list[EtaSquaredPair] = field(default_factory=list)
305
+
224
306
  # Target info
225
- target_column: Optional[str] = None
226
- target_type: Optional[TargetType] = None
307
+ target_column: Optional[str] = None
308
+ target_type: Optional[TargetType] = None
227
309
 
228
310
  # Feature–target correlations (top-10 each)
229
- feature_target_numeric: list[NumericTargetCorrelation] = field(default_factory=list)
230
- feature_target_categorical: list[CategoricalTargetCorrelation] = field(default_factory=list)
311
+ feature_target_numeric: list[NumericTargetCorrelation] = field(default_factory=list)
312
+ feature_target_categorical: list[CategoricalTargetCorrelation] = field(
313
+ default_factory=list
314
+ )
231
315
 
232
316
  # Mutual information (all features, ranked)
233
317
  mutual_information: list[MutualInformationEntry] = field(default_factory=list)
@@ -249,14 +333,29 @@ class CorrelationProfileResult:
249
333
  def to_dict(self) -> dict:
250
334
  return {
251
335
  "analysed_numeric_columns": list(self.analysed_numeric_columns),
336
+ "analysed_categorical_columns": list(self.analysed_categorical_columns),
252
337
  "pearson_matrix": {k: dict(v) for k, v in self.pearson_matrix.items()},
253
338
  "spearman_matrix": {k: dict(v) for k, v in self.spearman_matrix.items()},
254
339
  "pairwise": [p.to_dict() for p in self.pairwise],
255
340
  "near_redundant_pairs": [p.to_dict() for p in self.near_redundant_pairs],
256
- "near_redundancy_groups": [g.to_dict() for g in self.near_redundancy_groups],
341
+ "near_redundancy_groups": [
342
+ g.to_dict() for g in self.near_redundancy_groups
343
+ ],
344
+ "cramer_v_pairs": [p.to_dict() for p in self.cramer_v_pairs],
345
+ "near_redundant_cramer_v_pairs": [
346
+ p.to_dict() for p in self.near_redundant_cramer_v_pairs
347
+ ],
348
+ "eta_squared_pairs": [p.to_dict() for p in self.eta_squared_pairs],
349
+ "near_redundant_eta_squared_pairs": [
350
+ p.to_dict() for p in self.near_redundant_eta_squared_pairs
351
+ ],
257
352
  "target_column": self.target_column,
258
353
  "target_type": str(self.target_type) if self.target_type else None,
259
- "feature_target_numeric": [f.to_dict() for f in self.feature_target_numeric],
260
- "feature_target_categorical": [f.to_dict() for f in self.feature_target_categorical],
354
+ "feature_target_numeric": [
355
+ f.to_dict() for f in self.feature_target_numeric
356
+ ],
357
+ "feature_target_categorical": [
358
+ f.to_dict() for f in self.feature_target_categorical
359
+ ],
261
360
  "mutual_information": [m.to_dict() for m in self.mutual_information],
262
361
  }
@@ -47,6 +47,8 @@ from ._correlation_config import (
47
47
  CategoricalTargetCorrelation,
48
48
  CorrelationPair,
49
49
  CorrelationProfileResult,
50
+ CramerVPair,
51
+ EtaSquaredPair,
50
52
  MutualInformationEntry,
51
53
  NearRedundancyGroup,
52
54
  NumericTargetCorrelation,
@@ -55,6 +57,8 @@ from ._correlation_config import (
55
57
  from ..models._data_types import _NUMERIC_DTYPES, _INT_DTYPES
56
58
 
57
59
  _NEAR_REDUNDANT_THRESHOLD: float = 0.95
60
+ _NEAR_REDUNDANT_CRAMER_V_THRESHOLD: float = 0.80
61
+ _NEAR_REDUNDANT_ETA_SQUARED_THRESHOLD: float = 0.50
58
62
  _TOP_N_FEATURE_TARGET: int = 10
59
63
  _MI_N_NEIGHBORS: int = 3
60
64
  _MI_MIN_ROWS: int = 10 # min complete-case rows for a meaningful k-NN MI estimate
@@ -142,13 +146,14 @@ class CorrelationProfiler(DatasetLevelProfiler[CorrelationProfileResult]):
142
146
  self,
143
147
  df: pl.DataFrame,
144
148
  numeric_cols: list[str],
149
+ categorical_cols: Optional[list[str]] = None,
145
150
  ) -> CorrelationProfileResult:
146
151
  """
147
152
  Compute pairwise feature-feature correlation matrices.
148
153
 
149
- Pearson + Spearman matrices and near-redundancy groups are filled.
150
- All target-specific fields are left at their defaults (empty lists /
151
- None). Call profile_target() separately for each target column.
154
+ Pearson + Spearman for numeric pairs, Cramér's V for categorical pairs,
155
+ eta-squared for numeric-categorical pairs. All target-specific fields
156
+ are left at their defaults. Call profile_target() for target analysis.
152
157
  """
153
158
  result = CorrelationProfileResult()
154
159
 
@@ -159,6 +164,9 @@ class CorrelationProfiler(DatasetLevelProfiler[CorrelationProfileResult]):
159
164
  ]
160
165
  result.analysed_numeric_columns = resolved_numeric
161
166
 
167
+ resolved_categorical = [c for c in (categorical_cols or []) if c in df.columns]
168
+ result.analysed_categorical_columns = resolved_categorical
169
+
162
170
  if len(resolved_numeric) >= 2:
163
171
  pearson_mat, spearman_mat = self._compute_matrices(df, resolved_numeric)
164
172
  result.pearson_matrix = pearson_mat
@@ -171,6 +179,22 @@ class CorrelationProfiler(DatasetLevelProfiler[CorrelationProfileResult]):
171
179
  result.near_redundant_pairs
172
180
  )
173
181
 
182
+ if len(resolved_categorical) >= 2:
183
+ result.cramer_v_pairs = self._compute_cramer_v_pairs(
184
+ df, resolved_categorical, _NEAR_REDUNDANT_CRAMER_V_THRESHOLD
185
+ )
186
+ result.near_redundant_cramer_v_pairs = [
187
+ p for p in result.cramer_v_pairs if p.near_redundant
188
+ ]
189
+
190
+ if resolved_numeric and resolved_categorical:
191
+ result.eta_squared_pairs = self._compute_eta_squared_pairs(
192
+ df, resolved_numeric, resolved_categorical, _NEAR_REDUNDANT_ETA_SQUARED_THRESHOLD
193
+ )
194
+ result.near_redundant_eta_squared_pairs = [
195
+ p for p in result.eta_squared_pairs if p.near_redundant
196
+ ]
197
+
174
198
  return result
175
199
 
176
200
  def profile_target(
@@ -316,6 +340,153 @@ class CorrelationProfiler(DatasetLevelProfiler[CorrelationProfileResult]):
316
340
  for members in uf.groups()
317
341
  ]
318
342
 
343
+ # ------------------------------------------------------------------
344
+ # Step 3b: Cramér's V — categorical ↔ categorical
345
+ # ------------------------------------------------------------------
346
+
347
+ @staticmethod
348
+ def _compute_cramer_v_pairs(
349
+ df: pl.DataFrame,
350
+ cat_cols: list[str],
351
+ threshold: float,
352
+ ) -> list[CramerVPair]:
353
+ try:
354
+ from scipy.stats import chi2_contingency
355
+ except ImportError:
356
+ warnings.warn(
357
+ "scipy is required for Cramér's V. Install: pip install scipy",
358
+ stacklevel=3,
359
+ )
360
+ return []
361
+
362
+ import numpy as np
363
+
364
+ pairs: list[CramerVPair] = []
365
+ for col_a, col_b in itertools.combinations(cat_cols, 2):
366
+ pair_df = (
367
+ df.select([
368
+ pl.col(col_a).cast(pl.Utf8, strict=False),
369
+ pl.col(col_b).cast(pl.Utf8, strict=False),
370
+ ])
371
+ .drop_nulls()
372
+ )
373
+ n = pair_df.height
374
+ if n < 5:
375
+ pairs.append(CramerVPair(col_a=col_a, col_b=col_b))
376
+ continue
377
+
378
+ counts = pair_df.group_by([col_a, col_b]).agg(pl.len().alias("count"))
379
+ a_unique = sorted(counts[col_a].unique().to_list())
380
+ b_unique = sorted(counts[col_b].unique().to_list())
381
+ if len(a_unique) < 2 or len(b_unique) < 2:
382
+ pairs.append(CramerVPair(col_a=col_a, col_b=col_b))
383
+ continue
384
+
385
+ a_idx = {v: i for i, v in enumerate(a_unique)}
386
+ b_idx = {v: i for i, v in enumerate(b_unique)}
387
+ ct = np.zeros((len(a_unique), len(b_unique)), dtype=int)
388
+ for a_val, b_val, cnt in zip(
389
+ counts[col_a].to_list(),
390
+ counts[col_b].to_list(),
391
+ counts["count"].to_list(),
392
+ ):
393
+ ct[a_idx[a_val], b_idx[b_val]] = cnt
394
+
395
+ try:
396
+ chi2, _, _, _ = chi2_contingency(ct)
397
+ r, c = ct.shape
398
+ phi2 = chi2 / n
399
+ # Bergsma & Wicher (2013) bias correction
400
+ phi2_corr = max(0.0, phi2 - (r - 1) * (c - 1) / (n - 1))
401
+ r_corr = r - (r - 1) ** 2 / (n - 1)
402
+ c_corr = c - (c - 1) ** 2 / (n - 1)
403
+ denom = min(r_corr - 1, c_corr - 1)
404
+ if denom <= 0:
405
+ # Near-saturated contingency table (n_unique ≈ n_rows):
406
+ # bias correction collapses denominator; skip the pair.
407
+ pairs.append(CramerVPair(col_a=col_a, col_b=col_b))
408
+ continue
409
+ v = float(np.sqrt(phi2_corr / denom))
410
+ v = max(0.0, min(1.0, v))
411
+ except Exception as exc:
412
+ warnings.warn(
413
+ f"Cramér's V failed for ({col_a}, {col_b}): {exc}", stacklevel=3
414
+ )
415
+ pairs.append(CramerVPair(col_a=col_a, col_b=col_b))
416
+ continue
417
+
418
+ pairs.append(CramerVPair(
419
+ col_a=col_a, col_b=col_b,
420
+ cramer_v=v,
421
+ near_redundant=v > threshold,
422
+ ))
423
+
424
+ return pairs
425
+
426
+ # ------------------------------------------------------------------
427
+ # Step 3c: Eta-squared — numeric ↔ categorical
428
+ # ------------------------------------------------------------------
429
+
430
+ @staticmethod
431
+ def _compute_eta_squared_pairs(
432
+ df: pl.DataFrame,
433
+ numeric_cols: list[str],
434
+ cat_cols: list[str],
435
+ threshold: float,
436
+ ) -> list[EtaSquaredPair]:
437
+ try:
438
+ from scipy.stats import f_oneway
439
+ except ImportError:
440
+ warnings.warn(
441
+ "scipy is required for eta-squared. Install: pip install scipy",
442
+ stacklevel=3,
443
+ )
444
+ return []
445
+
446
+ pairs: list[EtaSquaredPair] = []
447
+ for num_col in numeric_cols:
448
+ feat = df[num_col].cast(pl.Float64)
449
+ valid_feat = feat.drop_nulls()
450
+ if valid_feat.len() == 0:
451
+ continue
452
+ grand_mean = float(valid_feat.mean()) # type: ignore[arg-type]
453
+ ss_total = float(((valid_feat - grand_mean) ** 2).sum() or 0.0)
454
+
455
+ for cat_col in cat_cols:
456
+ target = df[cat_col]
457
+ categories = target.drop_nulls().unique().to_list()
458
+ groups = [
459
+ feat.filter(target == cat).drop_nulls().to_numpy()
460
+ for cat in categories
461
+ ]
462
+ non_empty = [g for g in groups if len(g) > 0]
463
+ if len(non_empty) < 2:
464
+ pairs.append(EtaSquaredPair(numeric_col=num_col, categorical_col=cat_col))
465
+ continue
466
+ try:
467
+ f_oneway(*non_empty)
468
+ ss_between = sum(
469
+ len(g) * (float(g.mean()) - grand_mean) ** 2
470
+ for g in non_empty
471
+ )
472
+ eta_sq = ss_between / ss_total if ss_total > 0 else 0.0
473
+ eta_sq = max(0.0, min(1.0, eta_sq))
474
+ except Exception as exc:
475
+ warnings.warn(
476
+ f"Eta-squared failed for ({num_col}, {cat_col}): {exc}",
477
+ stacklevel=3,
478
+ )
479
+ pairs.append(EtaSquaredPair(numeric_col=num_col, categorical_col=cat_col))
480
+ continue
481
+
482
+ pairs.append(EtaSquaredPair(
483
+ numeric_col=num_col, categorical_col=cat_col,
484
+ eta_squared=eta_sq,
485
+ near_redundant=eta_sq > threshold,
486
+ ))
487
+
488
+ return pairs
489
+
319
490
  # ------------------------------------------------------------------
320
491
  # Step 5a: Feature–target Pearson (unchanged)
321
492
  # ------------------------------------------------------------------
@@ -207,7 +207,8 @@ class MissingnessProfiler(DatasetLevelProfiler[MissingnessProfileResult]):
207
207
  profile.effective_null_ratio = eff_count / n_rows if n_rows else 0.0
208
208
 
209
209
  r = profile.effective_null_ratio
210
- if r < _SEVERITY_MINOR:
210
+
211
+ if r < _SEVERITY_MINOR and r != 0:
211
212
  profile.severity = MissingSeverity.Minor
212
213
  elif r < _SEVERITY_MODERATE:
213
214
  profile.severity = MissingSeverity.Moderate
@@ -50,7 +50,6 @@ from ._numeric_config import (
50
50
  NumericTopValueEntry,
51
51
  HistogramBin,
52
52
  )
53
- from ..models._data_types import _NUMERIC_DTYPES
54
53
 
55
54
  # ---------------------------------------------------------------------------
56
55
  # Thresholds (documented so callers can see what drives labels / flags)
@@ -119,7 +118,7 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
119
118
  if override is not None:
120
119
  return False
121
120
 
122
- return series.dtype in _NUMERIC_DTYPES
121
+ return True
123
122
 
124
123
  def _run(
125
124
  self,
@@ -127,9 +126,8 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
127
126
  columns: list[str],
128
127
  ) -> NumericProfileResult:
129
128
  result = NumericProfileResult()
130
-
131
129
  n_rows = df.height
132
- # Intersect requested columns with the actual schema
130
+
133
131
  available = [
134
132
  c
135
133
  for c in self._resolve_columns(df.columns, columns)
@@ -137,15 +135,78 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
137
135
  ]
138
136
  result.analysed_columns = available
139
137
 
140
- for col_name in available:
141
- series = df[col_name]
142
- profile = self._profile_column(series, n_rows)
143
- result.columns[col_name] = profile
138
+ if not available:
139
+ return result
140
+
141
+ # One df.select([...]) for all scalar stats across all columns so
142
+ # Polars can parallelise expression evaluation rather than running
143
+ # independent query plans per column.
144
+ exprs: list[pl.Expr] = []
145
+ for col in available:
146
+ c = pl.col(col).cast(pl.Float64, strict=False)
147
+ exprs.append(c.mean().alias(f"{col}__mean"))
148
+ exprs.append(c.median().alias(f"{col}__median"))
149
+ exprs.append(c.min().alias(f"{col}__min"))
150
+ exprs.append(c.max().alias(f"{col}__max"))
151
+ exprs.append(c.std(ddof=1).alias(f"{col}__std"))
152
+ for q in _QUANTILE_LEVELS:
153
+ exprs.append(
154
+ c.quantile(q, interpolation="linear").alias(f"{col}__q{q}")
155
+ )
156
+
157
+ batch = df.select(exprs).row(0, named=True)
158
+
159
+ for col in available:
160
+ series = df[col]
161
+ f64 = series.cast(pl.Float64, strict=False)
162
+ clean = f64.drop_nulls()
163
+ profile = NumericStats()
164
+
165
+ if clean.len() == 0:
166
+ result.columns[col] = profile
167
+ continue
168
+
169
+ # Central tendency
170
+ mean = float(batch[f"{col}__mean"])
171
+ median = float(batch[f"{col}__median"])
172
+ profile.mean = mean
173
+ profile.median = median
174
+ if median == 0.0:
175
+ profile.mean_median_ratio = float("inf") if mean != 0.0 else 1.0
176
+ else:
177
+ profile.mean_median_ratio = mean / median
178
+
179
+ # Range
180
+ profile.min = float(batch[f"{col}__min"])
181
+ profile.max = float(batch[f"{col}__max"])
182
+
183
+ # Spread — Polars returns null for std with ddof=1 on a single row
184
+ std_val = batch[f"{col}__std"]
185
+ profile.std = float(std_val) if std_val is not None else 0.0
186
+ profile.variance = profile.std ** 2
187
+
188
+ # Percentiles
189
+ q_vals = [batch[f"{col}__q{q}"] for q in _QUANTILE_LEVELS]
190
+ profile.percentiles = PercentileSnapshot(
191
+ p1=q_vals[0], p5=q_vals[1], p25=q_vals[2], p50=q_vals[3],
192
+ p75=q_vals[4], p95=q_vals[5], p99=q_vals[6],
193
+ )
194
+
195
+ # Frequency / distribution stays per-column (returns a frame, not a scalar)
196
+ self._compute_frequency_and_distribution(series, clean, profile, n_rows)
197
+
198
+ # Shape stays per-column (delegates to scipy on a numpy array)
199
+ self._compute_shape(clean, profile)
200
+
201
+ self._check_scale_anomaly(profile)
202
+
203
+ result.columns[col] = profile
144
204
 
145
205
  return result
146
206
 
147
207
  # ------------------------------------------------------------------
148
- # Per-column driver
208
+ # Per-column helpers (frequency/distribution and shape only —
209
+ # scalar stats are now batched in _run above)
149
210
  # ------------------------------------------------------------------
150
211
 
151
212
  @staticmethod
@@ -196,7 +257,7 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
196
257
  # --- 20-Bin Histogram Distribution (Continuous) ---
197
258
  import numpy as np
198
259
 
199
- counts, bin_edges = np.histogram(clean_f64.to_numpy(), bins=20)
260
+ counts, bin_edges = np.histogram(clean_f64.to_numpy(), bins="auto")
200
261
  profile.histogram = [
201
262
  HistogramBin(
202
263
  lower_bound=float(bin_edges[i]),
@@ -207,73 +268,8 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
207
268
  for i in range(len(counts))
208
269
  ]
209
270
 
210
- def _profile_column(
211
- self,
212
- series: pl.Series,
213
- n_rows: int,
214
- ) -> NumericStats:
215
- profile = NumericStats()
216
-
217
- f64 = series.cast(pl.Float64)
218
- clean = f64.drop_nulls()
219
-
220
- if clean.len() == 0:
221
- return profile
222
-
223
- self._compute_central_tendency(clean, profile)
224
- self._compute_range(clean, profile)
225
- self._compute_frequency_and_distribution(series, clean, profile, n_rows)
226
- self._compute_percentiles(clean, profile)
227
- self._compute_spread(clean, profile)
228
- self._compute_shape(clean, profile)
229
- self._check_scale_anomaly(profile)
230
-
231
- return profile
232
-
233
271
  # ------------------------------------------------------------------
234
- # Step 1: Central tendency
235
- # ------------------------------------------------------------------
236
-
237
- @staticmethod
238
- def _compute_central_tendency(
239
- clean: pl.Series,
240
- profile: NumericStats,
241
- ) -> None:
242
- mean = float(clean.mean()) # type: ignore[arg-type]
243
- median = float(clean.median()) # type: ignore[arg-type]
244
-
245
- profile.mean = mean
246
- profile.median = median
247
-
248
- # Mean/median ratio: primary skew indicator at a glance.
249
- # Guard against division by zero (e.g. a column of all zeros).
250
- if median == 0.0:
251
- profile.mean_median_ratio = float("inf") if mean != 0.0 else 1.0
252
- else:
253
- profile.mean_median_ratio = mean / median
254
-
255
- # ------------------------------------------------------------------
256
- # Step 2: Spread
257
- # ------------------------------------------------------------------
258
-
259
- @staticmethod
260
- def _compute_spread(
261
- clean: pl.Series,
262
- profile: NumericStats,
263
- ) -> None:
264
- n = clean.len()
265
- if n < 2:
266
- # Std / variance undefined for a single observation
267
- profile.std = 0.0
268
- profile.variance = 0.0
269
- return
270
-
271
- std = float(clean.std(ddof=1)) # type: ignore[arg-type]
272
- profile.std = std
273
- profile.variance = std**2
274
-
275
- # ------------------------------------------------------------------
276
- # Step 3: Shape — skewness and kurtosis
272
+ # Step 2: Shape — skewness and kurtosis
277
273
  # ------------------------------------------------------------------
278
274
 
279
275
  @staticmethod
@@ -315,48 +311,7 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
315
311
  profile.kurtosis_tag = KurtosisTag.Mesokurtic
316
312
 
317
313
  # ------------------------------------------------------------------
318
- # Step 4: Range
319
- # ------------------------------------------------------------------
320
-
321
- @staticmethod
322
- def _compute_range(
323
- clean: pl.Series,
324
- profile: NumericStats,
325
- ) -> None:
326
- profile.min = float(clean.min()) # type: ignore[arg-type]
327
- profile.max = float(clean.max()) # type: ignore[arg-type]
328
-
329
- # ------------------------------------------------------------------
330
- # Step 5: Percentiles
331
- # ------------------------------------------------------------------
332
-
333
- @staticmethod
334
- def _compute_percentiles(
335
- clean: pl.Series,
336
- profile: NumericStats,
337
- ) -> None:
338
- # Polars quantile() is O(n log n) once; compute all at once via select
339
- # to avoid repeated passes.
340
- quantile_frame = pl.DataFrame({"v": clean}).select(
341
- [
342
- pl.col("v").quantile(q, interpolation="linear").alias(f"q{i}")
343
- for i, q in enumerate(_QUANTILE_LEVELS)
344
- ]
345
- )
346
- row = quantile_frame.row(0)
347
- # row order: p1, p5, p25, p50, p75, p95, p99
348
- profile.percentiles = PercentileSnapshot(
349
- p1=row[0],
350
- p5=row[1],
351
- p25=row[2],
352
- p50=row[3],
353
- p75=row[4],
354
- p95=row[5],
355
- p99=row[6],
356
- )
357
-
358
- # ------------------------------------------------------------------
359
- # Step 6: Scale-anomaly flag
314
+ # Step 3: Scale-anomaly flag
360
315
  # ------------------------------------------------------------------
361
316
 
362
317
  @staticmethod
@@ -148,9 +148,11 @@ class TargetProfiler(DatasetLevelProfiler[TargetProfileResult]):
148
148
  """Generates numeric metrics and checks for target skewness."""
149
149
  num_profiler = NumericProfiler(config=self.config)
150
150
 
151
- num_profile = num_profiler._profile_column(series, n_rows)
151
+ col_name = series.name
152
+ num_result = num_profiler.profile(series.to_frame(), [col_name])
153
+ num_profile = num_result.columns.get(col_name)
152
154
  result.numeric_profile = num_profile
153
155
 
154
156
  # Flag Skewness (Highly skewed targets often require Log/Yeo-Johnson transforms)
155
- if num_profile.skewness_severity in (SkewSeverity.High, SkewSeverity.Severe):
157
+ if num_profile and num_profile.skewness_severity in (SkewSeverity.High, SkewSeverity.Severe):
156
158
  result.flags.append(TargetFlag.HighlySkewed)
@@ -35,10 +35,11 @@ _IDENTIFIER_UNIQUE_RATIO = 0.99 # >99 % unique → identifier
35
35
  _IDENTIFIER_MAX_MEDIAN_LENGTH = 40
36
36
  _DISCRETE_NUNIQUE_THRESHOLD = 20 # numeric with <20 unique values → discrete
37
37
 
38
- _FREE_TEXT_AVG_WORDS: int = 5 # avg word count above which → Text
39
- _FREE_TEXT_MEDIAN_CHARS: int = 35
40
- _FREE_TEXT_P90_CHARS: int = 60
38
+ _FREE_TEXT_AVG_WORDS: int = 3
39
+ _FREE_TEXT_MEDIAN_CHARS: int = 20
40
+ _FREE_TEXT_P90_CHARS: int = 35
41
41
  _FREE_TEXT_MIN_UNIQUE_RATIO: float = 0.40
42
+ _FREE_TEXT_HIGH_UNIQUE_WITH_SPACES: float = 0.70 # unique ratio above which multi-token strings → Text
42
43
 
43
44
 
44
45
  # Common boolean string values (lowercased)
@@ -77,115 +78,87 @@ class TypeDetector:
77
78
  original_dtype=original_dtype,
78
79
  inferred_dtype=original_dtype,
79
80
  )
80
-
81
- # Work with a copy that may be re-assigned after coercion
82
81
  working = series
83
82
 
84
83
  # 1 & 2: Coercion for string columns
85
- if series.dtype == pl.Utf8 or series.dtype == pl.String:
84
+ if series.dtype in (pl.Utf8, pl.String):
86
85
  coerced, flag = self._try_numeric_coerce(series, n_rows)
87
86
  if coerced is not None:
88
87
  info.inferred_dtype = str(coerced.dtype)
89
88
  info.flags.append(flag) # type: ignore[arg-type]
90
89
  working = coerced
91
-
92
- self._check_coerced_encoded_category(working, info, n_rows)
90
+ self._check_coerced_encoded_category(working, info)
93
91
  else:
94
92
  coerced_dt, flag_dt = self._try_datetime_coerce(
95
- series, col_name, n_rows
93
+ series, n_rows
96
94
  )
97
95
  if coerced_dt is not None:
98
96
  info.inferred_dtype = str(coerced_dt.dtype)
99
97
  info.flags.append(flag_dt) # type: ignore[arg-type]
100
- working = coerced_dt
101
-
102
98
  info.semantic_type = SemanticType.Datetime
103
99
  results[col_name] = info
104
100
  continue
105
101
 
106
102
  # 3: Boolean candidate
107
103
  self._check_boolean_candidate(working, info)
104
+ if TypeFlag.BooleanCandidate in info.flags:
105
+ info.semantic_type = SemanticType.Boolean
106
+ results[col_name] = info
107
+ continue
108
+
109
+ # Native datetime types
110
+ if working.dtype in (pl.Date, pl.Datetime, pl.Duration, pl.Time) or isinstance(
111
+ working.dtype, pl.Datetime
112
+ ):
113
+ info.semantic_type = SemanticType.Datetime
114
+ results[col_name] = info
115
+ continue
108
116
 
109
- # Work only on numeric-ish columns for the remaining checks
117
+ # 4–7: Numeric path
110
118
  if working.dtype in _NUMERIC_DTYPES:
111
- # 4 & 5: Encoded category and identifier checks — integers only.
112
- # Continuous floats have high cardinality by nature and are never
113
- # identifiers; restricting these checks prevents false Identifier
114
- # classification of genuine numeric features.
115
119
  if working.dtype in _INT_DTYPES:
116
- self._check_encoded_category(working, info, n_rows)
117
- self._check_identifier(working, info, n_rows)
118
-
119
- # 6: Sequential index (integers only)
120
- if working.dtype in _INT_DTYPES or working.dtype in (
121
- pl.Float32,
122
- pl.Float64,
123
- ):
124
- self._check_sequential_index(working, info, n_rows)
125
-
126
- # 7: Numeric kind (skip for identifiers / sequential indices)
127
- if not any(
128
- info.has_flag(f)
129
- for f in (
130
- TypeFlag.IdentifierColumn,
131
- TypeFlag.SequentialIndex,
132
- TypeFlag.FloatSequentialIndex,
133
- )
134
- ):
120
+ # EncodedCategory and IdentifierColumn are mutually exclusive:
121
+ # low-cardinality and near-unique cannot both be true.
122
+ # Check encoded category first; skip identifier if it matches.
123
+ self._check_encoded_category(working, info)
124
+ if TypeFlag.EncodedCategory not in info.flags:
125
+ self._check_identifier(working, info, n_rows)
126
+ if TypeFlag.IdentifierColumn in info.flags:
127
+ self._check_sequential_index(working, info, n_rows)
128
+
129
+ if TypeFlag.EncodedCategory in info.flags:
130
+ info.semantic_type = SemanticType.Categorical
131
+ elif TypeFlag.IdentifierColumn in info.flags:
132
+ info.semantic_type = SemanticType.Identifier
133
+ else:
135
134
  self._classify_numeric_kind(working, info)
135
+ info.semantic_type = SemanticType.Numeric
136
136
 
137
- elif working.dtype == pl.Utf8 or working.dtype == pl.String:
138
- # String identifier check
139
- self._check_identifier(working, info, n_rows)
137
+ results[col_name] = info
138
+ continue
140
139
 
140
+ # String path
141
+ if working.dtype in (pl.Utf8, pl.String):
141
142
  self._check_free_text(working, info, n_rows)
142
-
143
- info.semantic_type = self._derive_semantic_type(
144
- info,
145
- working,
146
- n_rows,
147
- )
148
-
143
+ if TypeFlag.FreeTextCandidate in info.flags:
144
+ info.semantic_type = SemanticType.Text
145
+ results[col_name] = info
146
+ continue
147
+ self._check_identifier(working, info, n_rows)
148
+ info.semantic_type = (
149
+ SemanticType.Identifier
150
+ if TypeFlag.IdentifierColumn in info.flags
151
+ else SemanticType.Categorical
152
+ )
153
+ results[col_name] = info
154
+ continue
155
+
156
+ # Fallback
157
+ info.semantic_type = SemanticType.Text
149
158
  results[col_name] = info
150
159
 
151
160
  return results
152
161
 
153
- @staticmethod
154
- def _derive_semantic_type(
155
- info: ColumnTypeInfo,
156
- working: pl.Series,
157
- n_rows: int,
158
- ) -> SemanticType:
159
- if TypeFlag.IdentifierColumn in info.flags:
160
- return SemanticType.Identifier
161
-
162
- if TypeFlag.BooleanCandidate in info.flags:
163
- return SemanticType.Boolean
164
-
165
- is_native_datetime = working.dtype in (
166
- pl.Date,
167
- pl.Datetime,
168
- pl.Duration,
169
- pl.Time,
170
- ) or (hasattr(pl, "Datetime") and isinstance(working.dtype, pl.Datetime))
171
-
172
- if is_native_datetime or TypeFlag.DatetimeCoerced in info.flags:
173
- return SemanticType.Datetime
174
-
175
- if TypeFlag.EncodedCategory in info.flags:
176
- return SemanticType.Categorical
177
-
178
- if working.dtype in (pl.Utf8, pl.String):
179
- if TypeFlag.FreeTextCandidate in info.flags:
180
- return SemanticType.Text
181
-
182
- return SemanticType.Categorical
183
-
184
- if working.dtype in _NUMERIC_DTYPES:
185
- return SemanticType.Numeric
186
-
187
- return SemanticType.Categorical
188
-
189
162
  # ------------------------------------------------------------------
190
163
  # Step 1: Numeric coercion
191
164
  # ------------------------------------------------------------------
@@ -221,7 +194,7 @@ class TypeDetector:
221
194
 
222
195
  @staticmethod
223
196
  def _try_datetime_coerce(
224
- series: pl.Series, col_name: str, n_rows: int
197
+ series: pl.Series, n_rows: int
225
198
  ) -> tuple[pl.Series, TypeFlag] | tuple[None, None]:
226
199
  """
227
200
  Attempt datetime coercion if the column name looks date-like.
@@ -269,7 +242,7 @@ class TypeDetector:
269
242
 
270
243
  @staticmethod
271
244
  def _check_coerced_encoded_category(
272
- series: pl.Series, info: ColumnTypeInfo, n_rows: int
245
+ series: pl.Series, info: ColumnTypeInfo
273
246
  ) -> None:
274
247
  """
275
248
  Post-coercion low-cardinality check for Float64 series that originated
@@ -312,9 +285,8 @@ class TypeDetector:
312
285
 
313
286
  @staticmethod
314
287
  def _check_encoded_category(
315
- series: pl.Series, info: ColumnTypeInfo, n_rows: int
288
+ series: pl.Series, info: ColumnTypeInfo
316
289
  ) -> None:
317
- # Skip if already flagged as boolean candidate (subset of {0,1})
318
290
  if TypeFlag.BooleanCandidate in info.flags:
319
291
  return
320
292
 
@@ -357,16 +329,17 @@ class TypeDetector:
357
329
  return
358
330
 
359
331
  if series.dtype in (pl.Utf8, pl.String):
360
- lengths = series.drop_nulls().str.len_chars()
361
- if lengths.len() == 0:
332
+ non_null = series.drop_nulls()
333
+ if non_null.len() == 0:
362
334
  return
363
335
 
364
- median_length = lengths.median()
336
+ median_length = non_null.str.len_chars().median()
337
+ if median_length is not None and median_length > _IDENTIFIER_MAX_MEDIAN_LENGTH:
338
+ return
365
339
 
366
- if (
367
- median_length is not None
368
- and median_length > _IDENTIFIER_MAX_MEDIAN_LENGTH
369
- ):
340
+ # Real identifiers are single tokens — no spaces.
341
+ # Sentences and descriptions have median_spaces > 0.
342
+ if float(non_null.str.count_matches(r"\s+").median() or 0.0) > 0:
370
343
  return
371
344
 
372
345
  info.flags.append(TypeFlag.IdentifierColumn)
@@ -440,24 +413,27 @@ class TypeDetector:
440
413
 
441
414
  char_lengths = non_null.str.len_chars()
442
415
  median_chars = float(char_lengths.median() or 0.0)
416
+ space_counts = non_null.str.count_matches(r"\s+")
417
+ median_spaces = float(space_counts.median() or 0.0)
418
+ median_words = median_spaces + 1.0
419
+ unique_ratio = series.n_unique() / n_rows if n_rows > 0 else 0.0
443
420
 
444
- if median_chars > _FREE_TEXT_MEDIAN_CHARS:
421
+ # Multi-word strings of medium length: names, addresses, short descriptions
422
+ if median_chars > _FREE_TEXT_MEDIAN_CHARS and median_spaces >= 1.0:
445
423
  info.flags.append(TypeFlag.FreeTextCandidate)
446
424
  return
447
425
 
448
- space_counts = non_null.str.count_matches(r"\s+")
449
- median_words = float(space_counts.median() or 0.0) + 1.0
450
-
426
+ # Long average word count: sentences, paragraphs
451
427
  if median_words > _FREE_TEXT_AVG_WORDS:
452
428
  info.flags.append(TypeFlag.FreeTextCandidate)
453
429
  return
454
430
 
455
431
  p90_chars = float(char_lengths.quantile(0.9) or 0.0)
432
+ if p90_chars > _FREE_TEXT_P90_CHARS and unique_ratio > _FREE_TEXT_MIN_UNIQUE_RATIO:
433
+ info.flags.append(TypeFlag.FreeTextCandidate)
434
+ return
456
435
 
457
- unique_ratio = series.n_unique() / n_rows if n_rows > 0 else 0.0
458
-
459
- if (
460
- p90_chars > _FREE_TEXT_P90_CHARS
461
- and unique_ratio > _FREE_TEXT_MIN_UNIQUE_RATIO
462
- ):
436
+ # High-cardinality multi-token strings that don't meet char thresholds:
437
+ # e.g. short full names like "John Smith", compound tokens
438
+ if unique_ratio >= _FREE_TEXT_HIGH_UNIQUE_WITH_SPACES and median_spaces >= 1.0:
463
439
  info.flags.append(TypeFlag.FreeTextCandidate)
@@ -199,7 +199,7 @@ class StructuralProfiler:
199
199
 
200
200
  # 8a. Feature-feature matrices — computed ONCE, target-independent.
201
201
  feature_corr = corr_profiler.profile_features(
202
- data, numeric_cols
202
+ data, numeric_cols, categorical_cols
203
203
  )
204
204
  result.dataset.feature_correlation = feature_corr
205
205
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dataforge-ml
3
- Version: 0.5.0
3
+ Version: 0.7.0
4
4
  Summary: A automated feature engineering and designing pipeline library
5
5
  License: MIT
6
6
  Classifier: License :: OSI Approved :: MIT License
File without changes
File without changes
File without changes