dataforge-ml 0.6.0__tar.gz → 0.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/PKG-INFO +1 -1
  2. {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/pyproject.toml +1 -1
  3. {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_correlation_profiler.py +7 -1
  4. {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_type_detector.py +71 -96
  5. {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml.egg-info/PKG-INFO +1 -1
  6. {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/LICENSE +0 -0
  7. {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/README.md +0 -0
  8. {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/setup.cfg +0 -0
  9. {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/__init__.py +0 -0
  10. {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/models/__init__.py +0 -0
  11. {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/models/_data_structure.py +0 -0
  12. {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/models/_data_types.py +0 -0
  13. {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/__init__.py +0 -0
  14. {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_base.py +0 -0
  15. {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_boolean_config.py +0 -0
  16. {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_boolean_profiler.py +0 -0
  17. {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_categorical.py +0 -0
  18. {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_categorical_config.py +0 -0
  19. {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_correlation_config.py +0 -0
  20. {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_datetime_config.py +0 -0
  21. {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_datetime_profiler.py +0 -0
  22. {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_missingness_config.py +0 -0
  23. {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_missingness_profiler.py +0 -0
  24. {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_numeric_config.py +0 -0
  25. {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_numeric_profiler.py +0 -0
  26. {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_tabular.py +0 -0
  27. {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_target_config.py +0 -0
  28. {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_target_profiler.py +0 -0
  29. {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_text_config.py +0 -0
  30. {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_text_profiler.py +0 -0
  31. {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/config.py +0 -0
  32. {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/structural.py +0 -0
  33. {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/splitting/__init__.py +0 -0
  34. {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/splitting/_config.py +0 -0
  35. {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/splitting/_splitter.py +0 -0
  36. {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/utils/__init__.py +0 -0
  37. {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/utils/data_loader.py +0 -0
  38. {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml.egg-info/SOURCES.txt +0 -0
  39. {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml.egg-info/dependency_links.txt +0 -0
  40. {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml.egg-info/requires.txt +0 -0
  41. {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dataforge-ml
3
- Version: 0.6.0
3
+ Version: 0.7.0
4
4
  Summary: A automated feature engineering and designing pipeline library
5
5
  License: MIT
6
6
  Classifier: License :: OSI Approved :: MIT License
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "dataforge-ml"
7
- version = "0.6.0"
7
+ version = "0.7.0"
8
8
  description = "A automated feature engineering and designing pipeline library"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -400,7 +400,13 @@ class CorrelationProfiler(DatasetLevelProfiler[CorrelationProfileResult]):
400
400
  phi2_corr = max(0.0, phi2 - (r - 1) * (c - 1) / (n - 1))
401
401
  r_corr = r - (r - 1) ** 2 / (n - 1)
402
402
  c_corr = c - (c - 1) ** 2 / (n - 1)
403
- v = float(np.sqrt(phi2_corr / min(r_corr - 1, c_corr - 1)))
403
+ denom = min(r_corr - 1, c_corr - 1)
404
+ if denom <= 0:
405
+ # Near-saturated contingency table (n_unique ≈ n_rows):
406
+ # bias correction collapses denominator; skip the pair.
407
+ pairs.append(CramerVPair(col_a=col_a, col_b=col_b))
408
+ continue
409
+ v = float(np.sqrt(phi2_corr / denom))
404
410
  v = max(0.0, min(1.0, v))
405
411
  except Exception as exc:
406
412
  warnings.warn(
@@ -35,10 +35,11 @@ _IDENTIFIER_UNIQUE_RATIO = 0.99 # >99 % unique → identifier
35
35
  _IDENTIFIER_MAX_MEDIAN_LENGTH = 40
36
36
  _DISCRETE_NUNIQUE_THRESHOLD = 20 # numeric with <20 unique values → discrete
37
37
 
38
- _FREE_TEXT_AVG_WORDS: int = 5 # avg word count above which → Text
39
- _FREE_TEXT_MEDIAN_CHARS: int = 35
40
- _FREE_TEXT_P90_CHARS: int = 60
38
+ _FREE_TEXT_AVG_WORDS: int = 3
39
+ _FREE_TEXT_MEDIAN_CHARS: int = 20
40
+ _FREE_TEXT_P90_CHARS: int = 35
41
41
  _FREE_TEXT_MIN_UNIQUE_RATIO: float = 0.40
42
+ _FREE_TEXT_HIGH_UNIQUE_WITH_SPACES: float = 0.70 # unique ratio above which multi-token strings → Text
42
43
 
43
44
 
44
45
  # Common boolean string values (lowercased)
@@ -77,115 +78,87 @@ class TypeDetector:
77
78
  original_dtype=original_dtype,
78
79
  inferred_dtype=original_dtype,
79
80
  )
80
-
81
- # Work with a copy that may be re-assigned after coercion
82
81
  working = series
83
82
 
84
83
  # 1 & 2: Coercion for string columns
85
- if series.dtype == pl.Utf8 or series.dtype == pl.String:
84
+ if series.dtype in (pl.Utf8, pl.String):
86
85
  coerced, flag = self._try_numeric_coerce(series, n_rows)
87
86
  if coerced is not None:
88
87
  info.inferred_dtype = str(coerced.dtype)
89
88
  info.flags.append(flag) # type: ignore[arg-type]
90
89
  working = coerced
91
-
92
- self._check_coerced_encoded_category(working, info, n_rows)
90
+ self._check_coerced_encoded_category(working, info)
93
91
  else:
94
92
  coerced_dt, flag_dt = self._try_datetime_coerce(
95
- series, col_name, n_rows
93
+ series, n_rows
96
94
  )
97
95
  if coerced_dt is not None:
98
96
  info.inferred_dtype = str(coerced_dt.dtype)
99
97
  info.flags.append(flag_dt) # type: ignore[arg-type]
100
- working = coerced_dt
101
-
102
98
  info.semantic_type = SemanticType.Datetime
103
99
  results[col_name] = info
104
100
  continue
105
101
 
106
102
  # 3: Boolean candidate
107
103
  self._check_boolean_candidate(working, info)
108
-
109
- # Work only on numeric-ish columns for the remaining checks
104
+ if TypeFlag.BooleanCandidate in info.flags:
105
+ info.semantic_type = SemanticType.Boolean
106
+ results[col_name] = info
107
+ continue
108
+
109
+ # Native datetime types
110
+ if working.dtype in (pl.Date, pl.Datetime, pl.Duration, pl.Time) or isinstance(
111
+ working.dtype, pl.Datetime
112
+ ):
113
+ info.semantic_type = SemanticType.Datetime
114
+ results[col_name] = info
115
+ continue
116
+
117
+ # 4–7: Numeric path
110
118
  if working.dtype in _NUMERIC_DTYPES:
111
- # 4 & 5: Encoded category and identifier checks — integers only.
112
- # Continuous floats have high cardinality by nature and are never
113
- # identifiers; restricting these checks prevents false Identifier
114
- # classification of genuine numeric features.
115
119
  if working.dtype in _INT_DTYPES:
116
- self._check_encoded_category(working, info, n_rows)
117
- self._check_identifier(working, info, n_rows)
118
-
119
- # 6: Sequential index (integers only)
120
- if working.dtype in _INT_DTYPES or working.dtype in (
121
- pl.Float32,
122
- pl.Float64,
123
- ):
124
- self._check_sequential_index(working, info, n_rows)
125
-
126
- # 7: Numeric kind (skip for identifiers / sequential indices)
127
- if not any(
128
- info.has_flag(f)
129
- for f in (
130
- TypeFlag.IdentifierColumn,
131
- TypeFlag.SequentialIndex,
132
- TypeFlag.FloatSequentialIndex,
133
- )
134
- ):
120
+ # EncodedCategory and IdentifierColumn are mutually exclusive:
121
+ # low-cardinality and near-unique cannot both be true.
122
+ # Check encoded category first; skip identifier if it matches.
123
+ self._check_encoded_category(working, info)
124
+ if TypeFlag.EncodedCategory not in info.flags:
125
+ self._check_identifier(working, info, n_rows)
126
+ if TypeFlag.IdentifierColumn in info.flags:
127
+ self._check_sequential_index(working, info, n_rows)
128
+
129
+ if TypeFlag.EncodedCategory in info.flags:
130
+ info.semantic_type = SemanticType.Categorical
131
+ elif TypeFlag.IdentifierColumn in info.flags:
132
+ info.semantic_type = SemanticType.Identifier
133
+ else:
135
134
  self._classify_numeric_kind(working, info)
135
+ info.semantic_type = SemanticType.Numeric
136
136
 
137
- elif working.dtype == pl.Utf8 or working.dtype == pl.String:
138
- # String identifier check
139
- self._check_identifier(working, info, n_rows)
137
+ results[col_name] = info
138
+ continue
140
139
 
140
+ # String path
141
+ if working.dtype in (pl.Utf8, pl.String):
141
142
  self._check_free_text(working, info, n_rows)
142
-
143
- info.semantic_type = self._derive_semantic_type(
144
- info,
145
- working,
146
- n_rows,
147
- )
148
-
143
+ if TypeFlag.FreeTextCandidate in info.flags:
144
+ info.semantic_type = SemanticType.Text
145
+ results[col_name] = info
146
+ continue
147
+ self._check_identifier(working, info, n_rows)
148
+ info.semantic_type = (
149
+ SemanticType.Identifier
150
+ if TypeFlag.IdentifierColumn in info.flags
151
+ else SemanticType.Categorical
152
+ )
153
+ results[col_name] = info
154
+ continue
155
+
156
+ # Fallback
157
+ info.semantic_type = SemanticType.Text
149
158
  results[col_name] = info
150
159
 
151
160
  return results
152
161
 
153
- @staticmethod
154
- def _derive_semantic_type(
155
- info: ColumnTypeInfo,
156
- working: pl.Series,
157
- n_rows: int,
158
- ) -> SemanticType:
159
- if TypeFlag.IdentifierColumn in info.flags:
160
- return SemanticType.Identifier
161
-
162
- if TypeFlag.BooleanCandidate in info.flags:
163
- return SemanticType.Boolean
164
-
165
- is_native_datetime = working.dtype in (
166
- pl.Date,
167
- pl.Datetime,
168
- pl.Duration,
169
- pl.Time,
170
- ) or (hasattr(pl, "Datetime") and isinstance(working.dtype, pl.Datetime))
171
-
172
- if is_native_datetime or TypeFlag.DatetimeCoerced in info.flags:
173
- return SemanticType.Datetime
174
-
175
- if TypeFlag.EncodedCategory in info.flags:
176
- return SemanticType.Categorical
177
-
178
- if working.dtype in (pl.Utf8, pl.String):
179
- if TypeFlag.FreeTextCandidate in info.flags:
180
- return SemanticType.Text
181
-
182
- return SemanticType.Categorical
183
-
184
- if working.dtype in _NUMERIC_DTYPES:
185
- return SemanticType.Numeric
186
-
187
- return SemanticType.Categorical
188
-
189
162
  # ------------------------------------------------------------------
190
163
  # Step 1: Numeric coercion
191
164
  # ------------------------------------------------------------------
@@ -221,7 +194,7 @@ class TypeDetector:
221
194
 
222
195
  @staticmethod
223
196
  def _try_datetime_coerce(
224
- series: pl.Series, col_name: str, n_rows: int
197
+ series: pl.Series, n_rows: int
225
198
  ) -> tuple[pl.Series, TypeFlag] | tuple[None, None]:
226
199
  """
227
200
  Attempt datetime coercion if the column name looks date-like.
@@ -269,7 +242,7 @@ class TypeDetector:
269
242
 
270
243
  @staticmethod
271
244
  def _check_coerced_encoded_category(
272
- series: pl.Series, info: ColumnTypeInfo, n_rows: int
245
+ series: pl.Series, info: ColumnTypeInfo
273
246
  ) -> None:
274
247
  """
275
248
  Post-coercion low-cardinality check for Float64 series that originated
@@ -312,9 +285,8 @@ class TypeDetector:
312
285
 
313
286
  @staticmethod
314
287
  def _check_encoded_category(
315
- series: pl.Series, info: ColumnTypeInfo, n_rows: int
288
+ series: pl.Series, info: ColumnTypeInfo
316
289
  ) -> None:
317
- # Skip if already flagged as boolean candidate (subset of {0,1})
318
290
  if TypeFlag.BooleanCandidate in info.flags:
319
291
  return
320
292
 
@@ -441,24 +413,27 @@ class TypeDetector:
441
413
 
442
414
  char_lengths = non_null.str.len_chars()
443
415
  median_chars = float(char_lengths.median() or 0.0)
416
+ space_counts = non_null.str.count_matches(r"\s+")
417
+ median_spaces = float(space_counts.median() or 0.0)
418
+ median_words = median_spaces + 1.0
419
+ unique_ratio = series.n_unique() / n_rows if n_rows > 0 else 0.0
444
420
 
445
- if median_chars > _FREE_TEXT_MEDIAN_CHARS:
421
+ # Multi-word strings of medium length: names, addresses, short descriptions
422
+ if median_chars > _FREE_TEXT_MEDIAN_CHARS and median_spaces >= 1.0:
446
423
  info.flags.append(TypeFlag.FreeTextCandidate)
447
424
  return
448
425
 
449
- space_counts = non_null.str.count_matches(r"\s+")
450
- median_words = float(space_counts.median() or 0.0) + 1.0
451
-
426
+ # High median word count: sentences, paragraphs
452
427
  if median_words > _FREE_TEXT_AVG_WORDS:
453
428
  info.flags.append(TypeFlag.FreeTextCandidate)
454
429
  return
455
430
 
456
431
  p90_chars = float(char_lengths.quantile(0.9) or 0.0)
432
+ if p90_chars > _FREE_TEXT_P90_CHARS and unique_ratio > _FREE_TEXT_MIN_UNIQUE_RATIO:
433
+ info.flags.append(TypeFlag.FreeTextCandidate)
434
+ return
457
435
 
458
- unique_ratio = series.n_unique() / n_rows if n_rows > 0 else 0.0
459
-
460
- if (
461
- p90_chars > _FREE_TEXT_P90_CHARS
462
- and unique_ratio > _FREE_TEXT_MIN_UNIQUE_RATIO
463
- ):
436
+ # High-cardinality multi-token strings that don't meet char thresholds:
437
+ # e.g. short full names like "John Smith", compound tokens
438
+ if unique_ratio >= _FREE_TEXT_HIGH_UNIQUE_WITH_SPACES and median_spaces >= 1.0:
464
439
  info.flags.append(TypeFlag.FreeTextCandidate)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dataforge-ml
3
- Version: 0.6.0
3
+ Version: 0.7.0
4
4
  Summary: A automated feature engineering and designing pipeline library
5
5
  License: MIT
6
6
  Classifier: License :: OSI Approved :: MIT License
File without changes
File without changes
File without changes