dataforge-ml 0.6.0__tar.gz → 0.8.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. {dataforge_ml-0.6.0 → dataforge_ml-0.8.0}/PKG-INFO +1 -1
  2. {dataforge_ml-0.6.0 → dataforge_ml-0.8.0}/pyproject.toml +1 -1
  3. {dataforge_ml-0.6.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_correlation_profiler.py +7 -1
  4. {dataforge_ml-0.6.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_missingness_profiler.py +3 -1
  5. {dataforge_ml-0.6.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_numeric_profiler.py +3 -2
  6. {dataforge_ml-0.6.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_type_detector.py +71 -96
  7. {dataforge_ml-0.6.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/config.py +29 -0
  8. {dataforge_ml-0.6.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/structural.py +5 -1
  9. {dataforge_ml-0.6.0 → dataforge_ml-0.8.0}/src/dataforge_ml/utils/data_loader.py +1 -3
  10. {dataforge_ml-0.6.0 → dataforge_ml-0.8.0}/src/dataforge_ml.egg-info/PKG-INFO +1 -1
  11. {dataforge_ml-0.6.0 → dataforge_ml-0.8.0}/LICENSE +0 -0
  12. {dataforge_ml-0.6.0 → dataforge_ml-0.8.0}/README.md +0 -0
  13. {dataforge_ml-0.6.0 → dataforge_ml-0.8.0}/setup.cfg +0 -0
  14. {dataforge_ml-0.6.0 → dataforge_ml-0.8.0}/src/dataforge_ml/__init__.py +0 -0
  15. {dataforge_ml-0.6.0 → dataforge_ml-0.8.0}/src/dataforge_ml/models/__init__.py +0 -0
  16. {dataforge_ml-0.6.0 → dataforge_ml-0.8.0}/src/dataforge_ml/models/_data_structure.py +0 -0
  17. {dataforge_ml-0.6.0 → dataforge_ml-0.8.0}/src/dataforge_ml/models/_data_types.py +0 -0
  18. {dataforge_ml-0.6.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/__init__.py +0 -0
  19. {dataforge_ml-0.6.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_base.py +0 -0
  20. {dataforge_ml-0.6.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_boolean_config.py +0 -0
  21. {dataforge_ml-0.6.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_boolean_profiler.py +0 -0
  22. {dataforge_ml-0.6.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_categorical.py +0 -0
  23. {dataforge_ml-0.6.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_categorical_config.py +0 -0
  24. {dataforge_ml-0.6.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_correlation_config.py +0 -0
  25. {dataforge_ml-0.6.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_datetime_config.py +0 -0
  26. {dataforge_ml-0.6.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_datetime_profiler.py +0 -0
  27. {dataforge_ml-0.6.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_missingness_config.py +0 -0
  28. {dataforge_ml-0.6.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_numeric_config.py +0 -0
  29. {dataforge_ml-0.6.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_tabular.py +0 -0
  30. {dataforge_ml-0.6.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_target_config.py +0 -0
  31. {dataforge_ml-0.6.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_target_profiler.py +0 -0
  32. {dataforge_ml-0.6.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_text_config.py +0 -0
  33. {dataforge_ml-0.6.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_text_profiler.py +0 -0
  34. {dataforge_ml-0.6.0 → dataforge_ml-0.8.0}/src/dataforge_ml/splitting/__init__.py +0 -0
  35. {dataforge_ml-0.6.0 → dataforge_ml-0.8.0}/src/dataforge_ml/splitting/_config.py +0 -0
  36. {dataforge_ml-0.6.0 → dataforge_ml-0.8.0}/src/dataforge_ml/splitting/_splitter.py +0 -0
  37. {dataforge_ml-0.6.0 → dataforge_ml-0.8.0}/src/dataforge_ml/utils/__init__.py +0 -0
  38. {dataforge_ml-0.6.0 → dataforge_ml-0.8.0}/src/dataforge_ml.egg-info/SOURCES.txt +0 -0
  39. {dataforge_ml-0.6.0 → dataforge_ml-0.8.0}/src/dataforge_ml.egg-info/dependency_links.txt +0 -0
  40. {dataforge_ml-0.6.0 → dataforge_ml-0.8.0}/src/dataforge_ml.egg-info/requires.txt +0 -0
  41. {dataforge_ml-0.6.0 → dataforge_ml-0.8.0}/src/dataforge_ml.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dataforge-ml
3
- Version: 0.6.0
3
+ Version: 0.8.0
4
4
  Summary: A automated feature engineering and designing pipeline library
5
5
  License: MIT
6
6
  Classifier: License :: OSI Approved :: MIT License
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "dataforge-ml"
7
- version = "0.6.0"
7
+ version = "0.8.0"
8
8
  description = "A automated feature engineering and designing pipeline library"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -400,7 +400,13 @@ class CorrelationProfiler(DatasetLevelProfiler[CorrelationProfileResult]):
400
400
  phi2_corr = max(0.0, phi2 - (r - 1) * (c - 1) / (n - 1))
401
401
  r_corr = r - (r - 1) ** 2 / (n - 1)
402
402
  c_corr = c - (c - 1) ** 2 / (n - 1)
403
- v = float(np.sqrt(phi2_corr / min(r_corr - 1, c_corr - 1)))
403
+ denom = min(r_corr - 1, c_corr - 1)
404
+ if denom <= 0:
405
+ # Near-saturated contingency table (n_unique ≈ n_rows):
406
+ # bias correction collapses denominator; skip the pair.
407
+ pairs.append(CramerVPair(col_a=col_a, col_b=col_b))
408
+ continue
409
+ v = float(np.sqrt(phi2_corr / denom))
404
410
  v = max(0.0, min(1.0, v))
405
411
  except Exception as exc:
406
412
  warnings.warn(
@@ -208,7 +208,9 @@ class MissingnessProfiler(DatasetLevelProfiler[MissingnessProfileResult]):
208
208
 
209
209
  r = profile.effective_null_ratio
210
210
 
211
- if r < _SEVERITY_MINOR and r != 0:
211
+ if r == 0.0:
212
+ profile.severity = None
213
+ elif r < _SEVERITY_MINOR:
212
214
  profile.severity = MissingSeverity.Minor
213
215
  elif r < _SEVERITY_MODERATE:
214
216
  profile.severity = MissingSeverity.Moderate
@@ -254,16 +254,17 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
254
254
  for i in range(top_rows)
255
255
  ]
256
256
  else:
257
- # --- 20-Bin Histogram Distribution (Continuous) ---
257
+ # --- Histogram Distribution (Continuous) ---
258
258
  import numpy as np
259
259
 
260
260
  counts, bin_edges = np.histogram(clean_f64.to_numpy(), bins="auto")
261
+ n_clean = clean_f64.len()
261
262
  profile.histogram = [
262
263
  HistogramBin(
263
264
  lower_bound=float(bin_edges[i]),
264
265
  upper_bound=float(bin_edges[i + 1]),
265
266
  count=int(counts[i]),
266
- percentage=int(counts[i]) / n_rows if n_rows > 0 else 0.0,
267
+ percentage=int(counts[i]) / n_clean if n_clean > 0 else 0.0,
267
268
  )
268
269
  for i in range(len(counts))
269
270
  ]
@@ -35,10 +35,11 @@ _IDENTIFIER_UNIQUE_RATIO = 0.99 # >99 % unique → identifier
35
35
  _IDENTIFIER_MAX_MEDIAN_LENGTH = 40
36
36
  _DISCRETE_NUNIQUE_THRESHOLD = 20 # numeric with <20 unique values → discrete
37
37
 
38
- _FREE_TEXT_AVG_WORDS: int = 5 # avg word count above which → Text
39
- _FREE_TEXT_MEDIAN_CHARS: int = 35
40
- _FREE_TEXT_P90_CHARS: int = 60
38
+ _FREE_TEXT_AVG_WORDS: int = 3
39
+ _FREE_TEXT_MEDIAN_CHARS: int = 20
40
+ _FREE_TEXT_P90_CHARS: int = 35
41
41
  _FREE_TEXT_MIN_UNIQUE_RATIO: float = 0.40
42
+ _FREE_TEXT_HIGH_UNIQUE_WITH_SPACES: float = 0.70 # unique ratio above which multi-token strings → Text
42
43
 
43
44
 
44
45
  # Common boolean string values (lowercased)
@@ -77,115 +78,87 @@ class TypeDetector:
77
78
  original_dtype=original_dtype,
78
79
  inferred_dtype=original_dtype,
79
80
  )
80
-
81
- # Work with a copy that may be re-assigned after coercion
82
81
  working = series
83
82
 
84
83
  # 1 & 2: Coercion for string columns
85
- if series.dtype == pl.Utf8 or series.dtype == pl.String:
84
+ if series.dtype in (pl.Utf8, pl.String):
86
85
  coerced, flag = self._try_numeric_coerce(series, n_rows)
87
86
  if coerced is not None:
88
87
  info.inferred_dtype = str(coerced.dtype)
89
88
  info.flags.append(flag) # type: ignore[arg-type]
90
89
  working = coerced
91
-
92
- self._check_coerced_encoded_category(working, info, n_rows)
90
+ self._check_coerced_encoded_category(working, info)
93
91
  else:
94
92
  coerced_dt, flag_dt = self._try_datetime_coerce(
95
- series, col_name, n_rows
93
+ series, n_rows
96
94
  )
97
95
  if coerced_dt is not None:
98
96
  info.inferred_dtype = str(coerced_dt.dtype)
99
97
  info.flags.append(flag_dt) # type: ignore[arg-type]
100
- working = coerced_dt
101
-
102
98
  info.semantic_type = SemanticType.Datetime
103
99
  results[col_name] = info
104
100
  continue
105
101
 
106
102
  # 3: Boolean candidate
107
103
  self._check_boolean_candidate(working, info)
108
-
109
- # Work only on numeric-ish columns for the remaining checks
104
+ if TypeFlag.BooleanCandidate in info.flags:
105
+ info.semantic_type = SemanticType.Boolean
106
+ results[col_name] = info
107
+ continue
108
+
109
+ # Native datetime types
110
+ if working.dtype in (pl.Date, pl.Datetime, pl.Duration, pl.Time) or isinstance(
111
+ working.dtype, pl.Datetime
112
+ ):
113
+ info.semantic_type = SemanticType.Datetime
114
+ results[col_name] = info
115
+ continue
116
+
117
+ # 4–7: Numeric path
110
118
  if working.dtype in _NUMERIC_DTYPES:
111
- # 4 & 5: Encoded category and identifier checks — integers only.
112
- # Continuous floats have high cardinality by nature and are never
113
- # identifiers; restricting these checks prevents false Identifier
114
- # classification of genuine numeric features.
115
119
  if working.dtype in _INT_DTYPES:
116
- self._check_encoded_category(working, info, n_rows)
117
- self._check_identifier(working, info, n_rows)
118
-
119
- # 6: Sequential index (integers only)
120
- if working.dtype in _INT_DTYPES or working.dtype in (
121
- pl.Float32,
122
- pl.Float64,
123
- ):
124
- self._check_sequential_index(working, info, n_rows)
125
-
126
- # 7: Numeric kind (skip for identifiers / sequential indices)
127
- if not any(
128
- info.has_flag(f)
129
- for f in (
130
- TypeFlag.IdentifierColumn,
131
- TypeFlag.SequentialIndex,
132
- TypeFlag.FloatSequentialIndex,
133
- )
134
- ):
120
+ # EncodedCategory and IdentifierColumn are mutually exclusive:
121
+ # low-cardinality and near-unique cannot both be true.
122
+ # Check encoded category first; skip identifier if it matches.
123
+ self._check_encoded_category(working, info)
124
+ if TypeFlag.EncodedCategory not in info.flags:
125
+ self._check_identifier(working, info, n_rows)
126
+ if TypeFlag.IdentifierColumn in info.flags:
127
+ self._check_sequential_index(working, info, n_rows)
128
+
129
+ if TypeFlag.EncodedCategory in info.flags:
130
+ info.semantic_type = SemanticType.Categorical
131
+ elif TypeFlag.IdentifierColumn in info.flags:
132
+ info.semantic_type = SemanticType.Identifier
133
+ else:
135
134
  self._classify_numeric_kind(working, info)
135
+ info.semantic_type = SemanticType.Numeric
136
136
 
137
- elif working.dtype == pl.Utf8 or working.dtype == pl.String:
138
- # String identifier check
139
- self._check_identifier(working, info, n_rows)
137
+ results[col_name] = info
138
+ continue
140
139
 
140
+ # String path
141
+ if working.dtype in (pl.Utf8, pl.String):
141
142
  self._check_free_text(working, info, n_rows)
142
-
143
- info.semantic_type = self._derive_semantic_type(
144
- info,
145
- working,
146
- n_rows,
147
- )
148
-
143
+ if TypeFlag.FreeTextCandidate in info.flags:
144
+ info.semantic_type = SemanticType.Text
145
+ results[col_name] = info
146
+ continue
147
+ self._check_identifier(working, info, n_rows)
148
+ info.semantic_type = (
149
+ SemanticType.Identifier
150
+ if TypeFlag.IdentifierColumn in info.flags
151
+ else SemanticType.Categorical
152
+ )
153
+ results[col_name] = info
154
+ continue
155
+
156
+ # Fallback
157
+ info.semantic_type = SemanticType.Text
149
158
  results[col_name] = info
150
159
 
151
160
  return results
152
161
 
153
- @staticmethod
154
- def _derive_semantic_type(
155
- info: ColumnTypeInfo,
156
- working: pl.Series,
157
- n_rows: int,
158
- ) -> SemanticType:
159
- if TypeFlag.IdentifierColumn in info.flags:
160
- return SemanticType.Identifier
161
-
162
- if TypeFlag.BooleanCandidate in info.flags:
163
- return SemanticType.Boolean
164
-
165
- is_native_datetime = working.dtype in (
166
- pl.Date,
167
- pl.Datetime,
168
- pl.Duration,
169
- pl.Time,
170
- ) or (hasattr(pl, "Datetime") and isinstance(working.dtype, pl.Datetime))
171
-
172
- if is_native_datetime or TypeFlag.DatetimeCoerced in info.flags:
173
- return SemanticType.Datetime
174
-
175
- if TypeFlag.EncodedCategory in info.flags:
176
- return SemanticType.Categorical
177
-
178
- if working.dtype in (pl.Utf8, pl.String):
179
- if TypeFlag.FreeTextCandidate in info.flags:
180
- return SemanticType.Text
181
-
182
- return SemanticType.Categorical
183
-
184
- if working.dtype in _NUMERIC_DTYPES:
185
- return SemanticType.Numeric
186
-
187
- return SemanticType.Categorical
188
-
189
162
  # ------------------------------------------------------------------
190
163
  # Step 1: Numeric coercion
191
164
  # ------------------------------------------------------------------
@@ -221,7 +194,7 @@ class TypeDetector:
221
194
 
222
195
  @staticmethod
223
196
  def _try_datetime_coerce(
224
- series: pl.Series, col_name: str, n_rows: int
197
+ series: pl.Series, n_rows: int
225
198
  ) -> tuple[pl.Series, TypeFlag] | tuple[None, None]:
226
199
  """
227
200
  Attempt datetime coercion if the column name looks date-like.
@@ -269,7 +242,7 @@ class TypeDetector:
269
242
 
270
243
  @staticmethod
271
244
  def _check_coerced_encoded_category(
272
- series: pl.Series, info: ColumnTypeInfo, n_rows: int
245
+ series: pl.Series, info: ColumnTypeInfo
273
246
  ) -> None:
274
247
  """
275
248
  Post-coercion low-cardinality check for Float64 series that originated
@@ -312,9 +285,8 @@ class TypeDetector:
312
285
 
313
286
  @staticmethod
314
287
  def _check_encoded_category(
315
- series: pl.Series, info: ColumnTypeInfo, n_rows: int
288
+ series: pl.Series, info: ColumnTypeInfo
316
289
  ) -> None:
317
- # Skip if already flagged as boolean candidate (subset of {0,1})
318
290
  if TypeFlag.BooleanCandidate in info.flags:
319
291
  return
320
292
 
@@ -441,24 +413,27 @@ class TypeDetector:
441
413
 
442
414
  char_lengths = non_null.str.len_chars()
443
415
  median_chars = float(char_lengths.median() or 0.0)
416
+ space_counts = non_null.str.count_matches(r"\s+")
417
+ median_spaces = float(space_counts.median() or 0.0)
418
+ median_words = median_spaces + 1.0
419
+ unique_ratio = series.n_unique() / n_rows if n_rows > 0 else 0.0
444
420
 
445
- if median_chars > _FREE_TEXT_MEDIAN_CHARS:
421
+ # Multi-word strings of medium length: names, addresses, short descriptions
422
+ if median_chars > _FREE_TEXT_MEDIAN_CHARS and median_spaces >= 1.0:
446
423
  info.flags.append(TypeFlag.FreeTextCandidate)
447
424
  return
448
425
 
449
- space_counts = non_null.str.count_matches(r"\s+")
450
- median_words = float(space_counts.median() or 0.0) + 1.0
451
-
426
+ # Long average word count: sentences, paragraphs
452
427
  if median_words > _FREE_TEXT_AVG_WORDS:
453
428
  info.flags.append(TypeFlag.FreeTextCandidate)
454
429
  return
455
430
 
456
431
  p90_chars = float(char_lengths.quantile(0.9) or 0.0)
432
+ if p90_chars > _FREE_TEXT_P90_CHARS and unique_ratio > _FREE_TEXT_MIN_UNIQUE_RATIO:
433
+ info.flags.append(TypeFlag.FreeTextCandidate)
434
+ return
457
435
 
458
- unique_ratio = series.n_unique() / n_rows if n_rows > 0 else 0.0
459
-
460
- if (
461
- p90_chars > _FREE_TEXT_P90_CHARS
462
- and unique_ratio > _FREE_TEXT_MIN_UNIQUE_RATIO
463
- ):
436
+ # High-cardinality multi-token strings that don't meet char thresholds:
437
+ # e.g. short full names like "John Smith", compound tokens
438
+ if unique_ratio >= _FREE_TEXT_HIGH_UNIQUE_WITH_SPACES and median_spaces >= 1.0:
464
439
  info.flags.append(TypeFlag.FreeTextCandidate)
@@ -71,6 +71,7 @@ class TypeFlag(StrEnum):
71
71
  SequentialIndex = "sequential_index"
72
72
  FloatSequentialIndex = "float_sequential_index"
73
73
  FreeTextCandidate = "free_text_candidate"
74
+ UserOverride = "user_override"
74
75
 
75
76
 
76
77
  # ---------------------------------------------------------------------------
@@ -240,6 +241,34 @@ class ProfileConfig:
240
241
  memory_threshold_mb: float = 500.0
241
242
  chunk_size: int = 100_000
242
243
 
244
+ def set_column_type(self, column: str, semantic_type: Union[str, "SemanticType"]) -> None:
245
+ """
246
+ Explicitly set the semantic type for a column, overriding auto-detection.
247
+
248
+ The override is the sole source of truth for that column's type — the
249
+ type detector's verdict is ignored during profiling. Calling this method
250
+ multiple times on the same column is valid; the last call wins.
251
+
252
+ Parameters
253
+ ----------
254
+ column : str
255
+ Name of the column to override.
256
+ semantic_type : str | SemanticType
257
+ Target semantic type. Accepts a plain string (e.g. ``"numeric"``,
258
+ ``"categorical"``) or a ``SemanticType`` enum value. Invalid strings
259
+ raise ``ValueError``.
260
+ """
261
+ if isinstance(semantic_type, str):
262
+ try:
263
+ semantic_type = SemanticType(semantic_type)
264
+ except ValueError:
265
+ valid = [e.value for e in SemanticType]
266
+ raise ValueError(
267
+ f"Unknown semantic type {semantic_type!r}. "
268
+ f"Valid values: {valid}"
269
+ )
270
+ self.column_overrides[column] = semantic_type
271
+
243
272
  def to_dict(self) -> dict:
244
273
  return {
245
274
  "modality": str(self.modality),
@@ -40,6 +40,7 @@ from .config import (
40
40
  StructuralProfileResult,
41
41
  RowMissingnessDistribution,
42
42
  SemanticType,
43
+ TypeFlag,
43
44
  Modality,
44
45
  )
45
46
 
@@ -130,7 +131,10 @@ class StructuralProfiler:
130
131
  # Overrides for excluded / non-existent columns are silently ignored.
131
132
  for col_name, override_type in self.config.column_overrides.items():
132
133
  if col_name in result.columns:
133
- result.columns[col_name].semantic_type = override_type
134
+ cp = result.columns[col_name]
135
+ cp.semantic_type = override_type
136
+ if TypeFlag.UserOverride not in cp.type_flags:
137
+ cp.type_flags.append(TypeFlag.UserOverride)
134
138
 
135
139
  # ── 6. Per-column profiling routed by SemanticType ───────────────
136
140
  # Batch all columns of the same SemanticType together and call each
@@ -82,8 +82,6 @@ _EXT_LOADERS: dict[str, callable] = {
82
82
 
83
83
 
84
84
  class DataLoader:
85
- def __init__(self, fmt: str | None = None) -> None:
86
- self._fmt_override = fmt.lower() if fmt else None
87
85
 
88
86
  def load(
89
87
  self,
@@ -92,7 +90,7 @@ class DataLoader:
92
90
  ) -> pl.DataFrame:
93
91
  raw, ext_from_path = _read_raw(source)
94
92
 
95
- resolved_fmt = (fmt or self._fmt_override or ext_from_path or "").lower()
93
+ resolved_fmt = (ext_from_path or "").lower()
96
94
 
97
95
  if resolved_fmt not in _EXT_LOADERS:
98
96
  label = resolved_fmt if resolved_fmt else "<unknown>"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dataforge-ml
3
- Version: 0.6.0
3
+ Version: 0.8.0
4
4
  Summary: A automated feature engineering and designing pipeline library
5
5
  License: MIT
6
6
  Classifier: License :: OSI Approved :: MIT License
File without changes
File without changes
File without changes