dataforge-ml 0.7.0__tar.gz → 0.8.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/PKG-INFO +1 -1
  2. {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/pyproject.toml +1 -1
  3. {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_missingness_profiler.py +3 -1
  4. {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_numeric_profiler.py +3 -2
  5. {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/config.py +29 -0
  6. {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/structural.py +5 -1
  7. {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/utils/data_loader.py +1 -3
  8. {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml.egg-info/PKG-INFO +1 -1
  9. {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/LICENSE +0 -0
  10. {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/README.md +0 -0
  11. {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/setup.cfg +0 -0
  12. {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/__init__.py +0 -0
  13. {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/models/__init__.py +0 -0
  14. {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/models/_data_structure.py +0 -0
  15. {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/models/_data_types.py +0 -0
  16. {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/__init__.py +0 -0
  17. {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_base.py +0 -0
  18. {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_boolean_config.py +0 -0
  19. {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_boolean_profiler.py +0 -0
  20. {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_categorical.py +0 -0
  21. {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_categorical_config.py +0 -0
  22. {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_correlation_config.py +0 -0
  23. {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_correlation_profiler.py +0 -0
  24. {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_datetime_config.py +0 -0
  25. {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_datetime_profiler.py +0 -0
  26. {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_missingness_config.py +0 -0
  27. {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_numeric_config.py +0 -0
  28. {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_tabular.py +0 -0
  29. {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_target_config.py +0 -0
  30. {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_target_profiler.py +0 -0
  31. {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_text_config.py +0 -0
  32. {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_text_profiler.py +0 -0
  33. {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_type_detector.py +0 -0
  34. {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/splitting/__init__.py +0 -0
  35. {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/splitting/_config.py +0 -0
  36. {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/splitting/_splitter.py +0 -0
  37. {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/utils/__init__.py +0 -0
  38. {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml.egg-info/SOURCES.txt +0 -0
  39. {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml.egg-info/dependency_links.txt +0 -0
  40. {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml.egg-info/requires.txt +0 -0
  41. {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dataforge-ml
3
- Version: 0.7.0
3
+ Version: 0.8.0
4
4
  Summary: A automated feature engineering and designing pipeline library
5
5
  License: MIT
6
6
  Classifier: License :: OSI Approved :: MIT License
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "dataforge-ml"
7
- version = "0.7.0"
7
+ version = "0.8.0"
8
8
  description = "A automated feature engineering and designing pipeline library"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -208,7 +208,9 @@ class MissingnessProfiler(DatasetLevelProfiler[MissingnessProfileResult]):
208
208
 
209
209
  r = profile.effective_null_ratio
210
210
 
211
- if r < _SEVERITY_MINOR and r != 0:
211
+ if r == 0.0:
212
+ profile.severity = None
213
+ elif r < _SEVERITY_MINOR:
212
214
  profile.severity = MissingSeverity.Minor
213
215
  elif r < _SEVERITY_MODERATE:
214
216
  profile.severity = MissingSeverity.Moderate
@@ -254,16 +254,17 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
254
254
  for i in range(top_rows)
255
255
  ]
256
256
  else:
257
- # --- 20-Bin Histogram Distribution (Continuous) ---
257
+ # --- Histogram Distribution (Continuous) ---
258
258
  import numpy as np
259
259
 
260
260
  counts, bin_edges = np.histogram(clean_f64.to_numpy(), bins="auto")
261
+ n_clean = clean_f64.len()
261
262
  profile.histogram = [
262
263
  HistogramBin(
263
264
  lower_bound=float(bin_edges[i]),
264
265
  upper_bound=float(bin_edges[i + 1]),
265
266
  count=int(counts[i]),
266
- percentage=int(counts[i]) / n_rows if n_rows > 0 else 0.0,
267
+ percentage=int(counts[i]) / n_clean if n_clean > 0 else 0.0,
267
268
  )
268
269
  for i in range(len(counts))
269
270
  ]
@@ -71,6 +71,7 @@ class TypeFlag(StrEnum):
71
71
  SequentialIndex = "sequential_index"
72
72
  FloatSequentialIndex = "float_sequential_index"
73
73
  FreeTextCandidate = "free_text_candidate"
74
+ UserOverride = "user_override"
74
75
 
75
76
 
76
77
  # ---------------------------------------------------------------------------
@@ -240,6 +241,34 @@ class ProfileConfig:
240
241
  memory_threshold_mb: float = 500.0
241
242
  chunk_size: int = 100_000
242
243
 
244
+ def set_column_type(self, column: str, semantic_type: Union[str, "SemanticType"]) -> None:
245
+ """
246
+ Explicitly set the semantic type for a column, overriding auto-detection.
247
+
248
+ The override is the sole source of truth for that column's type — the
249
+ type detector's verdict is ignored during profiling. Calling this method
250
+ multiple times on the same column is valid; the last call wins.
251
+
252
+ Parameters
253
+ ----------
254
+ column : str
255
+ Name of the column to override.
256
+ semantic_type : str | SemanticType
257
+ Target semantic type. Accepts a plain string (e.g. ``"numeric"``,
258
+ ``"categorical"``) or a ``SemanticType`` enum value. Invalid strings
259
+ raise ``ValueError``.
260
+ """
261
+ if isinstance(semantic_type, str):
262
+ try:
263
+ semantic_type = SemanticType(semantic_type)
264
+ except ValueError:
265
+ valid = [e.value for e in SemanticType]
266
+ raise ValueError(
267
+ f"Unknown semantic type {semantic_type!r}. "
268
+ f"Valid values: {valid}"
269
+ )
270
+ self.column_overrides[column] = semantic_type
271
+
243
272
  def to_dict(self) -> dict:
244
273
  return {
245
274
  "modality": str(self.modality),
@@ -40,6 +40,7 @@ from .config import (
40
40
  StructuralProfileResult,
41
41
  RowMissingnessDistribution,
42
42
  SemanticType,
43
+ TypeFlag,
43
44
  Modality,
44
45
  )
45
46
 
@@ -130,7 +131,10 @@ class StructuralProfiler:
130
131
  # Overrides for excluded / non-existent columns are silently ignored.
131
132
  for col_name, override_type in self.config.column_overrides.items():
132
133
  if col_name in result.columns:
133
- result.columns[col_name].semantic_type = override_type
134
+ cp = result.columns[col_name]
135
+ cp.semantic_type = override_type
136
+ if TypeFlag.UserOverride not in cp.type_flags:
137
+ cp.type_flags.append(TypeFlag.UserOverride)
134
138
 
135
139
  # ── 6. Per-column profiling routed by SemanticType ───────────────
136
140
  # Batch all columns of the same SemanticType together and call each
@@ -82,8 +82,6 @@ _EXT_LOADERS: dict[str, callable] = {
82
82
 
83
83
 
84
84
  class DataLoader:
85
- def __init__(self, fmt: str | None = None) -> None:
86
- self._fmt_override = fmt.lower() if fmt else None
87
85
 
88
86
  def load(
89
87
  self,
@@ -92,7 +90,7 @@ class DataLoader:
92
90
  ) -> pl.DataFrame:
93
91
  raw, ext_from_path = _read_raw(source)
94
92
 
95
- resolved_fmt = (fmt or self._fmt_override or ext_from_path or "").lower()
93
+ resolved_fmt = (ext_from_path or "").lower()
96
94
 
97
95
  if resolved_fmt not in _EXT_LOADERS:
98
96
  label = resolved_fmt if resolved_fmt else "<unknown>"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dataforge-ml
3
- Version: 0.7.0
3
+ Version: 0.8.0
4
4
  Summary: A automated feature engineering and designing pipeline library
5
5
  License: MIT
6
6
  Classifier: License :: OSI Approved :: MIT License
File without changes
File without changes
File without changes