dataforge-ml 2.0.4__tar.gz → 2.0.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. {dataforge_ml-2.0.4/src/dataforge_ml.egg-info → dataforge_ml-2.0.6}/PKG-INFO +1 -1
  2. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/pyproject.toml +1 -1
  3. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/src/dataforge_ml/config.py +3 -14
  4. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/src/dataforge_ml/profiling/_config.py +17 -21
  5. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/src/dataforge_ml/profiling/orchestrator.py +7 -1
  6. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6/src/dataforge_ml.egg-info}/PKG-INFO +1 -1
  7. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/LICENSE +0 -0
  8. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/README.md +0 -0
  9. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/setup.cfg +0 -0
  10. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/src/dataforge_ml/__init__.py +0 -0
  11. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/src/dataforge_ml/imputation/__init__.py +0 -0
  12. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/src/dataforge_ml/imputation/_config.py +0 -0
  13. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/src/dataforge_ml/imputation/_fitted_imputer.py +0 -0
  14. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/src/dataforge_ml/imputation/_numeric_imputer.py +0 -0
  15. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/src/dataforge_ml/imputation/_regression_estimator_factory.py +0 -0
  16. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/src/dataforge_ml/imputation/_strategy_router.py +0 -0
  17. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/src/dataforge_ml/imputation/_utils.py +0 -0
  18. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/src/dataforge_ml/imputation/orchestrator.py +0 -0
  19. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/src/dataforge_ml/models/__init__.py +0 -0
  20. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/src/dataforge_ml/models/_data_structure.py +0 -0
  21. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/src/dataforge_ml/models/_data_types.py +0 -0
  22. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/src/dataforge_ml/profiling/__init__.py +0 -0
  23. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/src/dataforge_ml/profiling/_base.py +0 -0
  24. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/src/dataforge_ml/profiling/_boolean_config.py +0 -0
  25. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/src/dataforge_ml/profiling/_boolean_profiler.py +0 -0
  26. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/src/dataforge_ml/profiling/_categorical.py +0 -0
  27. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/src/dataforge_ml/profiling/_categorical_config.py +0 -0
  28. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/src/dataforge_ml/profiling/_correlation_config.py +0 -0
  29. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/src/dataforge_ml/profiling/_correlation_profiler.py +0 -0
  30. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/src/dataforge_ml/profiling/_datetime_config.py +0 -0
  31. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/src/dataforge_ml/profiling/_datetime_profiler.py +0 -0
  32. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/src/dataforge_ml/profiling/_missingness_config.py +0 -0
  33. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/src/dataforge_ml/profiling/_missingness_profiler.py +0 -0
  34. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/src/dataforge_ml/profiling/_nonlinearity_profiler.py +0 -0
  35. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/src/dataforge_ml/profiling/_numeric_config.py +0 -0
  36. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/src/dataforge_ml/profiling/_numeric_profiler.py +0 -0
  37. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/src/dataforge_ml/profiling/_tabular.py +0 -0
  38. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/src/dataforge_ml/profiling/_target_config.py +0 -0
  39. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/src/dataforge_ml/profiling/_target_profiler.py +0 -0
  40. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/src/dataforge_ml/profiling/_text_config.py +0 -0
  41. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/src/dataforge_ml/profiling/_text_profiler.py +0 -0
  42. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/src/dataforge_ml/profiling/_type_detection_config.py +0 -0
  43. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/src/dataforge_ml/profiling/_type_detector.py +0 -0
  44. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/src/dataforge_ml/splitting/__init__.py +0 -0
  45. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/src/dataforge_ml/splitting/_config.py +0 -0
  46. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/src/dataforge_ml/splitting/_profile_signals.py +0 -0
  47. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/src/dataforge_ml/splitting/_splitter.py +0 -0
  48. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/src/dataforge_ml/utils/__init__.py +0 -0
  49. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/src/dataforge_ml/utils/_null_detection.py +0 -0
  50. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/src/dataforge_ml/utils/_null_normalization.py +0 -0
  51. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/src/dataforge_ml/utils/data_loader.py +0 -0
  52. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/src/dataforge_ml.egg-info/SOURCES.txt +0 -0
  53. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/src/dataforge_ml.egg-info/dependency_links.txt +0 -0
  54. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/src/dataforge_ml.egg-info/requires.txt +0 -0
  55. {dataforge_ml-2.0.4 → dataforge_ml-2.0.6}/src/dataforge_ml.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dataforge-ml
3
- Version: 2.0.4
3
+ Version: 2.0.6
4
4
  Summary: A automated feature engineering and designing pipeline library
5
5
  License: MIT
6
6
  Classifier: License :: OSI Approved :: MIT License
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "dataforge-ml"
7
- version = "2.0.4"
7
+ version = "2.0.6"
8
8
  description = "A automated feature engineering and designing pipeline library"
9
9
  readme = "README.md"
10
10
  requires-python = ">3.10"
@@ -86,20 +86,6 @@ class PipelineConfig:
86
86
  Single seed for all stochastic pipeline operations, including GMM
87
87
  Sampling during bimodal imputation. None produces non-deterministic
88
88
  output.
89
-
90
- Attributes
91
- ----------
92
- exclude_columns : tuple[str, ...]
93
- Hard exclusions — columns dropped globally from every phase.
94
- phase_exclusions : MappingProxyType[PipelinePhase, tuple[str, ...]]
95
- Soft exclusions — columns bypassed for a specific phase but retained
96
- in the dataset.
97
- column_overrides : MappingProxyType[str, SemanticType]
98
- Explicit semantic type assignments respected by all downstream phases.
99
- numeric_kind_overrides : MappingProxyType[str, NumericKind]
100
- Explicit ``NumericKind`` assignments for individual columns, applied
101
- after auto-detection in Phase 1. Only valid for columns whose final
102
- ``SemanticType`` is ``Numeric``; raises at orchestrator time otherwise.
103
89
  """
104
90
 
105
91
  _exclude_columns: list[str] = field(default_factory=list, init=False)
@@ -150,6 +136,9 @@ class PipelineConfig:
150
136
  def numeric_kind_overrides(self) -> "MappingProxyType[str, NumericKind]":
151
137
  """Explicit NumericKind assignments for individual columns, applied after auto-detection in Phase 1.
152
138
 
139
+ Only valid for columns whose final ``SemanticType`` is ``Numeric``;
140
+ raises at orchestrator time otherwise.
141
+
153
142
  Returns
154
143
  -------
155
144
  MappingProxyType[str, NumericKind]
@@ -628,27 +628,6 @@ class ProfileConfig:
628
628
  Default ``False``.
629
629
  nonlinearity : NonlinearityProfileConfig
630
630
  Threshold configuration for the nonlinearity sub-processor.
631
-
632
- Attributes
633
- ----------
634
- numeric_sentinels : MappingProxyType[str, list[float]]
635
- Per-column numeric sentinel declarations. Keys are column names;
636
- values are lists of float-compatible sentinel values that should be
637
- treated as effective nulls (e.g. ``{"age": [-999.0, 9999.0]}``).
638
- Applies to any column whose dtype passes ``_numeric_sentinel_eligible``
639
- (all integer and float Polars dtypes). Defaults to an empty dict —
640
- columns with no declaration are completely unaffected.
641
- string_sentinels : MappingProxyType[str, list[str]]
642
- Per-column user-declared string sentinel declarations. Keys are column
643
- names; values are lists of string values that should be treated as
644
- effective nulls for that column (e.g.
645
- ``{"status": ["N/A", "missing"]}``). Uses **replace semantics**: when
646
- a declaration exists for a column, only the declared values are matched
647
- (case-insensitive); the hardcoded defaults (``"NA"``, ``"NAN"``,
648
- ``"NULL"``, ``"NONE"``, ``"?"``) are not applied for that column.
649
- Empty/whitespace-only strings are always effective null regardless of
650
- any declaration. Defaults to an empty dict — columns with no
651
- declaration continue to use the hardcoded defaults unchanged.
652
631
  """
653
632
 
654
633
  modality: Modality = Modality.Tabular
@@ -704,6 +683,13 @@ class ProfileConfig:
704
683
  """
705
684
  Get the per-column numeric sentinel declarations.
706
685
 
686
+ Keys are column names; values are lists of float-compatible sentinel
687
+ values that should be treated as effective nulls (e.g.
688
+ ``{"age": [-999.0, 9999.0]}``). Applies to any column whose dtype
689
+ passes ``_numeric_sentinel_eligible`` (all integer and float Polars
690
+ dtypes). Defaults to an empty dict — columns with no declaration are
691
+ completely unaffected.
692
+
707
693
  Returns
708
694
  -------
709
695
  MappingProxyType[str, list[float]]
@@ -716,6 +702,16 @@ class ProfileConfig:
716
702
  """
717
703
  Get the per-column user-declared string sentinel declarations.
718
704
 
705
+ Keys are column names; values are lists of string values that should
706
+ be treated as effective nulls for that column (e.g.
707
+ ``{"status": ["N/A", "missing"]}``). Uses **replace semantics**: when
708
+ a declaration exists for a column, only the declared values are
709
+ matched (case-insensitive); the hardcoded defaults (``"NA"``,
710
+ ``"NAN"``, ``"NULL"``, ``"NONE"``, ``"?"``) are not applied for that
711
+ column. Empty/whitespace-only strings are always effective null
712
+ regardless of any declaration. Defaults to an empty dict — columns
713
+ with no declaration continue to use the hardcoded defaults unchanged.
714
+
719
715
  Returns
720
716
  -------
721
717
  MappingProxyType[str, list[str]]
@@ -28,7 +28,7 @@ from typing import Any
28
28
  import numpy as np
29
29
  import polars as pl
30
30
 
31
- from ._base import ModalityProfiler, ColumnBatchProfiler
31
+ from ._base import ModalityProfiler, ColumnBatchProfiler, OverrideCoercionError
32
32
  from ._tabular import TabularProfiler
33
33
  from ._categorical import CategoricalProfiler
34
34
  from ._datetime_profiler import DatetimeProfiler
@@ -114,6 +114,10 @@ class StructuralProfiler:
114
114
  ------
115
115
  TypeError
116
116
  When ``data`` is not a ``polars.DataFrame``.
117
+ OverrideCoercionError
118
+ When a column carrying ``TypeFlag.UserOverride`` completely fails
119
+ coercion to its overridden ``SemanticType`` (zero usable values
120
+ remain despite the original column having non-null data).
117
121
  """
118
122
  if not isinstance(data, pl.DataFrame):
119
123
  raise TypeError(
@@ -241,6 +245,8 @@ class StructuralProfiler:
241
245
  for col_name in batch.analysed_columns:
242
246
  if col_name in result.columns:
243
247
  result.columns[col_name].stats = batch.columns.get(col_name)
248
+ except OverrideCoercionError:
249
+ raise
244
250
  except Exception:
245
251
  pass
246
252
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dataforge-ml
3
- Version: 2.0.4
3
+ Version: 2.0.6
4
4
  Summary: A automated feature engineering and designing pipeline library
5
5
  License: MIT
6
6
  Classifier: License :: OSI Approved :: MIT License
File without changes
File without changes
File without changes