dataforge-ml 1.0.0__tar.gz → 2.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. dataforge_ml-2.0.0/PKG-INFO +101 -0
  2. dataforge_ml-2.0.0/README.md +71 -0
  3. {dataforge_ml-1.0.0 → dataforge_ml-2.0.0}/pyproject.toml +9 -2
  4. {dataforge_ml-1.0.0 → dataforge_ml-2.0.0}/src/dataforge_ml/__init__.py +3 -1
  5. dataforge_ml-2.0.0/src/dataforge_ml/config.py +343 -0
  6. dataforge_ml-2.0.0/src/dataforge_ml/imputation/__init__.py +29 -0
  7. dataforge_ml-2.0.0/src/dataforge_ml/imputation/_config.py +610 -0
  8. dataforge_ml-2.0.0/src/dataforge_ml/imputation/_fitted_imputer.py +764 -0
  9. dataforge_ml-2.0.0/src/dataforge_ml/imputation/_numeric_imputer.py +1954 -0
  10. dataforge_ml-2.0.0/src/dataforge_ml/imputation/_regression_estimator_factory.py +81 -0
  11. dataforge_ml-2.0.0/src/dataforge_ml/imputation/_strategy_router.py +642 -0
  12. dataforge_ml-2.0.0/src/dataforge_ml/imputation/_utils.py +89 -0
  13. dataforge_ml-2.0.0/src/dataforge_ml/imputation/orchestrator.py +178 -0
  14. {dataforge_ml-1.0.0 → dataforge_ml-2.0.0}/src/dataforge_ml/models/_data_types.py +2 -0
  15. {dataforge_ml-1.0.0 → dataforge_ml-2.0.0}/src/dataforge_ml/profiling/__init__.py +14 -1
  16. {dataforge_ml-1.0.0 → dataforge_ml-2.0.0}/src/dataforge_ml/profiling/_boolean_config.py +14 -0
  17. {dataforge_ml-1.0.0 → dataforge_ml-2.0.0}/src/dataforge_ml/profiling/_categorical.py +31 -15
  18. dataforge_ml-2.0.0/src/dataforge_ml/profiling/_categorical_config.py +315 -0
  19. dataforge_ml-2.0.0/src/dataforge_ml/profiling/_config.py +483 -0
  20. {dataforge_ml-1.0.0 → dataforge_ml-2.0.0}/src/dataforge_ml/profiling/_correlation_config.py +198 -2
  21. {dataforge_ml-1.0.0 → dataforge_ml-2.0.0}/src/dataforge_ml/profiling/_correlation_profiler.py +23 -22
  22. dataforge_ml-2.0.0/src/dataforge_ml/profiling/_datetime_config.py +252 -0
  23. {dataforge_ml-1.0.0 → dataforge_ml-2.0.0}/src/dataforge_ml/profiling/_datetime_profiler.py +34 -56
  24. dataforge_ml-2.0.0/src/dataforge_ml/profiling/_missingness_config.py +319 -0
  25. {dataforge_ml-1.0.0 → dataforge_ml-2.0.0}/src/dataforge_ml/profiling/_missingness_profiler.py +108 -25
  26. dataforge_ml-2.0.0/src/dataforge_ml/profiling/_nonlinearity_profiler.py +411 -0
  27. dataforge_ml-2.0.0/src/dataforge_ml/profiling/_numeric_config.py +729 -0
  28. {dataforge_ml-1.0.0 → dataforge_ml-2.0.0}/src/dataforge_ml/profiling/_numeric_profiler.py +146 -42
  29. {dataforge_ml-1.0.0 → dataforge_ml-2.0.0}/src/dataforge_ml/profiling/_target_config.py +32 -2
  30. {dataforge_ml-1.0.0 → dataforge_ml-2.0.0}/src/dataforge_ml/profiling/_target_profiler.py +3 -2
  31. {dataforge_ml-1.0.0 → dataforge_ml-2.0.0}/src/dataforge_ml/profiling/_text_config.py +15 -0
  32. dataforge_ml-2.0.0/src/dataforge_ml/profiling/_type_detection_config.py +129 -0
  33. {dataforge_ml-1.0.0 → dataforge_ml-2.0.0}/src/dataforge_ml/profiling/_type_detector.py +85 -61
  34. dataforge_ml-1.0.0/src/dataforge_ml/profiling/structural.py → dataforge_ml-2.0.0/src/dataforge_ml/profiling/orchestrator.py +140 -43
  35. dataforge_ml-2.0.0/src/dataforge_ml/splitting/__init__.py +4 -0
  36. dataforge_ml-2.0.0/src/dataforge_ml/splitting/_config.py +131 -0
  37. dataforge_ml-2.0.0/src/dataforge_ml/splitting/_profile_signals.py +216 -0
  38. {dataforge_ml-1.0.0 → dataforge_ml-2.0.0}/src/dataforge_ml/splitting/_splitter.py +119 -3
  39. dataforge_ml-2.0.0/src/dataforge_ml/utils/_null_detection.py +49 -0
  40. dataforge_ml-2.0.0/src/dataforge_ml/utils/_null_normalization.py +130 -0
  41. dataforge_ml-2.0.0/src/dataforge_ml.egg-info/PKG-INFO +101 -0
  42. {dataforge_ml-1.0.0 → dataforge_ml-2.0.0}/src/dataforge_ml.egg-info/SOURCES.txt +14 -2
  43. dataforge_ml-2.0.0/src/dataforge_ml.egg-info/requires.txt +16 -0
  44. dataforge_ml-1.0.0/PKG-INFO +0 -36
  45. dataforge_ml-1.0.0/README.md +0 -13
  46. dataforge_ml-1.0.0/src/dataforge_ml/config.py +0 -133
  47. dataforge_ml-1.0.0/src/dataforge_ml/profiling/_categorical_config.py +0 -116
  48. dataforge_ml-1.0.0/src/dataforge_ml/profiling/_config.py +0 -258
  49. dataforge_ml-1.0.0/src/dataforge_ml/profiling/_datetime_config.py +0 -123
  50. dataforge_ml-1.0.0/src/dataforge_ml/profiling/_missingness_config.py +0 -150
  51. dataforge_ml-1.0.0/src/dataforge_ml/profiling/_null_detection.py +0 -22
  52. dataforge_ml-1.0.0/src/dataforge_ml/profiling/_numeric_config.py +0 -152
  53. dataforge_ml-1.0.0/src/dataforge_ml/splitting/__init__.py +0 -4
  54. dataforge_ml-1.0.0/src/dataforge_ml/splitting/_config.py +0 -56
  55. dataforge_ml-1.0.0/src/dataforge_ml.egg-info/PKG-INFO +0 -36
  56. dataforge_ml-1.0.0/src/dataforge_ml.egg-info/requires.txt +0 -9
  57. {dataforge_ml-1.0.0 → dataforge_ml-2.0.0}/LICENSE +0 -0
  58. {dataforge_ml-1.0.0 → dataforge_ml-2.0.0}/setup.cfg +0 -0
  59. {dataforge_ml-1.0.0 → dataforge_ml-2.0.0}/src/dataforge_ml/models/__init__.py +0 -0
  60. {dataforge_ml-1.0.0 → dataforge_ml-2.0.0}/src/dataforge_ml/models/_data_structure.py +0 -0
  61. {dataforge_ml-1.0.0 → dataforge_ml-2.0.0}/src/dataforge_ml/profiling/_base.py +0 -0
  62. {dataforge_ml-1.0.0 → dataforge_ml-2.0.0}/src/dataforge_ml/profiling/_boolean_profiler.py +0 -0
  63. {dataforge_ml-1.0.0 → dataforge_ml-2.0.0}/src/dataforge_ml/profiling/_tabular.py +0 -0
  64. {dataforge_ml-1.0.0 → dataforge_ml-2.0.0}/src/dataforge_ml/profiling/_text_profiler.py +0 -0
  65. {dataforge_ml-1.0.0 → dataforge_ml-2.0.0}/src/dataforge_ml/utils/__init__.py +0 -0
  66. {dataforge_ml-1.0.0 → dataforge_ml-2.0.0}/src/dataforge_ml/utils/data_loader.py +0 -0
  67. {dataforge_ml-1.0.0 → dataforge_ml-2.0.0}/src/dataforge_ml.egg-info/dependency_links.txt +0 -0
  68. {dataforge_ml-1.0.0 → dataforge_ml-2.0.0}/src/dataforge_ml.egg-info/top_level.txt +0 -0
@@ -0,0 +1,101 @@
1
+ Metadata-Version: 2.4
2
+ Name: dataforge-ml
3
+ Version: 2.0.0
4
+ Summary: An automated feature engineering and designing pipeline library
5
+ License: MIT
6
+ Classifier: License :: OSI Approved :: MIT License
7
+ Classifier: Programming Language :: Python :: 3
8
+ Classifier: Programming Language :: Python :: 3.10
9
+ Classifier: Programming Language :: Python :: 3.11
10
+ Classifier: Programming Language :: Python :: 3.12
11
+ Requires-Python: >3.10
12
+ Description-Content-Type: text/markdown
13
+ License-File: LICENSE
14
+ Requires-Dist: polars>=1.0.0
15
+ Requires-Dist: scikit-learn>=1.0.0
16
+ Requires-Dist: scipy>=1.10.0
17
+ Requires-Dist: numpy>=2.0.0
18
+ Requires-Dist: pandas>=2.0.0
19
+ Requires-Dist: chardet>=5.0.0
20
+ Requires-Dist: iterative-stratification>=0.1.9
21
+ Requires-Dist: diptest
22
+ Provides-Extra: dev
23
+ Requires-Dist: pytest>=8.0; extra == "dev"
24
+ Requires-Dist: sphinx>=8.0; extra == "dev"
25
+ Requires-Dist: pydata-sphinx-theme>=0.16; extra == "dev"
26
+ Requires-Dist: myst-parser>=4.0; extra == "dev"
27
+ Requires-Dist: numpydoc>=1.8; extra == "dev"
28
+ Requires-Dist: sphinx-autobuild>=2024.0; extra == "dev"
29
+ Dynamic: license-file
30
+
31
+ # DataForgeML
32
+
33
+ [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/DEVunderdog/DataForgeML)
34
+
35
+ Automated data profiling and splitting pipeline for ML datasets.
36
+
37
+ DataForgeML inspects your dataset, detects each column's semantic type (numeric, categorical, boolean, text, datetime, or identifier), computes per-column statistics and missingness, and produces a structured result ready for downstream feature engineering — no manual schema wrangling required.
38
+
39
+ ## Installation
40
+
41
+ ```bash
42
+ pip install dataforge-ml
43
+ ```
44
+
45
+ ## Quick Start
46
+
47
+ ```python
48
+ from dataforge_ml import DataLoader, PipelineConfig, StructuralProfiler
49
+
50
+ df = DataLoader().load("titanic.csv")
51
+
52
+ config = PipelineConfig()
53
+ result = StructuralProfiler(config).profile(df)
54
+
55
+ print(result.columns["Age"].semantic_type) # SemanticType.Numeric
56
+ print(result.dataset.row_count) # total rows
57
+ ```
58
+
59
+ `DataLoader` auto-detects encoding and delimiter. Supported formats: CSV, TSV, Parquet, JSON, NDJSON, JSONL, XLSX, XLS, Arrow, Feather.
60
+
61
+ ## Column Type Overrides
62
+
63
+ Override the auto-detected type for any column before profiling:
64
+
65
+ ```python
66
+ config = PipelineConfig()
67
+ config.set_column_type("PassengerId", "identifier") # skip stats entirely
68
+ config.set_columns_type(["Survived", "Pclass"], "categorical")
69
+
70
+ result = StructuralProfiler(config).profile(df)
71
+ ```
72
+
73
+ To drop a column from all processing entirely, use `exclude_columns`:
74
+
75
+ ```python
76
+ config = PipelineConfig(exclude_columns=["PassengerId", "Name"])
77
+ ```
78
+
79
+ ## Splitting
80
+
81
+ ```python
82
+ from dataforge_ml import DataLoader, DataSplitter
83
+
84
+ df = DataLoader().load("titanic.csv")
85
+ splitter = DataSplitter(df, target="Survived", random_seed=42)
86
+
87
+ # Random train/test split (stratified by default when target is set)
88
+ split = splitter.random_split(test_size=0.2)
89
+ print(split.train.shape, split.test.shape)
90
+
91
+ # Chronological split (no temporal leakage)
92
+ split = splitter.time_split(time_column="date", test_size=0.2)
93
+
94
+ # K-fold cross-validation
95
+ for fold in splitter.kfold(k=5):
96
+ print(f"Fold {fold.fold_index}: train={fold.train_size}, val={fold.val_size}")
97
+ ```
98
+
99
+ ## License
100
+
101
+ MIT
@@ -0,0 +1,71 @@
1
+ # DataForgeML
2
+
3
+ [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/DEVunderdog/DataForgeML)
4
+
5
+ Automated data profiling and splitting pipeline for ML datasets.
6
+
7
+ DataForgeML inspects your dataset, detects each column's semantic type (numeric, categorical, boolean, text, datetime, or identifier), computes per-column statistics and missingness, and produces a structured result ready for downstream feature engineering — no manual schema wrangling required.
8
+
9
+ ## Installation
10
+
11
+ ```bash
12
+ pip install dataforge-ml
13
+ ```
14
+
15
+ ## Quick Start
16
+
17
+ ```python
18
+ from dataforge_ml import DataLoader, PipelineConfig, StructuralProfiler
19
+
20
+ df = DataLoader().load("titanic.csv")
21
+
22
+ config = PipelineConfig()
23
+ result = StructuralProfiler(config).profile(df)
24
+
25
+ print(result.columns["Age"].semantic_type) # SemanticType.Numeric
26
+ print(result.dataset.row_count) # total rows
27
+ ```
28
+
29
+ `DataLoader` auto-detects encoding and delimiter. Supported formats: CSV, TSV, Parquet, JSON, NDJSON, JSONL, XLSX, XLS, Arrow, Feather.
30
+
31
+ ## Column Type Overrides
32
+
33
+ Override the auto-detected type for any column before profiling:
34
+
35
+ ```python
36
+ config = PipelineConfig()
37
+ config.set_column_type("PassengerId", "identifier") # skip stats entirely
38
+ config.set_columns_type(["Survived", "Pclass"], "categorical")
39
+
40
+ result = StructuralProfiler(config).profile(df)
41
+ ```
42
+
43
+ To drop a column from all processing entirely, use `exclude_columns`:
44
+
45
+ ```python
46
+ config = PipelineConfig(exclude_columns=["PassengerId", "Name"])
47
+ ```
48
+
49
+ ## Splitting
50
+
51
+ ```python
52
+ from dataforge_ml import DataLoader, DataSplitter
53
+
54
+ df = DataLoader().load("titanic.csv")
55
+ splitter = DataSplitter(df, target="Survived", random_seed=42)
56
+
57
+ # Random train/test split (stratified by default when target is set)
58
+ split = splitter.random_split(test_size=0.2)
59
+ print(split.train.shape, split.test.shape)
60
+
61
+ # Chronological split (no temporal leakage)
62
+ split = splitter.time_split(time_column="date", test_size=0.2)
63
+
64
+ # K-fold cross-validation
65
+ for fold in splitter.kfold(k=5):
66
+ print(f"Fold {fold.fold_index}: train={fold.train_size}, val={fold.val_size}")
67
+ ```
68
+
69
+ ## License
70
+
71
+ MIT
@@ -4,10 +4,10 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "dataforge-ml"
7
- version = "1.0.0"
7
+ version = "2.0.0"
8
8
  description = "An automated feature engineering and designing pipeline library"
9
9
  readme = "README.md"
10
- requires-python = ">=3.10"
10
+ requires-python = ">3.10"
11
11
  license = {text = "MIT"}
12
12
  classifiers = [
13
13
  "License :: OSI Approved :: MIT License",
@@ -23,11 +23,18 @@ dependencies = [
23
23
  "numpy>=2.0.0",
24
24
  "pandas>=2.0.0",
25
25
  "chardet>=5.0.0",
26
+ "iterative-stratification>=0.1.9",
27
+ "diptest",
26
28
  ]
27
29
 
28
30
  [project.optional-dependencies]
29
31
  dev = [
30
32
  "pytest>=8.0",
33
+ "sphinx>=8.0",
34
+ "pydata-sphinx-theme>=0.16",
35
+ "myst-parser>=4.0",
36
+ "numpydoc>=1.8",
37
+ "sphinx-autobuild>=2024.0",
31
38
  ]
32
39
 
33
40
  [tool.pytest.ini_options]
@@ -1,5 +1,5 @@
1
1
  from .config import PipelineConfig, PipelinePhase, SemanticType, Modality
2
- from .profiling.structural import StructuralProfiler
2
+ from .profiling.orchestrator import StructuralProfiler
3
3
  from .profiling._config import (
4
4
  ProfileConfig,
5
5
  StructuralProfileResult,
@@ -8,6 +8,7 @@ from .profiling._config import (
8
8
  )
9
9
  from .splitting import DataSplitter, SplitResult, FoldResult
10
10
  from .utils.data_loader import DataLoader
11
+ from .imputation._config import ImputationFitDiagnostic
11
12
 
12
13
  __all__ = [
13
14
  "PipelineConfig",
@@ -23,4 +24,5 @@ __all__ = [
23
24
  "SplitResult",
24
25
  "FoldResult",
25
26
  "DataLoader",
27
+ "ImputationFitDiagnostic",
26
28
  ]
@@ -0,0 +1,343 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from dataclasses import dataclass, field
5
+ from enum import StrEnum
6
+ from typing import TYPE_CHECKING, Union, Optional
7
+
8
+ if TYPE_CHECKING:
9
+ from dataforge_ml.profiling._config import ProfileConfig, NumericKind
10
+ from dataforge_ml.imputation._config import ImputationConfig
11
+ from dataforge_ml.splitting._config import SplitConfig
12
+
13
+
14
+ class SemanticType(StrEnum):
15
+ """The ML-level interpretation assigned to a column by the type detector.
16
+
17
+ Used throughout the pipeline to route columns to the correct sub-processors
18
+ and to determine which statistical operations apply. See CONTEXT.md §SemanticType
19
+ for the full type taxonomy and the Text vs Categorical distinction.
20
+ """
21
+
22
+ Numeric = "numeric"
23
+ Categorical = "categorical"
24
+ Datetime = "datetime"
25
+ Boolean = "boolean"
26
+ Text = "text"
27
+ Identifier = "identifier"
28
+
29
+
30
+ class Modality(StrEnum):
31
+ """The data modality the pipeline operates on.
32
+
33
+ Currently only ``Tabular`` is supported. Reserved for future expansion to
34
+ additional modalities (time-series, image, etc.).
35
+ """
36
+
37
+ Tabular = "tabular"
38
+
39
+
40
+ class PipelinePhase(StrEnum):
41
+ """The six sequential phases of the DataForgeML feature engineering pipeline.
42
+
43
+ Phase Orchestrators call ``PipelineConfig.resolve_active_columns`` with one
44
+ of these values to obtain the column set for that phase after Hard and Soft
45
+ Exclusions are applied.
46
+ """
47
+
48
+ Profiling = "profiling"
49
+ Imputation = "imputation"
50
+ OutlierDetection = "outlier_detection"
51
+ Normalization = "normalization"
52
+ Encoding = "encoding"
53
+ Scaling = "scaling"
54
+
55
+
56
+ def _default_profile_config() -> ProfileConfig:
57
+ from dataforge_ml.profiling._config import ProfileConfig
58
+ return ProfileConfig()
59
+
60
+
61
+ def _default_imputation_config() -> ImputationConfig:
62
+ from dataforge_ml.imputation._config import ImputationConfig
63
+ return ImputationConfig()
64
+
65
+
66
+ def _default_split_config() -> SplitConfig:
67
+ from dataforge_ml.splitting._config import SplitConfig
68
+ return SplitConfig()
69
+
70
+
71
+ @dataclass
72
+ class PipelineConfig:
73
+ """
74
+ Master configuration for the full 6-phase feature engineering pipeline.
75
+
76
+ Parameters
77
+ ----------
78
+ exclude_columns : list[str]
79
+ Hard exclusions — columns dropped globally from every phase.
80
+ phase_exclusions : dict[PipelinePhase, list[str]]
81
+ Soft exclusions — columns bypassed for a specific phase but retained
82
+ in the dataset.
83
+ column_overrides : dict[str, SemanticType]
84
+ Explicit semantic type assignments respected by all downstream phases.
85
+ numeric_kind_overrides : dict[str, NumericKind]
86
+ Explicit ``NumericKind`` assignments for individual columns, applied
87
+ after auto-detection in Phase 1. Only valid for columns whose final
88
+ ``SemanticType`` is ``Numeric``; raises at orchestrator time otherwise.
89
+ profiling : ProfileConfig
90
+ Phase 1-specific parameters (correlation, chunking, memory threshold).
91
+ imputation : ImputationConfig
92
+ Phase 2-specific parameters (strategy thresholds, size guards).
93
+ split : SplitConfig
94
+ Splitting thresholds (stratification signal cap, boolean minority bar).
95
+ random_seed : int, optional
96
+ Single seed for all stochastic pipeline operations, including GMM
97
+ Sampling during bimodal imputation. None produces non-deterministic
98
+ output.
99
+ """
100
+
101
+ exclude_columns: list[str] = field(default_factory=list)
102
+ phase_exclusions: dict[PipelinePhase, list[str]] = field(default_factory=dict)
103
+ column_overrides: dict[str, SemanticType] = field(default_factory=dict)
104
+ numeric_kind_overrides: dict[str, NumericKind] = field(default_factory=dict)
105
+ profiling: ProfileConfig = field(default_factory=_default_profile_config)
106
+ imputation: ImputationConfig = field(default_factory=_default_imputation_config)
107
+ split: SplitConfig = field(default_factory=_default_split_config)
108
+ random_seed: Optional[int] = None
109
+
110
+ def resolve_active_columns(
111
+ self, phase: PipelinePhase, available_columns: list[str]
112
+ ) -> list[str]:
113
+ """Return the columns the given phase should operate on.
114
+
115
+ Hard Exclusions are applied first, then phase-specific Soft Exclusions.
116
+ Columns absent from ``available_columns`` are silently ignored in both
117
+ exclusion lists.
118
+
119
+ Parameters
120
+ ----------
121
+ phase : PipelinePhase
122
+ The pipeline phase requesting the active column set.
123
+ available_columns : list[str]
124
+ The full list of columns currently present in the DataFrame.
125
+
126
+ Returns
127
+ -------
128
+ list[str]
129
+ Columns from ``available_columns`` that are not excluded by either
130
+ Hard or Soft Exclusion rules for the given phase, preserving the
131
+ original order.
132
+ """
133
+ hard_set = set(self.exclude_columns)
134
+ soft_set = set(self.phase_exclusions.get(phase, []))
135
+ excluded = hard_set | soft_set
136
+ return [c for c in available_columns if c not in excluded]
137
+
138
+ def add_exclusions(self, cols: list[str]) -> None:
139
+ """Add columns to the hard exclusion set, deduplicating automatically.
140
+
141
+ Columns already present in ``exclude_columns`` and duplicate entries
142
+ within ``cols`` are silently ignored. Calling with an empty list is a
143
+ no-op.
144
+
145
+ Parameters
146
+ ----------
147
+ cols : list[str]
148
+ Column names to register as hard exclusions. Deduplication is
149
+ handled here; callers do not need to pre-deduplicate.
150
+ """
151
+ existing = set(self.exclude_columns)
152
+ for col in cols:
153
+ if col not in existing:
154
+ self.exclude_columns.append(col)
155
+ existing.add(col)
156
+
157
+ def set_column_type(
158
+ self, column: str, semantic_type: Union[str, SemanticType]
159
+ ) -> None:
160
+ """Explicitly set the semantic type for a column, overriding auto-detection.
161
+
162
+ Parameters
163
+ ----------
164
+ column : str
165
+ Name of the column to override.
166
+ semantic_type : str or SemanticType
167
+ The desired semantic type. Accepts enum values or their string
168
+ equivalents (e.g. ``"numeric"``, ``"categorical"``).
169
+
170
+ Raises
171
+ ------
172
+ ValueError
173
+ When ``semantic_type`` is a string that does not match any
174
+ ``SemanticType`` value.
175
+ """
176
+ if isinstance(semantic_type, str):
177
+ try:
178
+ semantic_type = SemanticType(semantic_type)
179
+ except ValueError:
180
+ valid = [e.value for e in SemanticType]
181
+ raise ValueError(
182
+ f"Unknown semantic type {semantic_type!r}. "
183
+ f"Valid values: {valid}"
184
+ )
185
+ self.column_overrides[column] = semantic_type
186
+
187
+ def set_columns_type(
188
+ self, columns: list[str], semantic_type: Union[str, SemanticType]
189
+ ) -> None:
190
+ """Assign the same semantic type to every column in the list.
191
+
192
+ Parameters
193
+ ----------
194
+ columns : list[str]
195
+ Column names to override.
196
+ semantic_type : str or SemanticType
197
+ The desired semantic type applied to every column in the list.
198
+ """
199
+ for column in columns:
200
+ self.set_column_type(column, semantic_type)
201
+
202
+ def set_numeric_kind(
203
+ self, column: str, kind: Union[str, NumericKind]
204
+ ) -> None:
205
+ """Explicitly set the ``NumericKind`` for a single column.
206
+
207
+ Parameters
208
+ ----------
209
+ column : str
210
+ Name of the column to override.
211
+ kind : str or NumericKind
212
+ The desired numeric kind. Accepts enum values or their string
213
+ equivalents (``"continuous"``, ``"bounded_discrete"``).
214
+
215
+ Raises
216
+ ------
217
+ ValueError
218
+ When ``kind`` is a string that does not match any ``NumericKind``
219
+ value.
220
+ """
221
+ from dataforge_ml.profiling._config import NumericKind as _NumericKind
222
+ if isinstance(kind, str):
223
+ try:
224
+ kind = _NumericKind(kind)
225
+ except ValueError:
226
+ valid = [e.value for e in _NumericKind]
227
+ raise ValueError(
228
+ f"Unknown NumericKind {kind!r}. Valid values: {valid}"
229
+ )
230
+ self.numeric_kind_overrides[column] = kind
231
+
232
+ def set_columns_numeric_kind(
233
+ self, columns: list[str], kind: Union[str, NumericKind]
234
+ ) -> None:
235
+ """Assign the same ``NumericKind`` to every column in the list.
236
+
237
+ Parameters
238
+ ----------
239
+ columns : list[str]
240
+ Column names to override.
241
+ kind : str or NumericKind
242
+ The desired numeric kind applied to every column in the list.
243
+ """
244
+ for column in columns:
245
+ self.set_numeric_kind(column, kind)
246
+
247
+ def to_dict(self) -> dict:
248
+ """Serialise the pipeline configuration to a plain dictionary.
249
+
250
+ Returns
251
+ -------
252
+ dict
253
+ All fields serialised to JSON-compatible types; nested configs are
254
+ recursively serialised via their own ``to_dict`` methods.
255
+ """
256
+ return {
257
+ "exclude_columns": list(self.exclude_columns),
258
+ "phase_exclusions": {
259
+ str(phase): list(cols)
260
+ for phase, cols in self.phase_exclusions.items()
261
+ },
262
+ "column_overrides": {
263
+ col: str(sem_type)
264
+ for col, sem_type in self.column_overrides.items()
265
+ },
266
+ "numeric_kind_overrides": {
267
+ col: str(kind)
268
+ for col, kind in self.numeric_kind_overrides.items()
269
+ },
270
+ "profiling": self.profiling.to_dict(),
271
+ "imputation": self.imputation.to_dict(),
272
+ "split": self.split.to_dict(),
273
+ "random_seed": self.random_seed,
274
+ }
275
+
276
+ @classmethod
277
+ def from_dict(cls, data: dict) -> PipelineConfig:
278
+ """Reconstruct a ``PipelineConfig`` from a plain dictionary.
279
+
280
+ Parameters
281
+ ----------
282
+ data : dict
283
+ Dictionary as produced by ``to_dict()``.
284
+
285
+ Returns
286
+ -------
287
+ PipelineConfig
288
+ Fully populated configuration instance with all nested sub-configs
289
+ restored.
290
+ """
291
+ from dataforge_ml.profiling._config import ProfileConfig, NumericKind as _NumericKind
292
+ from dataforge_ml.imputation._config import ImputationConfig
293
+ from dataforge_ml.splitting._config import SplitConfig
294
+ return cls(
295
+ exclude_columns=list(data.get("exclude_columns", [])),
296
+ phase_exclusions={
297
+ PipelinePhase(phase_str): list(cols)
298
+ for phase_str, cols in data.get("phase_exclusions", {}).items()
299
+ },
300
+ column_overrides={
301
+ col: SemanticType(sem_str)
302
+ for col, sem_str in data.get("column_overrides", {}).items()
303
+ },
304
+ numeric_kind_overrides={
305
+ col: _NumericKind(kind_str)
306
+ for col, kind_str in data.get("numeric_kind_overrides", {}).items()
307
+ },
308
+ profiling=ProfileConfig.from_dict(data.get("profiling", {})),
309
+ imputation=ImputationConfig.from_dict(data.get("imputation", {})),
310
+ split=SplitConfig.from_dict(data.get("split", {})),
311
+ random_seed=data.get("random_seed"),
312
+ )
313
+
314
+ def to_json(self, indent: int = 2) -> str:
315
+ """Serialise the pipeline configuration to a JSON string.
316
+
317
+ Parameters
318
+ ----------
319
+ indent : int
320
+ Number of spaces used for JSON indentation.
321
+
322
+ Returns
323
+ -------
324
+ str
325
+ JSON representation of ``to_dict()``.
326
+ """
327
+ return json.dumps(self.to_dict(), indent=indent)
328
+
329
+ @classmethod
330
+ def from_json(cls, json_str: str) -> PipelineConfig:
331
+ """Reconstruct a ``PipelineConfig`` from a JSON string.
332
+
333
+ Parameters
334
+ ----------
335
+ json_str : str
336
+ JSON string as produced by ``to_json()``.
337
+
338
+ Returns
339
+ -------
340
+ PipelineConfig
341
+ Fully populated configuration instance.
342
+ """
343
+ return cls.from_dict(json.loads(json_str))
@@ -0,0 +1,29 @@
1
+ from ._config import (
2
+ ColumnImputationRecord,
3
+ ImputationConfig,
4
+ ImputationFitDiagnostic,
5
+ ImputationResult,
6
+ ImputationStrategy,
7
+ NumericImputationConfig,
8
+ )
9
+ from ._fitted_imputer import (
10
+ FittedColumnAbsentError,
11
+ FittedImputer,
12
+ UnfittedColumnError,
13
+ UnseenColumnError,
14
+ )
15
+ from .orchestrator import ImputationOrchestrator
16
+
17
+ __all__ = [
18
+ "ImputationStrategy",
19
+ "NumericImputationConfig",
20
+ "ImputationConfig",
21
+ "ImputationFitDiagnostic",
22
+ "ColumnImputationRecord",
23
+ "ImputationResult",
24
+ "FittedImputer",
25
+ "UnfittedColumnError",
26
+ "UnseenColumnError",
27
+ "FittedColumnAbsentError",
28
+ "ImputationOrchestrator",
29
+ ]