dataforge-ml 2.0.1__tar.gz → 2.0.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. {dataforge_ml-2.0.1/src/dataforge_ml.egg-info → dataforge_ml-2.0.3}/PKG-INFO +5 -4
  2. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/README.md +4 -3
  3. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/pyproject.toml +1 -1
  4. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/src/dataforge_ml/config.py +126 -78
  5. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/src/dataforge_ml/imputation/_config.py +308 -37
  6. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/src/dataforge_ml/imputation/_fitted_imputer.py +3 -3
  7. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/src/dataforge_ml/imputation/orchestrator.py +2 -0
  8. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/src/dataforge_ml/profiling/__init__.py +2 -1
  9. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/src/dataforge_ml/profiling/_base.py +85 -5
  10. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/src/dataforge_ml/profiling/_boolean_profiler.py +34 -2
  11. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/src/dataforge_ml/profiling/_categorical.py +20 -1
  12. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/src/dataforge_ml/profiling/_config.py +420 -42
  13. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/src/dataforge_ml/profiling/_datetime_profiler.py +56 -7
  14. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/src/dataforge_ml/profiling/_numeric_config.py +35 -15
  15. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/src/dataforge_ml/profiling/_numeric_profiler.py +57 -17
  16. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/src/dataforge_ml/profiling/_text_profiler.py +20 -1
  17. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/src/dataforge_ml/profiling/orchestrator.py +6 -2
  18. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3/src/dataforge_ml.egg-info}/PKG-INFO +5 -4
  19. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/LICENSE +0 -0
  20. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/setup.cfg +0 -0
  21. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/src/dataforge_ml/__init__.py +0 -0
  22. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/src/dataforge_ml/imputation/__init__.py +0 -0
  23. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/src/dataforge_ml/imputation/_numeric_imputer.py +0 -0
  24. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/src/dataforge_ml/imputation/_regression_estimator_factory.py +0 -0
  25. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/src/dataforge_ml/imputation/_strategy_router.py +0 -0
  26. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/src/dataforge_ml/imputation/_utils.py +0 -0
  27. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/src/dataforge_ml/models/__init__.py +0 -0
  28. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/src/dataforge_ml/models/_data_structure.py +0 -0
  29. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/src/dataforge_ml/models/_data_types.py +0 -0
  30. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/src/dataforge_ml/profiling/_boolean_config.py +0 -0
  31. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/src/dataforge_ml/profiling/_categorical_config.py +0 -0
  32. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/src/dataforge_ml/profiling/_correlation_config.py +0 -0
  33. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/src/dataforge_ml/profiling/_correlation_profiler.py +0 -0
  34. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/src/dataforge_ml/profiling/_datetime_config.py +0 -0
  35. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/src/dataforge_ml/profiling/_missingness_config.py +0 -0
  36. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/src/dataforge_ml/profiling/_missingness_profiler.py +0 -0
  37. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/src/dataforge_ml/profiling/_nonlinearity_profiler.py +0 -0
  38. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/src/dataforge_ml/profiling/_tabular.py +0 -0
  39. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/src/dataforge_ml/profiling/_target_config.py +0 -0
  40. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/src/dataforge_ml/profiling/_target_profiler.py +0 -0
  41. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/src/dataforge_ml/profiling/_text_config.py +0 -0
  42. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/src/dataforge_ml/profiling/_type_detection_config.py +0 -0
  43. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/src/dataforge_ml/profiling/_type_detector.py +0 -0
  44. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/src/dataforge_ml/splitting/__init__.py +0 -0
  45. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/src/dataforge_ml/splitting/_config.py +0 -0
  46. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/src/dataforge_ml/splitting/_profile_signals.py +0 -0
  47. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/src/dataforge_ml/splitting/_splitter.py +0 -0
  48. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/src/dataforge_ml/utils/__init__.py +0 -0
  49. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/src/dataforge_ml/utils/_null_detection.py +0 -0
  50. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/src/dataforge_ml/utils/_null_normalization.py +0 -0
  51. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/src/dataforge_ml/utils/data_loader.py +0 -0
  52. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/src/dataforge_ml.egg-info/SOURCES.txt +0 -0
  53. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/src/dataforge_ml.egg-info/dependency_links.txt +0 -0
  54. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/src/dataforge_ml.egg-info/requires.txt +0 -0
  55. {dataforge_ml-2.0.1 → dataforge_ml-2.0.3}/src/dataforge_ml.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dataforge-ml
3
- Version: 2.0.1
3
+ Version: 2.0.3
4
4
  Summary: A automated feature engineering and designing pipeline library
5
5
  License: MIT
6
6
  Classifier: License :: OSI Approved :: MIT License
@@ -65,15 +65,16 @@ Override the auto-detected type for any column before profiling:
65
65
  ```python
66
66
  config = PipelineConfig()
67
67
  config.set_column_type("PassengerId", "identifier") # skip stats entirely
68
- config.set_columns_type(["Survived", "Pclass"], "categorical")
68
+ config.set_column_type(["Survived", "Pclass"], "categorical")
69
69
 
70
70
  result = StructuralProfiler(config).profile(df)
71
71
  ```
72
72
 
73
- To drop a column from all processing entirely, use `exclude_columns`:
73
+ To drop a column from all processing entirely, use `add_exclusion`:
74
74
 
75
75
  ```python
76
- config = PipelineConfig(exclude_columns=["PassengerId", "Name"])
76
+ config = PipelineConfig()
77
+ config.add_exclusion(["PassengerId", "Name"])
77
78
  ```
78
79
 
79
80
  ## Splitting
@@ -35,15 +35,16 @@ Override the auto-detected type for any column before profiling:
35
35
  ```python
36
36
  config = PipelineConfig()
37
37
  config.set_column_type("PassengerId", "identifier") # skip stats entirely
38
- config.set_columns_type(["Survived", "Pclass"], "categorical")
38
+ config.set_column_type(["Survived", "Pclass"], "categorical")
39
39
 
40
40
  result = StructuralProfiler(config).profile(df)
41
41
  ```
42
42
 
43
- To drop a column from all processing entirely, use `exclude_columns`:
43
+ To drop a column from all processing entirely, use `add_exclusion`:
44
44
 
45
45
  ```python
46
- config = PipelineConfig(exclude_columns=["PassengerId", "Name"])
46
+ config = PipelineConfig()
47
+ config.add_exclusion(["PassengerId", "Name"])
47
48
  ```
48
49
 
49
50
  ## Splitting
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "dataforge-ml"
7
- version = "2.0.1"
7
+ version = "2.0.3"
8
8
  description = "A automated feature engineering and designing pipeline library"
9
9
  readme = "README.md"
10
10
  requires-python = ">3.10"
@@ -4,6 +4,7 @@ import json
4
4
  from dataclasses import dataclass, field
5
5
  from enum import StrEnum
6
6
  from typing import TYPE_CHECKING, Union, Optional
7
+ from types import MappingProxyType
7
8
 
8
9
  if TYPE_CHECKING:
9
10
  from dataforge_ml.profiling._config import ProfileConfig, NumericKind
@@ -75,17 +76,6 @@ class PipelineConfig:
75
76
 
76
77
  Parameters
77
78
  ----------
78
- exclude_columns : list[str]
79
- Hard exclusions — columns dropped globally from every phase.
80
- phase_exclusions : dict[PipelinePhase, list[str]]
81
- Soft exclusions — columns bypassed for a specific phase but retained
82
- in the dataset.
83
- column_overrides : dict[str, SemanticType]
84
- Explicit semantic type assignments respected by all downstream phases.
85
- numeric_kind_overrides : dict[str, NumericKind]
86
- Explicit ``NumericKind`` assignments for individual columns, applied
87
- after auto-detection in Phase 1. Only valid for columns whose final
88
- ``SemanticType`` is ``Numeric``; raises at orchestrator time otherwise.
89
79
  profiling : ProfileConfig
90
80
  Phase 1-specific parameters (correlation, chunking, memory threshold).
91
81
  imputation : ImputationConfig
@@ -96,17 +86,78 @@ class PipelineConfig:
96
86
  Single seed for all stochastic pipeline operations, including GMM
97
87
  Sampling during bimodal imputation. None produces non-deterministic
98
88
  output.
89
+
90
+ Attributes
91
+ ----------
92
+ exclude_columns : tuple[str, ...]
93
+ Hard exclusions — columns dropped globally from every phase.
94
+ phase_exclusions : MappingProxyType[PipelinePhase, tuple[str, ...]]
95
+ Soft exclusions — columns bypassed for a specific phase but retained
96
+ in the dataset.
97
+ column_overrides : MappingProxyType[str, SemanticType]
98
+ Explicit semantic type assignments respected by all downstream phases.
99
+ numeric_kind_overrides : MappingProxyType[str, NumericKind]
100
+ Explicit ``NumericKind`` assignments for individual columns, applied
101
+ after auto-detection in Phase 1. Only valid for columns whose final
102
+ ``SemanticType`` is ``Numeric``; raises at orchestrator time otherwise.
99
103
  """
100
104
 
101
- exclude_columns: list[str] = field(default_factory=list)
102
- phase_exclusions: dict[PipelinePhase, list[str]] = field(default_factory=dict)
103
- column_overrides: dict[str, SemanticType] = field(default_factory=dict)
104
- numeric_kind_overrides: dict[str, NumericKind] = field(default_factory=dict)
105
+ _exclude_columns: list[str] = field(default_factory=list, init=False)
106
+ _phase_exclusions: dict[PipelinePhase, list[str]] = field(default_factory=dict, init=False)
107
+ _column_overrides: dict[str, SemanticType] = field(default_factory=dict, init=False)
108
+ _numeric_kind_overrides: dict[str, NumericKind] = field(default_factory=dict, init=False)
105
109
  profiling: ProfileConfig = field(default_factory=_default_profile_config)
106
110
  imputation: ImputationConfig = field(default_factory=_default_imputation_config)
107
111
  split: SplitConfig = field(default_factory=_default_split_config)
108
112
  random_seed: Optional[int] = None
109
113
 
114
+ @property
115
+ def exclude_columns(self) -> tuple[str, ...]:
116
+ """Hard exclusions — columns dropped globally from every phase.
117
+
118
+ Returns
119
+ -------
120
+ tuple[str, ...]
121
+ A snapshot tuple of columns registered as hard exclusions.
122
+ """
123
+ return tuple(self._exclude_columns)
124
+
125
+ @property
126
+ def phase_exclusions(self) -> "MappingProxyType[PipelinePhase, tuple[str, ...]]":
127
+ """Soft exclusions — columns bypassed for a specific phase but retained in the dataset.
128
+
129
+ Returns
130
+ -------
131
+ MappingProxyType[PipelinePhase, tuple[str, ...]]
132
+ A read-only view mapping each phase to a tuple of excluded columns.
133
+ """
134
+ from types import MappingProxyType
135
+ return MappingProxyType({k: tuple(v) for k, v in self._phase_exclusions.items()})
136
+
137
+ @property
138
+ def column_overrides(self) -> "MappingProxyType[str, SemanticType]":
139
+ """Explicit semantic type assignments respected by all downstream phases.
140
+
141
+ Returns
142
+ -------
143
+ MappingProxyType[str, SemanticType]
144
+ A read-only view mapping columns to their explicitly assigned SemanticType.
145
+ """
146
+ from types import MappingProxyType
147
+ return MappingProxyType(self._column_overrides)
148
+
149
+ @property
150
+ def numeric_kind_overrides(self) -> "MappingProxyType[str, NumericKind]":
151
+ """Explicit NumericKind assignments for individual columns, applied after auto-detection in Phase 1.
152
+
153
+ Returns
154
+ -------
155
+ MappingProxyType[str, NumericKind]
156
+ A read-only view mapping columns to their explicitly assigned NumericKind.
157
+ """
158
+ from types import MappingProxyType
159
+ return MappingProxyType(self._numeric_kind_overrides)
160
+
110
161
  def resolve_active_columns(
111
162
  self, phase: PipelinePhase, available_columns: list[str]
112
163
  ) -> list[str]:
@@ -131,38 +182,61 @@ class PipelineConfig:
131
182
  original order.
132
183
  """
133
184
  hard_set = set(self.exclude_columns)
134
- soft_set = set(self.phase_exclusions.get(phase, []))
185
+ soft_set = set(self.phase_exclusions.get(phase, ()))
135
186
  excluded = hard_set | soft_set
136
187
  return [c for c in available_columns if c not in excluded]
137
188
 
138
- def add_exclusions(self, cols: list[str]) -> None:
189
+ def add_exclusion(self, column: Union[str, list[str]]) -> None:
139
190
  """Add columns to the hard exclusion set, deduplicating automatically.
140
191
 
141
- Columns already present in ``exclude_columns`` and duplicate entries
142
- within ``cols`` are silently ignored. Calling with an empty list is a
192
+ Columns already present in the exclusion list and duplicate entries
193
+ within the input are silently ignored. Calling with an empty list is a
143
194
  no-op.
144
195
 
145
196
  Parameters
146
197
  ----------
147
- cols : list[str]
148
- Column names to register as hard exclusions. Deduplication is
198
+ column : str or list[str]
199
+ Column name(s) to register as hard exclusions. Deduplication is
149
200
  handled here; callers do not need to pre-deduplicate.
150
201
  """
151
- existing = set(self.exclude_columns)
202
+ cols = [column] if isinstance(column, str) else column
203
+ existing = set(self._exclude_columns)
204
+ for col in cols:
205
+ if col not in existing:
206
+ self._exclude_columns.append(col)
207
+ existing.add(col)
208
+
209
+ def add_phase_exclusion(self, phase: Union[PipelinePhase, str], column: Union[str, list[str]]) -> None:
210
+ """Add columns to the soft exclusion set for a specific phase.
211
+
212
+ Parameters
213
+ ----------
214
+ phase : PipelinePhase or str
215
+ The phase for which to exclude the column(s).
216
+ column : str or list[str]
217
+ Column name(s) to register as soft exclusions for this phase.
218
+ Deduplication is handled automatically.
219
+ """
220
+ if isinstance(phase, str):
221
+ phase = PipelinePhase(phase)
222
+
223
+ cols = [column] if isinstance(column, str) else column
224
+ phase_list = self._phase_exclusions.setdefault(phase, [])
225
+ existing = set(phase_list)
152
226
  for col in cols:
153
227
  if col not in existing:
154
- self.exclude_columns.append(col)
228
+ phase_list.append(col)
155
229
  existing.add(col)
156
230
 
157
231
  def set_column_type(
158
- self, column: str, semantic_type: Union[str, SemanticType]
232
+ self, column: Union[str, list[str]], semantic_type: Union[str, SemanticType]
159
233
  ) -> None:
160
- """Explicitly set the semantic type for a column, overriding auto-detection.
234
+ """Explicitly set the semantic type for one or more columns, overriding auto-detection.
161
235
 
162
236
  Parameters
163
237
  ----------
164
- column : str
165
- Name of the column to override.
238
+ column : str or list[str]
239
+ Name of the column(s) to override.
166
240
  semantic_type : str or SemanticType
167
241
  The desired semantic type. Accepts enum values or their string
168
242
  equivalents (e.g. ``"numeric"``, ``"categorical"``).
@@ -182,32 +256,19 @@ class PipelineConfig:
182
256
  f"Unknown semantic type {semantic_type!r}. "
183
257
  f"Valid values: {valid}"
184
258
  )
185
- self.column_overrides[column] = semantic_type
186
-
187
- def set_columns_type(
188
- self, columns: list[str], semantic_type: Union[str, SemanticType]
189
- ) -> None:
190
- """Assign the same semantic type to every column in the list.
191
-
192
- Parameters
193
- ----------
194
- columns : list[str]
195
- Column names to override.
196
- semantic_type : str or SemanticType
197
- The desired semantic type applied to every column in the list.
198
- """
199
- for column in columns:
200
- self.set_column_type(column, semantic_type)
259
+ cols = [column] if isinstance(column, str) else column
260
+ for col in cols:
261
+ self._column_overrides[col] = semantic_type
201
262
 
202
263
  def set_numeric_kind(
203
- self, column: str, kind: Union[str, NumericKind]
264
+ self, column: Union[str, list[str]], kind: Union[str, NumericKind]
204
265
  ) -> None:
205
- """Explicitly set the ``NumericKind`` for a single column.
266
+ """Explicitly set the ``NumericKind`` for one or more columns.
206
267
 
207
268
  Parameters
208
269
  ----------
209
- column : str
210
- Name of the column to override.
270
+ column : str or list[str]
271
+ Name of the column(s) to override.
211
272
  kind : str or NumericKind
212
273
  The desired numeric kind. Accepts enum values or their string
213
274
  equivalents (``"continuous"``, ``"bounded_discrete"``).
@@ -227,22 +288,9 @@ class PipelineConfig:
227
288
  raise ValueError(
228
289
  f"Unknown NumericKind {kind!r}. Valid values: {valid}"
229
290
  )
230
- self.numeric_kind_overrides[column] = kind
231
-
232
- def set_columns_numeric_kind(
233
- self, columns: list[str], kind: Union[str, NumericKind]
234
- ) -> None:
235
- """Assign the same ``NumericKind`` to every column in the list.
236
-
237
- Parameters
238
- ----------
239
- columns : list[str]
240
- Column names to override.
241
- kind : str or NumericKind
242
- The desired numeric kind applied to every column in the list.
243
- """
244
- for column in columns:
245
- self.set_numeric_kind(column, kind)
291
+ cols = [column] if isinstance(column, str) else column
292
+ for col in cols:
293
+ self._numeric_kind_overrides[col] = kind
246
294
 
247
295
  def to_dict(self) -> dict:
248
296
  """Serialise the pipeline configuration to a plain dictionary.
@@ -288,28 +336,28 @@ class PipelineConfig:
288
336
  Fully populated configuration instance with all nested sub-configs
289
337
  restored.
290
338
  """
291
- from dataforge_ml.profiling._config import ProfileConfig, NumericKind as _NumericKind
339
+ from dataforge_ml.profiling._config import ProfileConfig
292
340
  from dataforge_ml.imputation._config import ImputationConfig
293
341
  from dataforge_ml.splitting._config import SplitConfig
294
- return cls(
295
- exclude_columns=list(data.get("exclude_columns", [])),
296
- phase_exclusions={
297
- PipelinePhase(phase_str): list(cols)
298
- for phase_str, cols in data.get("phase_exclusions", {}).items()
299
- },
300
- column_overrides={
301
- col: SemanticType(sem_str)
302
- for col, sem_str in data.get("column_overrides", {}).items()
303
- },
304
- numeric_kind_overrides={
305
- col: _NumericKind(kind_str)
306
- for col, kind_str in data.get("numeric_kind_overrides", {}).items()
307
- },
342
+ cfg = cls(
308
343
  profiling=ProfileConfig.from_dict(data.get("profiling", {})),
309
344
  imputation=ImputationConfig.from_dict(data.get("imputation", {})),
310
345
  split=SplitConfig.from_dict(data.get("split", {})),
311
346
  random_seed=data.get("random_seed"),
312
347
  )
348
+
349
+ cfg.add_exclusion(data.get("exclude_columns", []))
350
+
351
+ for phase_str, cols in data.get("phase_exclusions", {}).items():
352
+ cfg.add_phase_exclusion(phase_str, cols)
353
+
354
+ for col, sem_str in data.get("column_overrides", {}).items():
355
+ cfg.set_column_type(col, sem_str)
356
+
357
+ for col, kind_str in data.get("numeric_kind_overrides", {}).items():
358
+ cfg.set_numeric_kind(col, kind_str)
359
+
360
+ return cfg
313
361
 
314
362
  def to_json(self, indent: int = 2) -> str:
315
363
  """Serialise the pipeline configuration to a JSON string.