dataforge-ml 1.0.0__tar.gz → 1.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. dataforge_ml-1.0.1/PKG-INFO +95 -0
  2. dataforge_ml-1.0.1/README.md +71 -0
  3. {dataforge_ml-1.0.0 → dataforge_ml-1.0.1}/pyproject.toml +3 -2
  4. {dataforge_ml-1.0.0 → dataforge_ml-1.0.1}/src/dataforge_ml/__init__.py +1 -1
  5. {dataforge_ml-1.0.0 → dataforge_ml-1.0.1}/src/dataforge_ml/config.py +10 -0
  6. dataforge_ml-1.0.1/src/dataforge_ml/imputation/__init__.py +21 -0
  7. dataforge_ml-1.0.1/src/dataforge_ml/imputation/_config.py +165 -0
  8. dataforge_ml-1.0.1/src/dataforge_ml/imputation/_fitted_imputer.py +290 -0
  9. dataforge_ml-1.0.1/src/dataforge_ml/imputation/_numeric_imputer.py +372 -0
  10. dataforge_ml-1.0.1/src/dataforge_ml/imputation/orchestrator.py +163 -0
  11. {dataforge_ml-1.0.0 → dataforge_ml-1.0.1}/src/dataforge_ml/models/_data_types.py +2 -0
  12. {dataforge_ml-1.0.0 → dataforge_ml-1.0.1}/src/dataforge_ml/profiling/__init__.py +1 -1
  13. {dataforge_ml-1.0.0 → dataforge_ml-1.0.1}/src/dataforge_ml/profiling/_categorical.py +10 -1
  14. {dataforge_ml-1.0.0 → dataforge_ml-1.0.1}/src/dataforge_ml/profiling/_categorical_config.py +4 -1
  15. {dataforge_ml-1.0.0 → dataforge_ml-1.0.1}/src/dataforge_ml/profiling/_config.py +2 -0
  16. {dataforge_ml-1.0.0 → dataforge_ml-1.0.1}/src/dataforge_ml/profiling/_correlation_config.py +1 -1
  17. {dataforge_ml-1.0.0 → dataforge_ml-1.0.1}/src/dataforge_ml/profiling/_missingness_profiler.py +1 -1
  18. dataforge_ml-1.0.0/src/dataforge_ml/profiling/structural.py → dataforge_ml-1.0.1/src/dataforge_ml/profiling/orchestrator.py +2 -1
  19. dataforge_ml-1.0.1/src/dataforge_ml/splitting/_profile_signals.py +174 -0
  20. {dataforge_ml-1.0.0 → dataforge_ml-1.0.1}/src/dataforge_ml/splitting/_splitter.py +114 -2
  21. {dataforge_ml-1.0.0/src/dataforge_ml/profiling → dataforge_ml-1.0.1/src/dataforge_ml/utils}/_null_detection.py +2 -2
  22. dataforge_ml-1.0.1/src/dataforge_ml/utils/_null_normalization.py +64 -0
  23. dataforge_ml-1.0.1/src/dataforge_ml.egg-info/PKG-INFO +95 -0
  24. {dataforge_ml-1.0.0 → dataforge_ml-1.0.1}/src/dataforge_ml.egg-info/SOURCES.txt +9 -2
  25. {dataforge_ml-1.0.0 → dataforge_ml-1.0.1}/src/dataforge_ml.egg-info/requires.txt +1 -0
  26. dataforge_ml-1.0.0/PKG-INFO +0 -36
  27. dataforge_ml-1.0.0/README.md +0 -13
  28. dataforge_ml-1.0.0/src/dataforge_ml.egg-info/PKG-INFO +0 -36
  29. {dataforge_ml-1.0.0 → dataforge_ml-1.0.1}/LICENSE +0 -0
  30. {dataforge_ml-1.0.0 → dataforge_ml-1.0.1}/setup.cfg +0 -0
  31. {dataforge_ml-1.0.0 → dataforge_ml-1.0.1}/src/dataforge_ml/models/__init__.py +0 -0
  32. {dataforge_ml-1.0.0 → dataforge_ml-1.0.1}/src/dataforge_ml/models/_data_structure.py +0 -0
  33. {dataforge_ml-1.0.0 → dataforge_ml-1.0.1}/src/dataforge_ml/profiling/_base.py +0 -0
  34. {dataforge_ml-1.0.0 → dataforge_ml-1.0.1}/src/dataforge_ml/profiling/_boolean_config.py +0 -0
  35. {dataforge_ml-1.0.0 → dataforge_ml-1.0.1}/src/dataforge_ml/profiling/_boolean_profiler.py +0 -0
  36. {dataforge_ml-1.0.0 → dataforge_ml-1.0.1}/src/dataforge_ml/profiling/_correlation_profiler.py +0 -0
  37. {dataforge_ml-1.0.0 → dataforge_ml-1.0.1}/src/dataforge_ml/profiling/_datetime_config.py +0 -0
  38. {dataforge_ml-1.0.0 → dataforge_ml-1.0.1}/src/dataforge_ml/profiling/_datetime_profiler.py +0 -0
  39. {dataforge_ml-1.0.0 → dataforge_ml-1.0.1}/src/dataforge_ml/profiling/_missingness_config.py +0 -0
  40. {dataforge_ml-1.0.0 → dataforge_ml-1.0.1}/src/dataforge_ml/profiling/_numeric_config.py +0 -0
  41. {dataforge_ml-1.0.0 → dataforge_ml-1.0.1}/src/dataforge_ml/profiling/_numeric_profiler.py +0 -0
  42. {dataforge_ml-1.0.0 → dataforge_ml-1.0.1}/src/dataforge_ml/profiling/_tabular.py +0 -0
  43. {dataforge_ml-1.0.0 → dataforge_ml-1.0.1}/src/dataforge_ml/profiling/_target_config.py +0 -0
  44. {dataforge_ml-1.0.0 → dataforge_ml-1.0.1}/src/dataforge_ml/profiling/_target_profiler.py +0 -0
  45. {dataforge_ml-1.0.0 → dataforge_ml-1.0.1}/src/dataforge_ml/profiling/_text_config.py +0 -0
  46. {dataforge_ml-1.0.0 → dataforge_ml-1.0.1}/src/dataforge_ml/profiling/_text_profiler.py +0 -0
  47. {dataforge_ml-1.0.0 → dataforge_ml-1.0.1}/src/dataforge_ml/profiling/_type_detector.py +0 -0
  48. {dataforge_ml-1.0.0 → dataforge_ml-1.0.1}/src/dataforge_ml/splitting/__init__.py +0 -0
  49. {dataforge_ml-1.0.0 → dataforge_ml-1.0.1}/src/dataforge_ml/splitting/_config.py +0 -0
  50. {dataforge_ml-1.0.0 → dataforge_ml-1.0.1}/src/dataforge_ml/utils/__init__.py +0 -0
  51. {dataforge_ml-1.0.0 → dataforge_ml-1.0.1}/src/dataforge_ml/utils/data_loader.py +0 -0
  52. {dataforge_ml-1.0.0 → dataforge_ml-1.0.1}/src/dataforge_ml.egg-info/dependency_links.txt +0 -0
  53. {dataforge_ml-1.0.0 → dataforge_ml-1.0.1}/src/dataforge_ml.egg-info/top_level.txt +0 -0
@@ -0,0 +1,95 @@
1
+ Metadata-Version: 2.4
2
+ Name: dataforge-ml
3
+ Version: 1.0.1
4
+ Summary: An automated feature engineering and design pipeline library
5
+ License: MIT
6
+ Classifier: License :: OSI Approved :: MIT License
7
+ Classifier: Programming Language :: Python :: 3
8
+ Classifier: Programming Language :: Python :: 3.10
9
+ Classifier: Programming Language :: Python :: 3.11
10
+ Classifier: Programming Language :: Python :: 3.12
11
+ Requires-Python: >3.10
12
+ Description-Content-Type: text/markdown
13
+ License-File: LICENSE
14
+ Requires-Dist: polars>=1.0.0
15
+ Requires-Dist: scikit-learn>=1.0.0
16
+ Requires-Dist: scipy>=1.10.0
17
+ Requires-Dist: numpy>=2.0.0
18
+ Requires-Dist: pandas>=2.0.0
19
+ Requires-Dist: chardet>=5.0.0
20
+ Requires-Dist: iterative-stratification>=0.1.9
21
+ Provides-Extra: dev
22
+ Requires-Dist: pytest>=8.0; extra == "dev"
23
+ Dynamic: license-file
24
+
25
+ # DataForgeML
26
+
27
+ [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/DEVunderdog/DataForgeML)
28
+
29
+ Automated data profiling and splitting pipeline for ML datasets.
30
+
31
+ DataForgeML inspects your dataset, detects each column's semantic type (numeric, categorical, boolean, text, datetime, or identifier), computes per-column statistics and missingness, and produces a structured result ready for downstream feature engineering — no manual schema wrangling required.
32
+
33
+ ## Installation
34
+
35
+ ```bash
36
+ pip install dataforge-ml
37
+ ```
38
+
39
+ ## Quick Start
40
+
41
+ ```python
42
+ from dataforge_ml import DataLoader, PipelineConfig, StructuralProfiler
43
+
44
+ df = DataLoader().load("titanic.csv")
45
+
46
+ config = PipelineConfig()
47
+ result = StructuralProfiler(config).profile(df)
48
+
49
+ print(result.columns["Age"].semantic_type) # SemanticType.Numeric
50
+ print(result.dataset.row_count) # total rows
51
+ ```
52
+
53
+ `DataLoader` auto-detects encoding and delimiter. Supported formats: CSV, TSV, Parquet, JSON, NDJSON, JSONL, XLSX, XLS, Arrow, Feather.
54
+
55
+ ## Column Type Overrides
56
+
57
+ Override the auto-detected type for any column before profiling:
58
+
59
+ ```python
60
+ config = PipelineConfig()
61
+ config.set_column_type("PassengerId", "identifier") # skip stats entirely
62
+ config.set_columns_type(["Survived", "Pclass"], "categorical")
63
+
64
+ result = StructuralProfiler(config).profile(df)
65
+ ```
66
+
67
+ To drop a column from all processing entirely, use `exclude_columns`:
68
+
69
+ ```python
70
+ config = PipelineConfig(exclude_columns=["PassengerId", "Name"])
71
+ ```
72
+
73
+ ## Splitting
74
+
75
+ ```python
76
+ from dataforge_ml import DataLoader, DataSplitter
77
+
78
+ df = DataLoader().load("titanic.csv")
79
+ splitter = DataSplitter(df, target="Survived", random_seed=42)
80
+
81
+ # Random train/test split (stratified by default when target is set)
82
+ split = splitter.random_split(test_size=0.2)
83
+ print(split.train.shape, split.test.shape)
84
+
85
+ # Chronological split (no temporal leakage)
86
+ split = splitter.time_split(time_column="date", test_size=0.2)
87
+
88
+ # K-fold cross-validation
89
+ for fold in splitter.kfold(k=5):
90
+     print(f"Fold {fold.fold_index}: train={fold.train_size}, val={fold.val_size}")
91
+ ```
92
+
93
+ ## License
94
+
95
+ MIT
@@ -0,0 +1,71 @@
1
+ # DataForgeML
2
+
3
+ [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/DEVunderdog/DataForgeML)
4
+
5
+ Automated data profiling and splitting pipeline for ML datasets.
6
+
7
+ DataForgeML inspects your dataset, detects each column's semantic type (numeric, categorical, boolean, text, datetime, or identifier), computes per-column statistics and missingness, and produces a structured result ready for downstream feature engineering — no manual schema wrangling required.
8
+
9
+ ## Installation
10
+
11
+ ```bash
12
+ pip install dataforge-ml
13
+ ```
14
+
15
+ ## Quick Start
16
+
17
+ ```python
18
+ from dataforge_ml import DataLoader, PipelineConfig, StructuralProfiler
19
+
20
+ df = DataLoader().load("titanic.csv")
21
+
22
+ config = PipelineConfig()
23
+ result = StructuralProfiler(config).profile(df)
24
+
25
+ print(result.columns["Age"].semantic_type) # SemanticType.Numeric
26
+ print(result.dataset.row_count) # total rows
27
+ ```
28
+
29
+ `DataLoader` auto-detects encoding and delimiter. Supported formats: CSV, TSV, Parquet, JSON, NDJSON, JSONL, XLSX, XLS, Arrow, Feather.
30
+
31
+ ## Column Type Overrides
32
+
33
+ Override the auto-detected type for any column before profiling:
34
+
35
+ ```python
36
+ config = PipelineConfig()
37
+ config.set_column_type("PassengerId", "identifier") # skip stats entirely
38
+ config.set_columns_type(["Survived", "Pclass"], "categorical")
39
+
40
+ result = StructuralProfiler(config).profile(df)
41
+ ```
42
+
43
+ To drop a column from all processing entirely, use `exclude_columns`:
44
+
45
+ ```python
46
+ config = PipelineConfig(exclude_columns=["PassengerId", "Name"])
47
+ ```
48
+
49
+ ## Splitting
50
+
51
+ ```python
52
+ from dataforge_ml import DataLoader, DataSplitter
53
+
54
+ df = DataLoader().load("titanic.csv")
55
+ splitter = DataSplitter(df, target="Survived", random_seed=42)
56
+
57
+ # Random train/test split (stratified by default when target is set)
58
+ split = splitter.random_split(test_size=0.2)
59
+ print(split.train.shape, split.test.shape)
60
+
61
+ # Chronological split (no temporal leakage)
62
+ split = splitter.time_split(time_column="date", test_size=0.2)
63
+
64
+ # K-fold cross-validation
65
+ for fold in splitter.kfold(k=5):
66
+     print(f"Fold {fold.fold_index}: train={fold.train_size}, val={fold.val_size}")
67
+ ```
68
+
69
+ ## License
70
+
71
+ MIT
@@ -4,10 +4,10 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "dataforge-ml"
7
- version = "1.0.0"
7
+ version = "1.0.1"
8
8
  description = "An automated feature engineering and design pipeline library"
9
9
  readme = "README.md"
10
- requires-python = ">=3.10"
10
+ requires-python = ">3.10"
11
11
  license = {text = "MIT"}
12
12
  classifiers = [
13
13
  "License :: OSI Approved :: MIT License",
@@ -23,6 +23,7 @@ dependencies = [
23
23
  "numpy>=2.0.0",
24
24
  "pandas>=2.0.0",
25
25
  "chardet>=5.0.0",
26
+ "iterative-stratification>=0.1.9",
26
27
  ]
27
28
 
28
29
  [project.optional-dependencies]
@@ -1,5 +1,5 @@
1
1
  from .config import PipelineConfig, PipelinePhase, SemanticType, Modality
2
- from .profiling.structural import StructuralProfiler
2
+ from .profiling.orchestrator import StructuralProfiler
3
3
  from .profiling._config import (
4
4
  ProfileConfig,
5
5
  StructuralProfileResult,
@@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Union
7
7
 
8
8
  if TYPE_CHECKING:
9
9
  from dataforge_ml.profiling._config import ProfileConfig
10
+ from dataforge_ml.imputation._config import ImputationConfig
10
11
 
11
12
 
12
13
  class SemanticType(StrEnum):
@@ -36,6 +37,11 @@ def _default_profile_config() -> ProfileConfig:
36
37
  return ProfileConfig()
37
38
 
38
39
 
40
+ def _default_imputation_config() -> ImputationConfig:
41
+ from dataforge_ml.imputation._config import ImputationConfig
42
+ return ImputationConfig()
43
+
44
+
39
45
  @dataclass
40
46
  class PipelineConfig:
41
47
  """
@@ -58,6 +64,7 @@ class PipelineConfig:
58
64
  phase_exclusions: dict[PipelinePhase, list[str]] = field(default_factory=dict)
59
65
  column_overrides: dict[str, SemanticType] = field(default_factory=dict)
60
66
  profiling: ProfileConfig = field(default_factory=_default_profile_config)
67
+ imputation: ImputationConfig = field(default_factory=_default_imputation_config)
61
68
 
62
69
  def resolve_active_columns(
63
70
  self, phase: PipelinePhase, available_columns: list[str]
@@ -107,11 +114,13 @@ class PipelineConfig:
107
114
  for col, sem_type in self.column_overrides.items()
108
115
  },
109
116
  "profiling": self.profiling.to_dict(),
117
+ "imputation": self.imputation.to_dict(),
110
118
  }
111
119
 
112
120
  @classmethod
113
121
  def from_dict(cls, data: dict) -> PipelineConfig:
114
122
  from dataforge_ml.profiling._config import ProfileConfig
123
+ from dataforge_ml.imputation._config import ImputationConfig
115
124
  return cls(
116
125
  exclude_columns=list(data.get("exclude_columns", [])),
117
126
  phase_exclusions={
@@ -123,6 +132,7 @@ class PipelineConfig:
123
132
  for col, sem_str in data.get("column_overrides", {}).items()
124
133
  },
125
134
  profiling=ProfileConfig.from_dict(data.get("profiling", {})),
135
+ imputation=ImputationConfig.from_dict(data.get("imputation", {})),
126
136
  )
127
137
 
128
138
  def to_json(self, indent: int = 2) -> str:
@@ -0,0 +1,21 @@
1
+ from ._config import (
2
+ ColumnImputationRecord,
3
+ ImputationConfig,
4
+ ImputationResult,
5
+ ImputationStrategy,
6
+ NumericImputationConfig,
7
+ )
8
+ from ._fitted_imputer import FittedImputer, UnfittedColumnError
9
+ from .orchestrator import ImputationOrchestrator, SplitImbalanceWarning
10
+
11
+ __all__ = [
12
+ "ImputationStrategy",
13
+ "NumericImputationConfig",
14
+ "ImputationConfig",
15
+ "ColumnImputationRecord",
16
+ "ImputationResult",
17
+ "FittedImputer",
18
+ "UnfittedColumnError",
19
+ "ImputationOrchestrator",
20
+ "SplitImbalanceWarning",
21
+ ]
@@ -0,0 +1,165 @@
1
+ """
2
+ Configuration and result dataclasses for the imputation phase — Phase 2.
3
+
4
+ ImputationConfig controls strategy thresholds and MNAR declarations.
5
+ Result dataclasses carry per-column audit records and the imputed DataFrame.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from dataclasses import dataclass, field
11
+ from enum import StrEnum
12
+ from typing import Any, Optional
13
+
14
+ import polars as pl
15
+
16
+ from ..config import SemanticType
17
+
18
+
19
+ class ImputationStrategy(StrEnum):
20
+ Mean = "mean"
21
+ Median = "median"
22
+ Mode = "mode"
23
+ KNN = "knn"
24
+ Regression = "regression"
25
+ MICE = "mice"
26
+ Constant = "constant"
27
+ Dropped = "dropped"
28
+ Passthrough = "passthrough"
29
+
30
+
31
+ @dataclass
32
+ class NumericImputationConfig:
33
+ """
34
+ Operational thresholds for the numeric imputation sub-processor.
35
+
36
+ Parameters
37
+ ----------
38
+ knn_max_rows : int
39
+ Maximum number of rows before KNN is skipped in favour of Regression.
40
+ knn_max_features : int
41
+ Maximum number of features before KNN is skipped in favour of Regression.
42
+ regression_min_rows : int
43
+ Minimum number of rows required to fit a stable Regression model.
44
+ mnar_constant_fill : float
45
+ Constant value used to fill MNAR-declared numeric columns.
46
+ """
47
+
48
+ knn_max_rows: int = 50_000
49
+ knn_max_features: int = 50
50
+ regression_min_rows: int = 500
51
+ mnar_constant_fill: float = -1
52
+
53
+ def to_dict(self) -> dict:
54
+ return {
55
+ "knn_max_rows": self.knn_max_rows,
56
+ "knn_max_features": self.knn_max_features,
57
+ "regression_min_rows": self.regression_min_rows,
58
+ "mnar_constant_fill": self.mnar_constant_fill,
59
+ }
60
+
61
+ @classmethod
62
+ def from_dict(cls, data: dict) -> NumericImputationConfig:
63
+ return cls(
64
+ knn_max_rows=int(data.get("knn_max_rows", 50_000)),
65
+ knn_max_features=int(data.get("knn_max_features", 50)),
66
+ regression_min_rows=int(data.get("regression_min_rows", 500)),
67
+ mnar_constant_fill=float(data.get("mnar_constant_fill", -1)),
68
+ )
69
+
70
+
71
+ @dataclass
72
+ class ImputationConfig:
73
+ """
74
+ Cross-type Phase 2 configuration.
75
+
76
+ Parameters
77
+ ----------
78
+ numeric : NumericImputationConfig
79
+ Thresholds and fill values for numeric imputation.
80
+ mnar_columns : list[str]
81
+ Columns declared by the user as Missing Not At Random.
82
+ These receive Constant fill + a missingness indicator regardless
83
+ of the signals detected in Phase 1.
84
+ add_indicator_columns : list[str]
85
+ Columns for which a binary missingness indicator should be added
86
+ even when they are not MNAR.
87
+ """
88
+
89
+ numeric: NumericImputationConfig = field(default_factory=NumericImputationConfig)
90
+ mnar_columns: list[str] = field(default_factory=list)
91
+ add_indicator_columns: list[str] = field(default_factory=list)
92
+
93
+ def to_dict(self) -> dict:
94
+ return {
95
+ "numeric": self.numeric.to_dict(),
96
+ "mnar_columns": list(self.mnar_columns),
97
+ "add_indicator_columns": list(self.add_indicator_columns),
98
+ }
99
+
100
+ @classmethod
101
+ def from_dict(cls, data: dict) -> ImputationConfig:
102
+ return cls(
103
+ numeric=NumericImputationConfig.from_dict(data.get("numeric", {})),
104
+ mnar_columns=list(data.get("mnar_columns", [])),
105
+ add_indicator_columns=list(data.get("add_indicator_columns", [])),
106
+ )
107
+
108
+
109
+ @dataclass
110
+ class ColumnImputationRecord:
111
+ """
112
+ Per-column audit entry produced after fit().
113
+
114
+ Parameters
115
+ ----------
116
+ column : str
117
+ Column name.
118
+ semantic_type : SemanticType
119
+ Detected semantic type of the column.
120
+ strategy : ImputationStrategy
121
+ Strategy applied to this column.
122
+ fill_value : Any, optional
123
+ Scalar fill value learned from training data (None for model-based strategies).
124
+ indicator_added : bool
125
+ Whether a binary missingness indicator column was appended.
126
+ signals : list[str]
127
+ Human-readable reasons that drove the strategy decision.
128
+ """
129
+
130
+ column: str
131
+ semantic_type: SemanticType
132
+ strategy: ImputationStrategy
133
+ fill_value: Optional[Any] = None
134
+ indicator_added: bool = False
135
+ signals: list[str] = field(default_factory=list)
136
+
137
+ def to_dict(self) -> dict:
138
+ return {
139
+ "column": self.column,
140
+ "semantic_type": str(self.semantic_type),
141
+ "strategy": str(self.strategy),
142
+ "fill_value": self.fill_value,
143
+ "indicator_added": self.indicator_added,
144
+ "signals": list(self.signals),
145
+ }
146
+
147
+
148
+ @dataclass
149
+ class ImputationResult:
150
+ """
151
+ Output of FittedImputer.transform().
152
+
153
+ Parameters
154
+ ----------
155
+ dataframe : pl.DataFrame
156
+ DataFrame with imputed values (and any indicator columns appended).
157
+ records : dict[str, ColumnImputationRecord]
158
+ Per-column audit log keyed by column name.
159
+ dropped_columns : list[str]
160
+ Columns removed because they exceeded the drop threshold (>50% missing).
161
+ """
162
+
163
+ dataframe: pl.DataFrame
164
+ records: dict[str, ColumnImputationRecord] = field(default_factory=dict)
165
+ dropped_columns: list[str] = field(default_factory=list)