dataforge-ml 0.9.0__tar.gz → 0.11.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/PKG-INFO +3 -3
- {dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/README.md +2 -2
- {dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/pyproject.toml +1 -1
- {dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/src/dataforge_ml/__init__.py +10 -5
- dataforge_ml-0.11.0/src/dataforge_ml/config.py +133 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/__init__.py +1 -4
- {dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/config.py +1 -161
- {dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/structural.py +1 -6
- {dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/src/dataforge_ml.egg-info/PKG-INFO +3 -3
- {dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/src/dataforge_ml.egg-info/SOURCES.txt +1 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/LICENSE +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/setup.cfg +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/src/dataforge_ml/models/__init__.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/src/dataforge_ml/models/_data_structure.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/src/dataforge_ml/models/_data_types.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/_base.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/_boolean_config.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/_boolean_profiler.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/_categorical.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/_categorical_config.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/_correlation_config.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/_correlation_profiler.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/_datetime_config.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/_datetime_profiler.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/_missingness_config.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/_missingness_profiler.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/_null_detection.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/_numeric_config.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/_numeric_profiler.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/_tabular.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/_target_config.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/_target_profiler.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/_text_config.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/_text_profiler.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/_type_detector.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/src/dataforge_ml/splitting/__init__.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/src/dataforge_ml/splitting/_config.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/src/dataforge_ml/splitting/_splitter.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/src/dataforge_ml/utils/__init__.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/src/dataforge_ml/utils/data_loader.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/src/dataforge_ml.egg-info/dependency_links.txt +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/src/dataforge_ml.egg-info/requires.txt +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/src/dataforge_ml.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dataforge-ml
|
|
3
|
-
Version: 0.9.0
|
|
3
|
+
Version: 0.11.0
|
|
4
4
|
Summary: A automated feature engineering and designing pipeline library
|
|
5
5
|
License: MIT
|
|
6
6
|
Classifier: License :: OSI Approved :: MIT License
|
|
@@ -21,9 +21,9 @@ Provides-Extra: dev
|
|
|
21
21
|
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
22
22
|
Dynamic: license-file
|
|
23
23
|
|
|
24
|
-
#
|
|
24
|
+
# DataForgeML
|
|
25
25
|
|
|
26
|
-
Automated feature engineering and data profiling pipeline library for
|
|
26
|
+
Automated feature engineering and data profiling pipeline library for datasets.
|
|
27
27
|
|
|
28
28
|
## Installation
|
|
29
29
|
|
|
@@ -1,21 +1,26 @@
|
|
|
1
|
+
from .config import PipelineConfig, PipelinePhase, SemanticType, Modality
|
|
1
2
|
from .profiling.structural import StructuralProfiler
|
|
2
3
|
from .profiling.config import (
|
|
3
4
|
ProfileConfig,
|
|
4
|
-
SemanticType,
|
|
5
|
-
Modality,
|
|
6
5
|
StructuralProfileResult,
|
|
6
|
+
ColumnProfile,
|
|
7
|
+
DatasetStats,
|
|
7
8
|
)
|
|
8
9
|
from .splitting import DataSplitter, SplitResult, FoldResult
|
|
9
10
|
from .utils.data_loader import DataLoader
|
|
10
11
|
|
|
11
12
|
__all__ = [
|
|
12
|
-
"
|
|
13
|
-
"
|
|
13
|
+
"PipelineConfig",
|
|
14
|
+
"PipelinePhase",
|
|
14
15
|
"ProfileConfig",
|
|
15
16
|
"SemanticType",
|
|
16
17
|
"Modality",
|
|
18
|
+
"StructuralProfiler",
|
|
19
|
+
"StructuralProfileResult",
|
|
20
|
+
"ColumnProfile",
|
|
21
|
+
"DatasetStats",
|
|
17
22
|
"DataSplitter",
|
|
18
23
|
"SplitResult",
|
|
19
24
|
"FoldResult",
|
|
20
|
-
"DataLoader"
|
|
25
|
+
"DataLoader",
|
|
21
26
|
]
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from enum import StrEnum
|
|
6
|
+
from typing import TYPE_CHECKING, Union
|
|
7
|
+
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
from dataforge_ml.profiling.config import ProfileConfig
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class SemanticType(StrEnum):
|
|
13
|
+
Numeric = "numeric"
|
|
14
|
+
Categorical = "categorical"
|
|
15
|
+
Datetime = "datetime"
|
|
16
|
+
Boolean = "boolean"
|
|
17
|
+
Text = "text"
|
|
18
|
+
Identifier = "identifier"
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class Modality(StrEnum):
|
|
22
|
+
Tabular = "tabular"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class PipelinePhase(StrEnum):
|
|
26
|
+
Profiling = "profiling"
|
|
27
|
+
Imputation = "imputation"
|
|
28
|
+
OutlierDetection = "outlier_detection"
|
|
29
|
+
Normalization = "normalization"
|
|
30
|
+
Encoding = "encoding"
|
|
31
|
+
Scaling = "scaling"
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _default_profile_config() -> ProfileConfig:
|
|
35
|
+
from dataforge_ml.profiling.config import ProfileConfig
|
|
36
|
+
return ProfileConfig()
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@dataclass
|
|
40
|
+
class PipelineConfig:
|
|
41
|
+
"""
|
|
42
|
+
Master configuration for the full 6-phase feature engineering pipeline.
|
|
43
|
+
|
|
44
|
+
Parameters
|
|
45
|
+
----------
|
|
46
|
+
exclude_columns : list[str]
|
|
47
|
+
Hard exclusions — columns dropped globally from every phase.
|
|
48
|
+
phase_exclusions : dict[PipelinePhase, list[str]]
|
|
49
|
+
Soft exclusions — columns bypassed for a specific phase but retained
|
|
50
|
+
in the dataset.
|
|
51
|
+
column_overrides : dict[str, SemanticType]
|
|
52
|
+
Explicit semantic type assignments respected by all downstream phases.
|
|
53
|
+
profiling : ProfileConfig
|
|
54
|
+
Phase 1-specific parameters (correlation, chunking, memory threshold).
|
|
55
|
+
"""
|
|
56
|
+
|
|
57
|
+
exclude_columns: list[str] = field(default_factory=list)
|
|
58
|
+
phase_exclusions: dict[PipelinePhase, list[str]] = field(default_factory=dict)
|
|
59
|
+
column_overrides: dict[str, SemanticType] = field(default_factory=dict)
|
|
60
|
+
profiling: ProfileConfig = field(default_factory=_default_profile_config)
|
|
61
|
+
|
|
62
|
+
def resolve_active_columns(
|
|
63
|
+
self, phase: PipelinePhase, available_columns: list[str]
|
|
64
|
+
) -> list[str]:
|
|
65
|
+
"""
|
|
66
|
+
Return the columns the given phase should operate on.
|
|
67
|
+
|
|
68
|
+
Hard exclusions are applied first, then phase-specific soft exclusions.
|
|
69
|
+
Columns absent from available_columns are silently ignored in both lists.
|
|
70
|
+
"""
|
|
71
|
+
hard_set = set(self.exclude_columns)
|
|
72
|
+
soft_set = set(self.phase_exclusions.get(phase, []))
|
|
73
|
+
excluded = hard_set | soft_set
|
|
74
|
+
return [c for c in available_columns if c not in excluded]
|
|
75
|
+
|
|
76
|
+
def set_column_type(
|
|
77
|
+
self, column: str, semantic_type: Union[str, SemanticType]
|
|
78
|
+
) -> None:
|
|
79
|
+
"""Explicitly set the semantic type for a column, overriding auto-detection."""
|
|
80
|
+
if isinstance(semantic_type, str):
|
|
81
|
+
try:
|
|
82
|
+
semantic_type = SemanticType(semantic_type)
|
|
83
|
+
except ValueError:
|
|
84
|
+
valid = [e.value for e in SemanticType]
|
|
85
|
+
raise ValueError(
|
|
86
|
+
f"Unknown semantic type {semantic_type!r}. "
|
|
87
|
+
f"Valid values: {valid}"
|
|
88
|
+
)
|
|
89
|
+
self.column_overrides[column] = semantic_type
|
|
90
|
+
|
|
91
|
+
def set_columns_type(
|
|
92
|
+
self, columns: list[str], semantic_type: Union[str, SemanticType]
|
|
93
|
+
) -> None:
|
|
94
|
+
"""Assign the same semantic type to every column in the list."""
|
|
95
|
+
for column in columns:
|
|
96
|
+
self.set_column_type(column, semantic_type)
|
|
97
|
+
|
|
98
|
+
def to_dict(self) -> dict:
|
|
99
|
+
return {
|
|
100
|
+
"exclude_columns": list(self.exclude_columns),
|
|
101
|
+
"phase_exclusions": {
|
|
102
|
+
str(phase): list(cols)
|
|
103
|
+
for phase, cols in self.phase_exclusions.items()
|
|
104
|
+
},
|
|
105
|
+
"column_overrides": {
|
|
106
|
+
col: str(sem_type)
|
|
107
|
+
for col, sem_type in self.column_overrides.items()
|
|
108
|
+
},
|
|
109
|
+
"profiling": self.profiling.to_dict(),
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
@classmethod
|
|
113
|
+
def from_dict(cls, data: dict) -> PipelineConfig:
|
|
114
|
+
from dataforge_ml.profiling.config import ProfileConfig
|
|
115
|
+
return cls(
|
|
116
|
+
exclude_columns=list(data.get("exclude_columns", [])),
|
|
117
|
+
phase_exclusions={
|
|
118
|
+
PipelinePhase(phase_str): list(cols)
|
|
119
|
+
for phase_str, cols in data.get("phase_exclusions", {}).items()
|
|
120
|
+
},
|
|
121
|
+
column_overrides={
|
|
122
|
+
col: SemanticType(sem_str)
|
|
123
|
+
for col, sem_str in data.get("column_overrides", {}).items()
|
|
124
|
+
},
|
|
125
|
+
profiling=ProfileConfig.from_dict(data.get("profiling", {})),
|
|
126
|
+
)
|
|
127
|
+
|
|
128
|
+
def to_json(self, indent: int = 2) -> str:
|
|
129
|
+
return json.dumps(self.to_dict(), indent=indent)
|
|
130
|
+
|
|
131
|
+
@classmethod
|
|
132
|
+
def from_json(cls, json_str: str) -> PipelineConfig:
|
|
133
|
+
return cls.from_dict(json.loads(json_str))
|
|
@@ -1,10 +1,7 @@
|
|
|
1
1
|
from .structural import StructuralProfiler
|
|
2
|
+
from ..config import PipelineConfig, PipelinePhase, SemanticType, Modality
|
|
2
3
|
from .config import (
|
|
3
4
|
ProfileConfig,
|
|
4
|
-
PipelineConfig,
|
|
5
|
-
PipelinePhase,
|
|
6
|
-
SemanticType,
|
|
7
|
-
Modality,
|
|
8
5
|
TypeFlag,
|
|
9
6
|
NumericKind,
|
|
10
7
|
NumericStats,
|
|
@@ -12,6 +12,7 @@ from dataclasses import dataclass, field
|
|
|
12
12
|
from enum import StrEnum
|
|
13
13
|
from typing import Optional, Union
|
|
14
14
|
|
|
15
|
+
from ..config import SemanticType, Modality
|
|
15
16
|
from ._missingness_config import (
|
|
16
17
|
ColumnMissingnessProfile,
|
|
17
18
|
)
|
|
@@ -31,36 +32,6 @@ from ._boolean_config import BooleanStats
|
|
|
31
32
|
from ._text_config import TextStats
|
|
32
33
|
from ._target_config import TargetProfileResult
|
|
33
34
|
|
|
34
|
-
# ---------------------------------------------------------------------------
|
|
35
|
-
# Core enums
|
|
36
|
-
# ---------------------------------------------------------------------------
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
class SemanticType(StrEnum):
|
|
40
|
-
Numeric = "numeric"
|
|
41
|
-
Categorical = "categorical"
|
|
42
|
-
Datetime = "datetime"
|
|
43
|
-
Boolean = "boolean"
|
|
44
|
-
Text = "text"
|
|
45
|
-
Identifier = "identifier"
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
class Modality(StrEnum):
|
|
49
|
-
Tabular = "tabular"
|
|
50
|
-
# Placeholder slots for future modalities — no implementation yet.
|
|
51
|
-
# Image = "image"
|
|
52
|
-
# TimeSeries = "time_series"
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
class PipelinePhase(StrEnum):
|
|
56
|
-
Profiling = "profiling"
|
|
57
|
-
Imputation = "imputation"
|
|
58
|
-
OutlierDetection = "outlier_detection"
|
|
59
|
-
Normalization = "normalization"
|
|
60
|
-
Encoding = "encoding"
|
|
61
|
-
Scaling = "scaling"
|
|
62
|
-
|
|
63
|
-
|
|
64
35
|
# ---------------------------------------------------------------------------
|
|
65
36
|
# Type-detection enums — kept for TypeDetector compatibility
|
|
66
37
|
# ---------------------------------------------------------------------------
|
|
@@ -227,10 +198,6 @@ class ProfileConfig:
|
|
|
227
198
|
Data modality. Currently only Tabular is implemented.
|
|
228
199
|
target_column : Optional[str]
|
|
229
200
|
Name of the label/target column, if any.
|
|
230
|
-
column_overrides : dict[str, SemanticType]
|
|
231
|
-
Explicit semantic type assignments that override auto-detection.
|
|
232
|
-
exclude_columns : list[str]
|
|
233
|
-
Columns to skip entirely during profiling.
|
|
234
201
|
compute_correlation : bool
|
|
235
202
|
Whether to compute the feature-feature correlation matrix.
|
|
236
203
|
correlation_target_column : Optional[str]
|
|
@@ -243,47 +210,16 @@ class ProfileConfig:
|
|
|
243
210
|
|
|
244
211
|
modality: Modality = Modality.Tabular
|
|
245
212
|
target_columns: list[str] = field(default_factory=list)
|
|
246
|
-
column_overrides: dict[str, SemanticType] = field(default_factory=dict)
|
|
247
|
-
exclude_columns: list[str] = field(default_factory=list)
|
|
248
213
|
compute_correlation: bool = False
|
|
249
214
|
correlation_target_column: Optional[str] = None
|
|
250
215
|
memory_threshold_mb: float = 500.0
|
|
251
216
|
chunk_size: int = 100_000
|
|
252
217
|
|
|
253
|
-
def set_column_type(self, column: str, semantic_type: Union[str, "SemanticType"]) -> None:
|
|
254
|
-
"""
|
|
255
|
-
Explicitly set the semantic type for a column, overriding auto-detection.
|
|
256
|
-
|
|
257
|
-
The override is the sole source of truth for that column's type — the
|
|
258
|
-
type detector's verdict is ignored during profiling. Calling this method
|
|
259
|
-
multiple times on the same column is valid; the last call wins.
|
|
260
|
-
|
|
261
|
-
Parameters
|
|
262
|
-
----------
|
|
263
|
-
column : str
|
|
264
|
-
Name of the column to override.
|
|
265
|
-
semantic_type : str | SemanticType
|
|
266
|
-
Target semantic type. Accepts a plain string (e.g. ``"numeric"``,
|
|
267
|
-
``"categorical"``) or a ``SemanticType`` enum value. Invalid strings
|
|
268
|
-
raise ``ValueError``.
|
|
269
|
-
"""
|
|
270
|
-
if isinstance(semantic_type, str):
|
|
271
|
-
try:
|
|
272
|
-
semantic_type = SemanticType(semantic_type)
|
|
273
|
-
except ValueError:
|
|
274
|
-
valid = [e.value for e in SemanticType]
|
|
275
|
-
raise ValueError(
|
|
276
|
-
f"Unknown semantic type {semantic_type!r}. "
|
|
277
|
-
f"Valid values: {valid}"
|
|
278
|
-
)
|
|
279
|
-
self.column_overrides[column] = semantic_type
|
|
280
218
|
|
|
281
219
|
def to_dict(self) -> dict:
|
|
282
220
|
return {
|
|
283
221
|
"modality": str(self.modality),
|
|
284
222
|
"target_columns": list(self.target_columns),
|
|
285
|
-
"column_overrides": {k: str(v) for k, v in self.column_overrides.items()},
|
|
286
|
-
"exclude_columns": list(self.exclude_columns),
|
|
287
223
|
"compute_correlation": self.compute_correlation,
|
|
288
224
|
"correlation_target_column": self.correlation_target_column,
|
|
289
225
|
"memory_threshold_mb": self.memory_threshold_mb,
|
|
@@ -295,10 +231,6 @@ class ProfileConfig:
|
|
|
295
231
|
return cls(
|
|
296
232
|
modality=Modality(data.get("modality", Modality.Tabular)),
|
|
297
233
|
target_columns=list(data.get("target_columns", [])),
|
|
298
|
-
column_overrides={
|
|
299
|
-
k: SemanticType(v) for k, v in data.get("column_overrides", {}).items()
|
|
300
|
-
},
|
|
301
|
-
exclude_columns=list(data.get("exclude_columns", [])),
|
|
302
234
|
compute_correlation=bool(data.get("compute_correlation", False)),
|
|
303
235
|
correlation_target_column=data.get("correlation_target_column"),
|
|
304
236
|
memory_threshold_mb=float(data.get("memory_threshold_mb", 500.0)),
|
|
@@ -313,98 +245,6 @@ class ProfileConfig:
|
|
|
313
245
|
return cls.from_dict(json.loads(json_str))
|
|
314
246
|
|
|
315
247
|
|
|
316
|
-
@dataclass
|
|
317
|
-
class PipelineConfig:
|
|
318
|
-
"""
|
|
319
|
-
Master configuration for the full 6-phase feature engineering pipeline.
|
|
320
|
-
|
|
321
|
-
Parameters
|
|
322
|
-
----------
|
|
323
|
-
exclude_columns : list[str]
|
|
324
|
-
Hard exclusions — columns dropped globally from every phase.
|
|
325
|
-
phase_exclusions : dict[PipelinePhase, list[str]]
|
|
326
|
-
Soft exclusions — columns bypassed for a specific phase but retained
|
|
327
|
-
in the dataset.
|
|
328
|
-
column_overrides : dict[str, SemanticType]
|
|
329
|
-
Explicit semantic type assignments respected by all downstream phases.
|
|
330
|
-
profiling : ProfileConfig
|
|
331
|
-
Phase 1-specific parameters (correlation, chunking, memory threshold).
|
|
332
|
-
"""
|
|
333
|
-
|
|
334
|
-
exclude_columns: list[str] = field(default_factory=list)
|
|
335
|
-
phase_exclusions: dict[PipelinePhase, list[str]] = field(default_factory=dict)
|
|
336
|
-
column_overrides: dict[str, SemanticType] = field(default_factory=dict)
|
|
337
|
-
profiling: ProfileConfig = field(default_factory=ProfileConfig)
|
|
338
|
-
|
|
339
|
-
def resolve_active_columns(
|
|
340
|
-
self, phase: PipelinePhase, available_columns: list[str]
|
|
341
|
-
) -> list[str]:
|
|
342
|
-
"""
|
|
343
|
-
Return the columns the given phase should operate on.
|
|
344
|
-
|
|
345
|
-
Hard exclusions are applied first, then phase-specific soft exclusions.
|
|
346
|
-
Columns absent from available_columns are silently ignored in both lists.
|
|
347
|
-
"""
|
|
348
|
-
hard_set = set(self.exclude_columns)
|
|
349
|
-
soft_set = set(self.phase_exclusions.get(phase, []))
|
|
350
|
-
excluded = hard_set | soft_set
|
|
351
|
-
return [c for c in available_columns if c not in excluded]
|
|
352
|
-
|
|
353
|
-
def set_column_type(
|
|
354
|
-
self, column: str, semantic_type: Union[str, "SemanticType"]
|
|
355
|
-
) -> None:
|
|
356
|
-
"""
|
|
357
|
-
Explicitly set the semantic type for a column, overriding auto-detection.
|
|
358
|
-
This override is respected by all downstream phases.
|
|
359
|
-
"""
|
|
360
|
-
if isinstance(semantic_type, str):
|
|
361
|
-
try:
|
|
362
|
-
semantic_type = SemanticType(semantic_type)
|
|
363
|
-
except ValueError:
|
|
364
|
-
valid = [e.value for e in SemanticType]
|
|
365
|
-
raise ValueError(
|
|
366
|
-
f"Unknown semantic type {semantic_type!r}. "
|
|
367
|
-
f"Valid values: {valid}"
|
|
368
|
-
)
|
|
369
|
-
self.column_overrides[column] = semantic_type
|
|
370
|
-
|
|
371
|
-
def to_dict(self) -> dict:
|
|
372
|
-
return {
|
|
373
|
-
"exclude_columns": list(self.exclude_columns),
|
|
374
|
-
"phase_exclusions": {
|
|
375
|
-
str(phase): list(cols)
|
|
376
|
-
for phase, cols in self.phase_exclusions.items()
|
|
377
|
-
},
|
|
378
|
-
"column_overrides": {
|
|
379
|
-
col: str(sem_type)
|
|
380
|
-
for col, sem_type in self.column_overrides.items()
|
|
381
|
-
},
|
|
382
|
-
"profiling": self.profiling.to_dict(),
|
|
383
|
-
}
|
|
384
|
-
|
|
385
|
-
@classmethod
|
|
386
|
-
def from_dict(cls, data: dict) -> "PipelineConfig":
|
|
387
|
-
return cls(
|
|
388
|
-
exclude_columns=list(data.get("exclude_columns", [])),
|
|
389
|
-
phase_exclusions={
|
|
390
|
-
PipelinePhase(phase_str): list(cols)
|
|
391
|
-
for phase_str, cols in data.get("phase_exclusions", {}).items()
|
|
392
|
-
},
|
|
393
|
-
column_overrides={
|
|
394
|
-
col: SemanticType(sem_str)
|
|
395
|
-
for col, sem_str in data.get("column_overrides", {}).items()
|
|
396
|
-
},
|
|
397
|
-
profiling=ProfileConfig.from_dict(data.get("profiling", {})),
|
|
398
|
-
)
|
|
399
|
-
|
|
400
|
-
def to_json(self, indent: int = 2) -> str:
|
|
401
|
-
return json.dumps(self.to_dict(), indent=indent)
|
|
402
|
-
|
|
403
|
-
@classmethod
|
|
404
|
-
def from_json(cls, json_str: str) -> "PipelineConfig":
|
|
405
|
-
return cls.from_dict(json.loads(json_str))
|
|
406
|
-
|
|
407
|
-
|
|
408
248
|
@dataclass
|
|
409
249
|
class ColumnTypeInfo:
|
|
410
250
|
column: str
|
|
@@ -34,15 +34,12 @@ from ._missingness_profiler import MissingnessProfiler
|
|
|
34
34
|
from ._target_profiler import TargetProfiler
|
|
35
35
|
from ._correlation_profiler import CorrelationProfiler
|
|
36
36
|
from ._type_detector import TypeDetector
|
|
37
|
+
from ..config import PipelineConfig, PipelinePhase, SemanticType, Modality
|
|
37
38
|
from .config import (
|
|
38
|
-
PipelineConfig,
|
|
39
|
-
PipelinePhase,
|
|
40
39
|
ColumnProfile,
|
|
41
40
|
StructuralProfileResult,
|
|
42
41
|
RowMissingnessDistribution,
|
|
43
|
-
SemanticType,
|
|
44
42
|
TypeFlag,
|
|
45
|
-
Modality,
|
|
46
43
|
)
|
|
47
44
|
|
|
48
45
|
_ROW_DROP_THRESHOLD = 0.50
|
|
@@ -67,8 +64,6 @@ class StructuralProfiler:
|
|
|
67
64
|
|
|
68
65
|
def __init__(self, config: PipelineConfig | None = None) -> None:
|
|
69
66
|
self.config: PipelineConfig = config or PipelineConfig()
|
|
70
|
-
# Keep sub-profilers aligned with the master column_overrides.
|
|
71
|
-
self.config.profiling.column_overrides = self.config.column_overrides
|
|
72
67
|
|
|
73
68
|
if self.config.profiling.modality == Modality.Tabular:
|
|
74
69
|
self.modality_profiler: ModalityProfiler = TabularProfiler()
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dataforge-ml
|
|
3
|
-
Version: 0.9.0
|
|
3
|
+
Version: 0.11.0
|
|
4
4
|
Summary: A automated feature engineering and designing pipeline library
|
|
5
5
|
License: MIT
|
|
6
6
|
Classifier: License :: OSI Approved :: MIT License
|
|
@@ -21,9 +21,9 @@ Provides-Extra: dev
|
|
|
21
21
|
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
22
22
|
Dynamic: license-file
|
|
23
23
|
|
|
24
|
-
#
|
|
24
|
+
# DataForgeML
|
|
25
25
|
|
|
26
|
-
Automated feature engineering and data profiling pipeline library for
|
|
26
|
+
Automated feature engineering and data profiling pipeline library for datasets.
|
|
27
27
|
|
|
28
28
|
## Installation
|
|
29
29
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/_categorical_config.py
RENAMED
|
File without changes
|
{dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/_correlation_config.py
RENAMED
|
File without changes
|
{dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/_correlation_profiler.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/_missingness_config.py
RENAMED
|
File without changes
|
{dataforge_ml-0.9.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/_missingness_profiler.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|