dataforge-ml 0.10.0__tar.gz → 0.11.0__tar.gz

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. {dataforge_ml-0.10.0 → dataforge_ml-0.11.0}/PKG-INFO +1 -1
  2. {dataforge_ml-0.10.0 → dataforge_ml-0.11.0}/pyproject.toml +1 -1
  3. {dataforge_ml-0.10.0 → dataforge_ml-0.11.0}/src/dataforge_ml/__init__.py +10 -5
  4. dataforge_ml-0.11.0/src/dataforge_ml/config.py +133 -0
  5. {dataforge_ml-0.10.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/__init__.py +1 -4
  6. {dataforge_ml-0.10.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/config.py +1 -122
  7. {dataforge_ml-0.10.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/structural.py +1 -4
  8. {dataforge_ml-0.10.0 → dataforge_ml-0.11.0}/src/dataforge_ml.egg-info/PKG-INFO +1 -1
  9. {dataforge_ml-0.10.0 → dataforge_ml-0.11.0}/src/dataforge_ml.egg-info/SOURCES.txt +1 -0
  10. {dataforge_ml-0.10.0 → dataforge_ml-0.11.0}/LICENSE +0 -0
  11. {dataforge_ml-0.10.0 → dataforge_ml-0.11.0}/README.md +0 -0
  12. {dataforge_ml-0.10.0 → dataforge_ml-0.11.0}/setup.cfg +0 -0
  13. {dataforge_ml-0.10.0 → dataforge_ml-0.11.0}/src/dataforge_ml/models/__init__.py +0 -0
  14. {dataforge_ml-0.10.0 → dataforge_ml-0.11.0}/src/dataforge_ml/models/_data_structure.py +0 -0
  15. {dataforge_ml-0.10.0 → dataforge_ml-0.11.0}/src/dataforge_ml/models/_data_types.py +0 -0
  16. {dataforge_ml-0.10.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/_base.py +0 -0
  17. {dataforge_ml-0.10.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/_boolean_config.py +0 -0
  18. {dataforge_ml-0.10.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/_boolean_profiler.py +0 -0
  19. {dataforge_ml-0.10.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/_categorical.py +0 -0
  20. {dataforge_ml-0.10.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/_categorical_config.py +0 -0
  21. {dataforge_ml-0.10.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/_correlation_config.py +0 -0
  22. {dataforge_ml-0.10.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/_correlation_profiler.py +0 -0
  23. {dataforge_ml-0.10.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/_datetime_config.py +0 -0
  24. {dataforge_ml-0.10.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/_datetime_profiler.py +0 -0
  25. {dataforge_ml-0.10.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/_missingness_config.py +0 -0
  26. {dataforge_ml-0.10.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/_missingness_profiler.py +0 -0
  27. {dataforge_ml-0.10.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/_null_detection.py +0 -0
  28. {dataforge_ml-0.10.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/_numeric_config.py +0 -0
  29. {dataforge_ml-0.10.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/_numeric_profiler.py +0 -0
  30. {dataforge_ml-0.10.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/_tabular.py +0 -0
  31. {dataforge_ml-0.10.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/_target_config.py +0 -0
  32. {dataforge_ml-0.10.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/_target_profiler.py +0 -0
  33. {dataforge_ml-0.10.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/_text_config.py +0 -0
  34. {dataforge_ml-0.10.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/_text_profiler.py +0 -0
  35. {dataforge_ml-0.10.0 → dataforge_ml-0.11.0}/src/dataforge_ml/profiling/_type_detector.py +0 -0
  36. {dataforge_ml-0.10.0 → dataforge_ml-0.11.0}/src/dataforge_ml/splitting/__init__.py +0 -0
  37. {dataforge_ml-0.10.0 → dataforge_ml-0.11.0}/src/dataforge_ml/splitting/_config.py +0 -0
  38. {dataforge_ml-0.10.0 → dataforge_ml-0.11.0}/src/dataforge_ml/splitting/_splitter.py +0 -0
  39. {dataforge_ml-0.10.0 → dataforge_ml-0.11.0}/src/dataforge_ml/utils/__init__.py +0 -0
  40. {dataforge_ml-0.10.0 → dataforge_ml-0.11.0}/src/dataforge_ml/utils/data_loader.py +0 -0
  41. {dataforge_ml-0.10.0 → dataforge_ml-0.11.0}/src/dataforge_ml.egg-info/dependency_links.txt +0 -0
  42. {dataforge_ml-0.10.0 → dataforge_ml-0.11.0}/src/dataforge_ml.egg-info/requires.txt +0 -0
  43. {dataforge_ml-0.10.0 → dataforge_ml-0.11.0}/src/dataforge_ml.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dataforge-ml
3
- Version: 0.10.0
3
+ Version: 0.11.0
4
4
  Summary: A automated feature engineering and designing pipeline library
5
5
  License: MIT
6
6
  Classifier: License :: OSI Approved :: MIT License
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "dataforge-ml"
7
- version = "0.10.0"
7
+ version = "0.11.0"
8
8
  description = "A automated feature engineering and designing pipeline library"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -1,21 +1,26 @@
1
+ from .config import PipelineConfig, PipelinePhase, SemanticType, Modality
1
2
  from .profiling.structural import StructuralProfiler
2
3
  from .profiling.config import (
3
4
  ProfileConfig,
4
- SemanticType,
5
- Modality,
6
5
  StructuralProfileResult,
6
+ ColumnProfile,
7
+ DatasetStats,
7
8
  )
8
9
  from .splitting import DataSplitter, SplitResult, FoldResult
9
10
  from .utils.data_loader import DataLoader
10
11
 
11
12
  __all__ = [
12
- "StructuralProfiler",
13
- "StructuralProfileResult",
13
+ "PipelineConfig",
14
+ "PipelinePhase",
14
15
  "ProfileConfig",
15
16
  "SemanticType",
16
17
  "Modality",
18
+ "StructuralProfiler",
19
+ "StructuralProfileResult",
20
+ "ColumnProfile",
21
+ "DatasetStats",
17
22
  "DataSplitter",
18
23
  "SplitResult",
19
24
  "FoldResult",
20
- "DataLoader"
25
+ "DataLoader",
21
26
  ]
@@ -0,0 +1,133 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from dataclasses import dataclass, field
5
+ from enum import StrEnum
6
+ from typing import TYPE_CHECKING, Union
7
+
8
+ if TYPE_CHECKING:
9
+ from dataforge_ml.profiling.config import ProfileConfig
10
+
11
+
12
+ class SemanticType(StrEnum):
13
+ Numeric = "numeric"
14
+ Categorical = "categorical"
15
+ Datetime = "datetime"
16
+ Boolean = "boolean"
17
+ Text = "text"
18
+ Identifier = "identifier"
19
+
20
+
21
+ class Modality(StrEnum):
22
+ Tabular = "tabular"
23
+
24
+
25
+ class PipelinePhase(StrEnum):
26
+ Profiling = "profiling"
27
+ Imputation = "imputation"
28
+ OutlierDetection = "outlier_detection"
29
+ Normalization = "normalization"
30
+ Encoding = "encoding"
31
+ Scaling = "scaling"
32
+
33
+
34
+ def _default_profile_config() -> ProfileConfig:
35
+ from dataforge_ml.profiling.config import ProfileConfig
36
+ return ProfileConfig()
37
+
38
+
39
+ @dataclass
40
+ class PipelineConfig:
41
+ """
42
+ Master configuration for the full 6-phase feature engineering pipeline.
43
+
44
+ Parameters
45
+ ----------
46
+ exclude_columns : list[str]
47
+ Hard exclusions — columns dropped globally from every phase.
48
+ phase_exclusions : dict[PipelinePhase, list[str]]
49
+ Soft exclusions — columns bypassed for a specific phase but retained
50
+ in the dataset.
51
+ column_overrides : dict[str, SemanticType]
52
+ Explicit semantic type assignments respected by all downstream phases.
53
+ profiling : ProfileConfig
54
+ Phase 1-specific parameters (correlation, chunking, memory threshold).
55
+ """
56
+
57
+ exclude_columns: list[str] = field(default_factory=list)
58
+ phase_exclusions: dict[PipelinePhase, list[str]] = field(default_factory=dict)
59
+ column_overrides: dict[str, SemanticType] = field(default_factory=dict)
60
+ profiling: ProfileConfig = field(default_factory=_default_profile_config)
61
+
62
+ def resolve_active_columns(
63
+ self, phase: PipelinePhase, available_columns: list[str]
64
+ ) -> list[str]:
65
+ """
66
+ Return the columns the given phase should operate on.
67
+
68
+ Hard exclusions are applied first, then phase-specific soft exclusions.
69
+ Columns absent from available_columns are silently ignored in both lists.
70
+ """
71
+ hard_set = set(self.exclude_columns)
72
+ soft_set = set(self.phase_exclusions.get(phase, []))
73
+ excluded = hard_set | soft_set
74
+ return [c for c in available_columns if c not in excluded]
75
+
76
+ def set_column_type(
77
+ self, column: str, semantic_type: Union[str, SemanticType]
78
+ ) -> None:
79
+ """Explicitly set the semantic type for a column, overriding auto-detection."""
80
+ if isinstance(semantic_type, str):
81
+ try:
82
+ semantic_type = SemanticType(semantic_type)
83
+ except ValueError:
84
+ valid = [e.value for e in SemanticType]
85
+ raise ValueError(
86
+ f"Unknown semantic type {semantic_type!r}. "
87
+ f"Valid values: {valid}"
88
+ )
89
+ self.column_overrides[column] = semantic_type
90
+
91
+ def set_columns_type(
92
+ self, columns: list[str], semantic_type: Union[str, SemanticType]
93
+ ) -> None:
94
+ """Assign the same semantic type to every column in the list."""
95
+ for column in columns:
96
+ self.set_column_type(column, semantic_type)
97
+
98
+ def to_dict(self) -> dict:
99
+ return {
100
+ "exclude_columns": list(self.exclude_columns),
101
+ "phase_exclusions": {
102
+ str(phase): list(cols)
103
+ for phase, cols in self.phase_exclusions.items()
104
+ },
105
+ "column_overrides": {
106
+ col: str(sem_type)
107
+ for col, sem_type in self.column_overrides.items()
108
+ },
109
+ "profiling": self.profiling.to_dict(),
110
+ }
111
+
112
+ @classmethod
113
+ def from_dict(cls, data: dict) -> PipelineConfig:
114
+ from dataforge_ml.profiling.config import ProfileConfig
115
+ return cls(
116
+ exclude_columns=list(data.get("exclude_columns", [])),
117
+ phase_exclusions={
118
+ PipelinePhase(phase_str): list(cols)
119
+ for phase_str, cols in data.get("phase_exclusions", {}).items()
120
+ },
121
+ column_overrides={
122
+ col: SemanticType(sem_str)
123
+ for col, sem_str in data.get("column_overrides", {}).items()
124
+ },
125
+ profiling=ProfileConfig.from_dict(data.get("profiling", {})),
126
+ )
127
+
128
+ def to_json(self, indent: int = 2) -> str:
129
+ return json.dumps(self.to_dict(), indent=indent)
130
+
131
+ @classmethod
132
+ def from_json(cls, json_str: str) -> PipelineConfig:
133
+ return cls.from_dict(json.loads(json_str))
@@ -1,10 +1,7 @@
1
1
  from .structural import StructuralProfiler
2
+ from ..config import PipelineConfig, PipelinePhase, SemanticType, Modality
2
3
  from .config import (
3
4
  ProfileConfig,
4
- PipelineConfig,
5
- PipelinePhase,
6
- SemanticType,
7
- Modality,
8
5
  TypeFlag,
9
6
  NumericKind,
10
7
  NumericStats,
@@ -12,6 +12,7 @@ from dataclasses import dataclass, field
12
12
  from enum import StrEnum
13
13
  from typing import Optional, Union
14
14
 
15
+ from ..config import SemanticType, Modality
15
16
  from ._missingness_config import (
16
17
  ColumnMissingnessProfile,
17
18
  )
@@ -31,36 +32,6 @@ from ._boolean_config import BooleanStats
31
32
  from ._text_config import TextStats
32
33
  from ._target_config import TargetProfileResult
33
34
 
34
- # ---------------------------------------------------------------------------
35
- # Core enums
36
- # ---------------------------------------------------------------------------
37
-
38
-
39
- class SemanticType(StrEnum):
40
- Numeric = "numeric"
41
- Categorical = "categorical"
42
- Datetime = "datetime"
43
- Boolean = "boolean"
44
- Text = "text"
45
- Identifier = "identifier"
46
-
47
-
48
- class Modality(StrEnum):
49
- Tabular = "tabular"
50
- # Placeholder slots for future modalities — no implementation yet.
51
- # Image = "image"
52
- # TimeSeries = "time_series"
53
-
54
-
55
- class PipelinePhase(StrEnum):
56
- Profiling = "profiling"
57
- Imputation = "imputation"
58
- OutlierDetection = "outlier_detection"
59
- Normalization = "normalization"
60
- Encoding = "encoding"
61
- Scaling = "scaling"
62
-
63
-
64
35
  # ---------------------------------------------------------------------------
65
36
  # Type-detection enums — kept for TypeDetector compatibility
66
37
  # ---------------------------------------------------------------------------
@@ -274,98 +245,6 @@ class ProfileConfig:
274
245
  return cls.from_dict(json.loads(json_str))
275
246
 
276
247
 
277
- @dataclass
278
- class PipelineConfig:
279
- """
280
- Master configuration for the full 6-phase feature engineering pipeline.
281
-
282
- Parameters
283
- ----------
284
- exclude_columns : list[str]
285
- Hard exclusions — columns dropped globally from every phase.
286
- phase_exclusions : dict[PipelinePhase, list[str]]
287
- Soft exclusions — columns bypassed for a specific phase but retained
288
- in the dataset.
289
- column_overrides : dict[str, SemanticType]
290
- Explicit semantic type assignments respected by all downstream phases.
291
- profiling : ProfileConfig
292
- Phase 1-specific parameters (correlation, chunking, memory threshold).
293
- """
294
-
295
- exclude_columns: list[str] = field(default_factory=list)
296
- phase_exclusions: dict[PipelinePhase, list[str]] = field(default_factory=dict)
297
- column_overrides: dict[str, SemanticType] = field(default_factory=dict)
298
- profiling: ProfileConfig = field(default_factory=ProfileConfig)
299
-
300
- def resolve_active_columns(
301
- self, phase: PipelinePhase, available_columns: list[str]
302
- ) -> list[str]:
303
- """
304
- Return the columns the given phase should operate on.
305
-
306
- Hard exclusions are applied first, then phase-specific soft exclusions.
307
- Columns absent from available_columns are silently ignored in both lists.
308
- """
309
- hard_set = set(self.exclude_columns)
310
- soft_set = set(self.phase_exclusions.get(phase, []))
311
- excluded = hard_set | soft_set
312
- return [c for c in available_columns if c not in excluded]
313
-
314
- def set_column_type(
315
- self, column: str, semantic_type: Union[str, "SemanticType"]
316
- ) -> None:
317
- """
318
- Explicitly set the semantic type for a column, overriding auto-detection.
319
- This override is respected by all downstream phases.
320
- """
321
- if isinstance(semantic_type, str):
322
- try:
323
- semantic_type = SemanticType(semantic_type)
324
- except ValueError:
325
- valid = [e.value for e in SemanticType]
326
- raise ValueError(
327
- f"Unknown semantic type {semantic_type!r}. "
328
- f"Valid values: {valid}"
329
- )
330
- self.column_overrides[column] = semantic_type
331
-
332
- def to_dict(self) -> dict:
333
- return {
334
- "exclude_columns": list(self.exclude_columns),
335
- "phase_exclusions": {
336
- str(phase): list(cols)
337
- for phase, cols in self.phase_exclusions.items()
338
- },
339
- "column_overrides": {
340
- col: str(sem_type)
341
- for col, sem_type in self.column_overrides.items()
342
- },
343
- "profiling": self.profiling.to_dict(),
344
- }
345
-
346
- @classmethod
347
- def from_dict(cls, data: dict) -> "PipelineConfig":
348
- return cls(
349
- exclude_columns=list(data.get("exclude_columns", [])),
350
- phase_exclusions={
351
- PipelinePhase(phase_str): list(cols)
352
- for phase_str, cols in data.get("phase_exclusions", {}).items()
353
- },
354
- column_overrides={
355
- col: SemanticType(sem_str)
356
- for col, sem_str in data.get("column_overrides", {}).items()
357
- },
358
- profiling=ProfileConfig.from_dict(data.get("profiling", {})),
359
- )
360
-
361
- def to_json(self, indent: int = 2) -> str:
362
- return json.dumps(self.to_dict(), indent=indent)
363
-
364
- @classmethod
365
- def from_json(cls, json_str: str) -> "PipelineConfig":
366
- return cls.from_dict(json.loads(json_str))
367
-
368
-
369
248
  @dataclass
370
249
  class ColumnTypeInfo:
371
250
  column: str
@@ -34,15 +34,12 @@ from ._missingness_profiler import MissingnessProfiler
34
34
  from ._target_profiler import TargetProfiler
35
35
  from ._correlation_profiler import CorrelationProfiler
36
36
  from ._type_detector import TypeDetector
37
+ from ..config import PipelineConfig, PipelinePhase, SemanticType, Modality
37
38
  from .config import (
38
- PipelineConfig,
39
- PipelinePhase,
40
39
  ColumnProfile,
41
40
  StructuralProfileResult,
42
41
  RowMissingnessDistribution,
43
- SemanticType,
44
42
  TypeFlag,
45
- Modality,
46
43
  )
47
44
 
48
45
  _ROW_DROP_THRESHOLD = 0.50
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dataforge-ml
3
- Version: 0.10.0
3
+ Version: 0.11.0
4
4
  Summary: A automated feature engineering and designing pipeline library
5
5
  License: MIT
6
6
  Classifier: License :: OSI Approved :: MIT License
@@ -2,6 +2,7 @@ LICENSE
2
2
  README.md
3
3
  pyproject.toml
4
4
  src/dataforge_ml/__init__.py
5
+ src/dataforge_ml/config.py
5
6
  src/dataforge_ml.egg-info/PKG-INFO
6
7
  src/dataforge_ml.egg-info/SOURCES.txt
7
8
  src/dataforge_ml.egg-info/dependency_links.txt
File without changes
File without changes
File without changes