dataforge-ml 0.9.0__tar.gz → 0.10.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dataforge_ml-0.9.0 → dataforge_ml-0.10.0}/PKG-INFO +3 -3
- {dataforge_ml-0.9.0 → dataforge_ml-0.10.0}/README.md +2 -2
- {dataforge_ml-0.9.0 → dataforge_ml-0.10.0}/pyproject.toml +1 -1
- {dataforge_ml-0.9.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/config.py +0 -39
- {dataforge_ml-0.9.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/structural.py +0 -2
- {dataforge_ml-0.9.0 → dataforge_ml-0.10.0}/src/dataforge_ml.egg-info/PKG-INFO +3 -3
- {dataforge_ml-0.9.0 → dataforge_ml-0.10.0}/LICENSE +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.10.0}/setup.cfg +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.10.0}/src/dataforge_ml/__init__.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.10.0}/src/dataforge_ml/models/__init__.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.10.0}/src/dataforge_ml/models/_data_structure.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.10.0}/src/dataforge_ml/models/_data_types.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/__init__.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_base.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_boolean_config.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_boolean_profiler.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_categorical.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_categorical_config.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_correlation_config.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_correlation_profiler.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_datetime_config.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_datetime_profiler.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_missingness_config.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_missingness_profiler.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_null_detection.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_numeric_config.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_numeric_profiler.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_tabular.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_target_config.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_target_profiler.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_text_config.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_text_profiler.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_type_detector.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.10.0}/src/dataforge_ml/splitting/__init__.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.10.0}/src/dataforge_ml/splitting/_config.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.10.0}/src/dataforge_ml/splitting/_splitter.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.10.0}/src/dataforge_ml/utils/__init__.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.10.0}/src/dataforge_ml/utils/data_loader.py +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.10.0}/src/dataforge_ml.egg-info/SOURCES.txt +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.10.0}/src/dataforge_ml.egg-info/dependency_links.txt +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.10.0}/src/dataforge_ml.egg-info/requires.txt +0 -0
- {dataforge_ml-0.9.0 → dataforge_ml-0.10.0}/src/dataforge_ml.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dataforge-ml
|
|
3
|
-
Version: 0.9.0
|
|
3
|
+
Version: 0.10.0
|
|
4
4
|
Summary: A automated feature engineering and designing pipeline library
|
|
5
5
|
License: MIT
|
|
6
6
|
Classifier: License :: OSI Approved :: MIT License
|
|
@@ -21,9 +21,9 @@ Provides-Extra: dev
|
|
|
21
21
|
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
22
22
|
Dynamic: license-file
|
|
23
23
|
|
|
24
|
-
#
|
|
24
|
+
# DataForgeML
|
|
25
25
|
|
|
26
|
-
Automated feature engineering and data profiling pipeline library for
|
|
26
|
+
Automated feature engineering and data profiling pipeline library for datasets.
|
|
27
27
|
|
|
28
28
|
## Installation
|
|
29
29
|
|
|
@@ -227,10 +227,6 @@ class ProfileConfig:
|
|
|
227
227
|
Data modality. Currently only Tabular is implemented.
|
|
228
228
|
target_column : Optional[str]
|
|
229
229
|
Name of the label/target column, if any.
|
|
230
|
-
column_overrides : dict[str, SemanticType]
|
|
231
|
-
Explicit semantic type assignments that override auto-detection.
|
|
232
|
-
exclude_columns : list[str]
|
|
233
|
-
Columns to skip entirely during profiling.
|
|
234
230
|
compute_correlation : bool
|
|
235
231
|
Whether to compute the feature-feature correlation matrix.
|
|
236
232
|
correlation_target_column : Optional[str]
|
|
@@ -243,47 +239,16 @@ class ProfileConfig:
|
|
|
243
239
|
|
|
244
240
|
modality: Modality = Modality.Tabular
|
|
245
241
|
target_columns: list[str] = field(default_factory=list)
|
|
246
|
-
column_overrides: dict[str, SemanticType] = field(default_factory=dict)
|
|
247
|
-
exclude_columns: list[str] = field(default_factory=list)
|
|
248
242
|
compute_correlation: bool = False
|
|
249
243
|
correlation_target_column: Optional[str] = None
|
|
250
244
|
memory_threshold_mb: float = 500.0
|
|
251
245
|
chunk_size: int = 100_000
|
|
252
246
|
|
|
253
|
-
def set_column_type(self, column: str, semantic_type: Union[str, "SemanticType"]) -> None:
|
|
254
|
-
"""
|
|
255
|
-
Explicitly set the semantic type for a column, overriding auto-detection.
|
|
256
|
-
|
|
257
|
-
The override is the sole source of truth for that column's type — the
|
|
258
|
-
type detector's verdict is ignored during profiling. Calling this method
|
|
259
|
-
multiple times on the same column is valid; the last call wins.
|
|
260
|
-
|
|
261
|
-
Parameters
|
|
262
|
-
----------
|
|
263
|
-
column : str
|
|
264
|
-
Name of the column to override.
|
|
265
|
-
semantic_type : str | SemanticType
|
|
266
|
-
Target semantic type. Accepts a plain string (e.g. ``"numeric"``,
|
|
267
|
-
``"categorical"``) or a ``SemanticType`` enum value. Invalid strings
|
|
268
|
-
raise ``ValueError``.
|
|
269
|
-
"""
|
|
270
|
-
if isinstance(semantic_type, str):
|
|
271
|
-
try:
|
|
272
|
-
semantic_type = SemanticType(semantic_type)
|
|
273
|
-
except ValueError:
|
|
274
|
-
valid = [e.value for e in SemanticType]
|
|
275
|
-
raise ValueError(
|
|
276
|
-
f"Unknown semantic type {semantic_type!r}. "
|
|
277
|
-
f"Valid values: {valid}"
|
|
278
|
-
)
|
|
279
|
-
self.column_overrides[column] = semantic_type
|
|
280
247
|
|
|
281
248
|
def to_dict(self) -> dict:
|
|
282
249
|
return {
|
|
283
250
|
"modality": str(self.modality),
|
|
284
251
|
"target_columns": list(self.target_columns),
|
|
285
|
-
"column_overrides": {k: str(v) for k, v in self.column_overrides.items()},
|
|
286
|
-
"exclude_columns": list(self.exclude_columns),
|
|
287
252
|
"compute_correlation": self.compute_correlation,
|
|
288
253
|
"correlation_target_column": self.correlation_target_column,
|
|
289
254
|
"memory_threshold_mb": self.memory_threshold_mb,
|
|
@@ -295,10 +260,6 @@ class ProfileConfig:
|
|
|
295
260
|
return cls(
|
|
296
261
|
modality=Modality(data.get("modality", Modality.Tabular)),
|
|
297
262
|
target_columns=list(data.get("target_columns", [])),
|
|
298
|
-
column_overrides={
|
|
299
|
-
k: SemanticType(v) for k, v in data.get("column_overrides", {}).items()
|
|
300
|
-
},
|
|
301
|
-
exclude_columns=list(data.get("exclude_columns", [])),
|
|
302
263
|
compute_correlation=bool(data.get("compute_correlation", False)),
|
|
303
264
|
correlation_target_column=data.get("correlation_target_column"),
|
|
304
265
|
memory_threshold_mb=float(data.get("memory_threshold_mb", 500.0)),
|
|
@@ -67,8 +67,6 @@ class StructuralProfiler:
|
|
|
67
67
|
|
|
68
68
|
def __init__(self, config: PipelineConfig | None = None) -> None:
|
|
69
69
|
self.config: PipelineConfig = config or PipelineConfig()
|
|
70
|
-
# Keep sub-profilers aligned with the master column_overrides.
|
|
71
|
-
self.config.profiling.column_overrides = self.config.column_overrides
|
|
72
70
|
|
|
73
71
|
if self.config.profiling.modality == Modality.Tabular:
|
|
74
72
|
self.modality_profiler: ModalityProfiler = TabularProfiler()
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dataforge-ml
|
|
3
|
-
Version: 0.9.0
|
|
3
|
+
Version: 0.10.0
|
|
4
4
|
Summary: A automated feature engineering and designing pipeline library
|
|
5
5
|
License: MIT
|
|
6
6
|
Classifier: License :: OSI Approved :: MIT License
|
|
@@ -21,9 +21,9 @@ Provides-Extra: dev
|
|
|
21
21
|
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
22
22
|
Dynamic: license-file
|
|
23
23
|
|
|
24
|
-
#
|
|
24
|
+
# DataForgeML
|
|
25
25
|
|
|
26
|
-
Automated feature engineering and data profiling pipeline library for
|
|
26
|
+
Automated feature engineering and data profiling pipeline library for datasets.
|
|
27
27
|
|
|
28
28
|
## Installation
|
|
29
29
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dataforge_ml-0.9.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_categorical_config.py
RENAMED
|
File without changes
|
{dataforge_ml-0.9.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_correlation_config.py
RENAMED
|
File without changes
|
{dataforge_ml-0.9.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_correlation_profiler.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dataforge_ml-0.9.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_missingness_config.py
RENAMED
|
File without changes
|
{dataforge_ml-0.9.0 → dataforge_ml-0.10.0}/src/dataforge_ml/profiling/_missingness_profiler.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|