dataforge-ml 0.7.0__tar.gz → 0.8.0__tar.gz
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/PKG-INFO +1 -1
- {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/pyproject.toml +1 -1
- {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_missingness_profiler.py +3 -1
- {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_numeric_profiler.py +3 -2
- {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/config.py +29 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/structural.py +5 -1
- {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/utils/data_loader.py +1 -3
- {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml.egg-info/PKG-INFO +1 -1
- {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/LICENSE +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/README.md +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/setup.cfg +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/__init__.py +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/models/__init__.py +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/models/_data_structure.py +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/models/_data_types.py +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/__init__.py +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_base.py +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_boolean_config.py +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_boolean_profiler.py +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_categorical.py +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_categorical_config.py +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_correlation_config.py +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_correlation_profiler.py +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_datetime_config.py +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_datetime_profiler.py +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_missingness_config.py +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_numeric_config.py +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_tabular.py +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_target_config.py +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_target_profiler.py +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_text_config.py +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_text_profiler.py +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_type_detector.py +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/splitting/__init__.py +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/splitting/_config.py +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/splitting/_splitter.py +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/utils/__init__.py +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml.egg-info/SOURCES.txt +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml.egg-info/dependency_links.txt +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml.egg-info/requires.txt +0 -0
- {dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml.egg-info/top_level.txt +0 -0
{dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_missingness_profiler.py
RENAMED
|
@@ -208,7 +208,9 @@ class MissingnessProfiler(DatasetLevelProfiler[MissingnessProfileResult]):
|
|
|
208
208
|
|
|
209
209
|
r = profile.effective_null_ratio
|
|
210
210
|
|
|
211
|
-
if r
|
|
211
|
+
if r == 0.0:
|
|
212
|
+
profile.severity = None
|
|
213
|
+
elif r < _SEVERITY_MINOR:
|
|
212
214
|
profile.severity = MissingSeverity.Minor
|
|
213
215
|
elif r < _SEVERITY_MODERATE:
|
|
214
216
|
profile.severity = MissingSeverity.Moderate
|
|
@@ -254,16 +254,17 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
|
|
|
254
254
|
for i in range(top_rows)
|
|
255
255
|
]
|
|
256
256
|
else:
|
|
257
|
-
# ---
|
|
257
|
+
# --- Histogram Distribution (Continuous) ---
|
|
258
258
|
import numpy as np
|
|
259
259
|
|
|
260
260
|
counts, bin_edges = np.histogram(clean_f64.to_numpy(), bins="auto")
|
|
261
|
+
n_clean = clean_f64.len()
|
|
261
262
|
profile.histogram = [
|
|
262
263
|
HistogramBin(
|
|
263
264
|
lower_bound=float(bin_edges[i]),
|
|
264
265
|
upper_bound=float(bin_edges[i + 1]),
|
|
265
266
|
count=int(counts[i]),
|
|
266
|
-
percentage=int(counts[i]) /
|
|
267
|
+
percentage=int(counts[i]) / n_clean if n_clean > 0 else 0.0,
|
|
267
268
|
)
|
|
268
269
|
for i in range(len(counts))
|
|
269
270
|
]
|
|
@@ -71,6 +71,7 @@ class TypeFlag(StrEnum):
|
|
|
71
71
|
SequentialIndex = "sequential_index"
|
|
72
72
|
FloatSequentialIndex = "float_sequential_index"
|
|
73
73
|
FreeTextCandidate = "free_text_candidate"
|
|
74
|
+
UserOverride = "user_override"
|
|
74
75
|
|
|
75
76
|
|
|
76
77
|
# ---------------------------------------------------------------------------
|
|
@@ -240,6 +241,34 @@ class ProfileConfig:
|
|
|
240
241
|
memory_threshold_mb: float = 500.0
|
|
241
242
|
chunk_size: int = 100_000
|
|
242
243
|
|
|
244
|
+
def set_column_type(self, column: str, semantic_type: Union[str, "SemanticType"]) -> None:
|
|
245
|
+
"""
|
|
246
|
+
Explicitly set the semantic type for a column, overriding auto-detection.
|
|
247
|
+
|
|
248
|
+
The override is the sole source of truth for that column's type — the
|
|
249
|
+
type detector's verdict is ignored during profiling. Calling this method
|
|
250
|
+
multiple times on the same column is valid; the last call wins.
|
|
251
|
+
|
|
252
|
+
Parameters
|
|
253
|
+
----------
|
|
254
|
+
column : str
|
|
255
|
+
Name of the column to override.
|
|
256
|
+
semantic_type : str | SemanticType
|
|
257
|
+
Target semantic type. Accepts a plain string (e.g. ``"numeric"``,
|
|
258
|
+
``"categorical"``) or a ``SemanticType`` enum value. Invalid strings
|
|
259
|
+
raise ``ValueError``.
|
|
260
|
+
"""
|
|
261
|
+
if isinstance(semantic_type, str):
|
|
262
|
+
try:
|
|
263
|
+
semantic_type = SemanticType(semantic_type)
|
|
264
|
+
except ValueError:
|
|
265
|
+
valid = [e.value for e in SemanticType]
|
|
266
|
+
raise ValueError(
|
|
267
|
+
f"Unknown semantic type {semantic_type!r}. "
|
|
268
|
+
f"Valid values: {valid}"
|
|
269
|
+
)
|
|
270
|
+
self.column_overrides[column] = semantic_type
|
|
271
|
+
|
|
243
272
|
def to_dict(self) -> dict:
|
|
244
273
|
return {
|
|
245
274
|
"modality": str(self.modality),
|
|
@@ -40,6 +40,7 @@ from .config import (
|
|
|
40
40
|
StructuralProfileResult,
|
|
41
41
|
RowMissingnessDistribution,
|
|
42
42
|
SemanticType,
|
|
43
|
+
TypeFlag,
|
|
43
44
|
Modality,
|
|
44
45
|
)
|
|
45
46
|
|
|
@@ -130,7 +131,10 @@ class StructuralProfiler:
|
|
|
130
131
|
# Overrides for excluded / non-existent columns are silently ignored.
|
|
131
132
|
for col_name, override_type in self.config.column_overrides.items():
|
|
132
133
|
if col_name in result.columns:
|
|
133
|
-
result.columns[col_name]
|
|
134
|
+
cp = result.columns[col_name]
|
|
135
|
+
cp.semantic_type = override_type
|
|
136
|
+
if TypeFlag.UserOverride not in cp.type_flags:
|
|
137
|
+
cp.type_flags.append(TypeFlag.UserOverride)
|
|
134
138
|
|
|
135
139
|
# ── 6. Per-column profiling routed by SemanticType ───────────────
|
|
136
140
|
# Batch all columns of the same SemanticType together and call each
|
|
@@ -82,8 +82,6 @@ _EXT_LOADERS: dict[str, callable] = {
|
|
|
82
82
|
|
|
83
83
|
|
|
84
84
|
class DataLoader:
|
|
85
|
-
def __init__(self, fmt: str | None = None) -> None:
|
|
86
|
-
self._fmt_override = fmt.lower() if fmt else None
|
|
87
85
|
|
|
88
86
|
def load(
|
|
89
87
|
self,
|
|
@@ -92,7 +90,7 @@ class DataLoader:
|
|
|
92
90
|
) -> pl.DataFrame:
|
|
93
91
|
raw, ext_from_path = _read_raw(source)
|
|
94
92
|
|
|
95
|
-
resolved_fmt = (
|
|
93
|
+
resolved_fmt = (ext_from_path or "").lower()
|
|
96
94
|
|
|
97
95
|
if resolved_fmt not in _EXT_LOADERS:
|
|
98
96
|
label = resolved_fmt if resolved_fmt else "<unknown>"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dataforge_ml-0.7.0 → dataforge_ml-0.8.0}/src/dataforge_ml/profiling/_correlation_profiler.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|