dataforge-ml 2.0.0__tar.gz → 2.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dataforge_ml-2.0.0/src/dataforge_ml.egg-info → dataforge_ml-2.0.1}/PKG-INFO +1 -1
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/pyproject.toml +1 -1
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_categorical.py +25 -13
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_categorical_config.py +16 -16
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_config.py +73 -0
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_numeric_profiler.py +17 -1
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_target_profiler.py +6 -5
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1/src/dataforge_ml.egg-info}/PKG-INFO +1 -1
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/LICENSE +0 -0
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/README.md +0 -0
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/setup.cfg +0 -0
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/__init__.py +0 -0
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/config.py +0 -0
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/imputation/__init__.py +0 -0
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/imputation/_config.py +0 -0
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/imputation/_fitted_imputer.py +0 -0
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/imputation/_numeric_imputer.py +0 -0
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/imputation/_regression_estimator_factory.py +0 -0
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/imputation/_strategy_router.py +0 -0
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/imputation/_utils.py +0 -0
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/imputation/orchestrator.py +0 -0
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/models/__init__.py +0 -0
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/models/_data_structure.py +0 -0
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/models/_data_types.py +0 -0
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/__init__.py +0 -0
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_base.py +0 -0
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_boolean_config.py +0 -0
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_boolean_profiler.py +0 -0
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_correlation_config.py +0 -0
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_correlation_profiler.py +0 -0
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_datetime_config.py +0 -0
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_datetime_profiler.py +0 -0
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_missingness_config.py +0 -0
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_missingness_profiler.py +0 -0
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_nonlinearity_profiler.py +0 -0
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_numeric_config.py +0 -0
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_tabular.py +0 -0
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_target_config.py +0 -0
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_text_config.py +0 -0
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_text_profiler.py +0 -0
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_type_detection_config.py +0 -0
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_type_detector.py +0 -0
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/orchestrator.py +0 -0
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/splitting/__init__.py +0 -0
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/splitting/_config.py +0 -0
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/splitting/_profile_signals.py +0 -0
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/splitting/_splitter.py +0 -0
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/utils/__init__.py +0 -0
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/utils/_null_detection.py +0 -0
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/utils/_null_normalization.py +0 -0
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/utils/data_loader.py +0 -0
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml.egg-info/SOURCES.txt +0 -0
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml.egg-info/dependency_links.txt +0 -0
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml.egg-info/requires.txt +0 -0
- {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml.egg-info/top_level.txt +0 -0
|
@@ -11,9 +11,9 @@ Per-column metrics (opt-in via ProfileConfig.categorical_columns):
|
|
|
11
11
|
7. Free-text / natural-language flag
|
|
12
12
|
(avg word count >5 OR avg char length >50 OR avg token count >10)
|
|
13
13
|
8. Imbalance metrics
|
|
14
|
-
– class ratio
|
|
15
|
-
– Shannon entropy
|
|
16
|
-
– Gini impurity
|
|
14
|
+
– dominant class ratio (max_freq / second_max_freq)
|
|
15
|
+
– normalized Shannon entropy
|
|
16
|
+
– normalized Gini impurity
|
|
17
17
|
|
|
18
18
|
Integration
|
|
19
19
|
-----------
|
|
@@ -221,24 +221,36 @@ class CategoricalProfiler(ColumnBatchProfiler[CategoricalProfileResult]):
|
|
|
221
221
|
)
|
|
222
222
|
|
|
223
223
|
# --- Imbalance metrics ---
|
|
224
|
-
# Class Ratio ->
|
|
225
|
-
# Entropy ->
|
|
226
|
-
# Gini ->
|
|
224
|
+
# Dominant Class Ratio -> max_freq / second_max_freq
|
|
225
|
+
# Normalized Entropy -> scaled to [0, 1]
|
|
226
|
+
# Normalized Gini -> scaled to [0, 1]
|
|
227
227
|
counts = vc["count"].cast(pl.Float64)
|
|
228
228
|
total = float(counts.sum())
|
|
229
|
-
|
|
229
|
+
card = len(counts)
|
|
230
|
+
if total > 0 and card >= 2:
|
|
230
231
|
probs = counts / total
|
|
231
|
-
|
|
232
|
-
|
|
232
|
+
|
|
233
|
+
max_freq = float(probs[0])
|
|
234
|
+
second_max_freq = float(probs[1])
|
|
233
235
|
|
|
234
|
-
|
|
236
|
+
dominant_class_ratio = max_freq / second_max_freq if second_max_freq > 0 else float("inf")
|
|
237
|
+
|
|
235
238
|
entropy = float(-(probs * probs.log(base=2)).fill_nan(0.0).sum())
|
|
239
|
+
normalized_entropy = entropy / math.log2(card)
|
|
240
|
+
|
|
236
241
|
gini = float(1.0 - (probs**2).sum())
|
|
242
|
+
normalized_gini = gini / (1.0 - 1.0 / card)
|
|
237
243
|
|
|
238
244
|
profile.imbalance = ImbalanceMetrics(
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
245
|
+
dominant_class_ratio=dominant_class_ratio,
|
|
246
|
+
normalized_shannon_entropy=normalized_entropy,
|
|
247
|
+
normalized_gini=normalized_gini,
|
|
248
|
+
)
|
|
249
|
+
else:
|
|
250
|
+
profile.imbalance = ImbalanceMetrics(
|
|
251
|
+
dominant_class_ratio=None,
|
|
252
|
+
normalized_shannon_entropy=None,
|
|
253
|
+
normalized_gini=None,
|
|
242
254
|
)
|
|
243
255
|
|
|
244
256
|
return vc
|
|
@@ -194,20 +194,20 @@ class ImbalanceMetrics:
|
|
|
194
194
|
|
|
195
195
|
Attributes
|
|
196
196
|
----------
|
|
197
|
-
|
|
198
|
-
Ratio of the
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
Shannon entropy of the class distribution
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
Gini impurity of the class distribution
|
|
205
|
-
|
|
197
|
+
dominant_class_ratio : float, optional
|
|
198
|
+
Ratio of the most-frequent to second-most-frequent class frequency.
|
|
199
|
+
None when cardinality < 2.
|
|
200
|
+
normalized_shannon_entropy : float, optional
|
|
201
|
+
Shannon entropy of the class distribution scaled to [0, 1] by dividing
|
|
202
|
+
by log2(cardinality). None when cardinality < 2.
|
|
203
|
+
normalized_gini : float, optional
|
|
204
|
+
Gini impurity of the class distribution scaled to [0, 1] by dividing
|
|
205
|
+
by (1 - 1/cardinality). None when cardinality < 2.
|
|
206
206
|
"""
|
|
207
207
|
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
208
|
+
dominant_class_ratio: float | None = None
|
|
209
|
+
normalized_shannon_entropy: float | None = None
|
|
210
|
+
normalized_gini: float | None = None
|
|
211
211
|
|
|
212
212
|
def to_dict(self) -> dict:
|
|
213
213
|
"""
|
|
@@ -216,12 +216,12 @@ class ImbalanceMetrics:
|
|
|
216
216
|
Returns
|
|
217
217
|
-------
|
|
218
218
|
dict
|
|
219
|
-
Keys: ``
|
|
219
|
+
Keys: ``dominant_class_ratio``, ``normalized_shannon_entropy``, ``normalized_gini``.
|
|
220
220
|
"""
|
|
221
221
|
return {
|
|
222
|
-
"
|
|
223
|
-
"
|
|
224
|
-
"
|
|
222
|
+
"dominant_class_ratio": self.dominant_class_ratio,
|
|
223
|
+
"normalized_shannon_entropy": self.normalized_shannon_entropy,
|
|
224
|
+
"normalized_gini": self.normalized_gini,
|
|
225
225
|
}
|
|
226
226
|
|
|
227
227
|
|
|
@@ -236,6 +236,79 @@ class StructuralProfileResult:
|
|
|
236
236
|
"""
|
|
237
237
|
return json.dumps(self.to_dict(), indent=indent, default=str)
|
|
238
238
|
|
|
239
|
+
def to_markdown(self) -> str:
|
|
240
|
+
"""
|
|
241
|
+
Produce a complete, lossless Markdown representation of the profiling result.
|
|
242
|
+
|
|
243
|
+
Returns
|
|
244
|
+
-------
|
|
245
|
+
str
|
|
246
|
+
Markdown string containing a summary table followed by per-column detail
|
|
247
|
+
sections and dataset-level statistics.
|
|
248
|
+
"""
|
|
249
|
+
data = self.to_dict()
|
|
250
|
+
lines = ["# Structural Profile Report\n"]
|
|
251
|
+
|
|
252
|
+
# 1. Summary navigation table
|
|
253
|
+
lines.append("## Summary\n")
|
|
254
|
+
lines.append("| Column | Semantic Type | Missing % | Severity | Key Flags |")
|
|
255
|
+
lines.append("|---|---|---|---|---|")
|
|
256
|
+
|
|
257
|
+
for col_name, col_data in data.get("columns", {}).items():
|
|
258
|
+
sem_type = col_data.get("semantic_type") or "None"
|
|
259
|
+
missingness = col_data.get("missingness") or {}
|
|
260
|
+
missing_pct = missingness.get("effective_null_ratio", 0.0) * 100
|
|
261
|
+
missing_str = f"{missing_pct:.2f}%"
|
|
262
|
+
severity = missingness.get("severity") or "None"
|
|
263
|
+
flags = ", ".join(col_data.get("type_flags", [])) or "None"
|
|
264
|
+
lines.append(f"| {col_name} | {sem_type} | {missing_str} | {severity} | {flags} |")
|
|
265
|
+
|
|
266
|
+
lines.append("\n## Column Details\n")
|
|
267
|
+
|
|
268
|
+
def _format_dict(d: dict, indent: int = 0) -> list[str]:
|
|
269
|
+
out = []
|
|
270
|
+
prefix = " " * indent
|
|
271
|
+
for k, v in d.items():
|
|
272
|
+
if isinstance(v, dict):
|
|
273
|
+
if not v:
|
|
274
|
+
out.append(f"{prefix}- **{k}**: (empty)")
|
|
275
|
+
else:
|
|
276
|
+
out.append(f"{prefix}- **{k}**:")
|
|
277
|
+
out.extend(_format_dict(v, indent + 1))
|
|
278
|
+
elif isinstance(v, list):
|
|
279
|
+
if not v:
|
|
280
|
+
out.append(f"{prefix}- **{k}**: (empty)")
|
|
281
|
+
else:
|
|
282
|
+
out.append(f"{prefix}- **{k}**:")
|
|
283
|
+
for item in v:
|
|
284
|
+
out.append(f"{prefix} - {item}")
|
|
285
|
+
else:
|
|
286
|
+
out.append(f"{prefix}- **{k}**: {v}")
|
|
287
|
+
return out
|
|
288
|
+
|
|
289
|
+
for col_name, col_data in data.get("columns", {}).items():
|
|
290
|
+
lines.append(f"### `{col_name}`\n")
|
|
291
|
+
lines.extend(_format_dict(col_data))
|
|
292
|
+
lines.append("")
|
|
293
|
+
|
|
294
|
+
lines.append("## Dataset\n")
|
|
295
|
+
lines.extend(_format_dict(data.get("dataset", {})))
|
|
296
|
+
lines.append("")
|
|
297
|
+
|
|
298
|
+
lines.append("## Targets\n")
|
|
299
|
+
lines.extend(_format_dict(data.get("targets", {})))
|
|
300
|
+
lines.append("")
|
|
301
|
+
|
|
302
|
+
lines.append("## Numeric Sentinels\n")
|
|
303
|
+
lines.extend(_format_dict(data.get("numeric_sentinels", {})))
|
|
304
|
+
lines.append("")
|
|
305
|
+
|
|
306
|
+
lines.append("## String Sentinels\n")
|
|
307
|
+
lines.extend(_format_dict(data.get("string_sentinels", {})))
|
|
308
|
+
lines.append("")
|
|
309
|
+
|
|
310
|
+
return "\n".join(lines).strip() + "\n"
|
|
311
|
+
|
|
239
312
|
|
|
240
313
|
# ---------------------------------------------------------------------------
|
|
241
314
|
# ProfileConfig — clean break from per-profiler column lists
|
|
@@ -74,6 +74,22 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
|
|
|
74
74
|
data: pl.DataFrame,
|
|
75
75
|
columns: list[str],
|
|
76
76
|
) -> NumericProfileResult:
|
|
77
|
+
"""
|
|
78
|
+
Profile the specified numeric columns in a DataFrame.
|
|
79
|
+
|
|
80
|
+
Parameters
|
|
81
|
+
----------
|
|
82
|
+
data : pl.DataFrame
|
|
83
|
+
The input Polars DataFrame containing the columns to profile.
|
|
84
|
+
columns : list[str]
|
|
85
|
+
A list of column names to profile. Non-numeric columns in this list
|
|
86
|
+
are skipped.
|
|
87
|
+
|
|
88
|
+
Returns
|
|
89
|
+
-------
|
|
90
|
+
NumericProfileResult
|
|
91
|
+
A result object containing distribution statistics for the profiled columns.
|
|
92
|
+
"""
|
|
77
93
|
return self._run(data, columns)
|
|
78
94
|
|
|
79
95
|
# ------------------------------------------------------------------
|
|
@@ -128,7 +144,7 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
|
|
|
128
144
|
profile.mean = mean
|
|
129
145
|
profile.median = median
|
|
130
146
|
if median == 0.0:
|
|
131
|
-
profile.mean_median_ratio =
|
|
147
|
+
profile.mean_median_ratio = None if mean != 0.0 else 1.0
|
|
132
148
|
else:
|
|
133
149
|
profile.mean_median_ratio = mean / median
|
|
134
150
|
|
|
@@ -137,11 +137,12 @@ class TargetProfiler(DatasetLevelProfiler[TargetProfileResult]):
|
|
|
137
137
|
result.categorical_profile = cat_profile
|
|
138
138
|
|
|
139
139
|
# Flag Imbalances
|
|
140
|
-
ratio = cat_profile.imbalance.
|
|
141
|
-
if ratio
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
140
|
+
ratio = cat_profile.imbalance.dominant_class_ratio
|
|
141
|
+
if ratio is not None:
|
|
142
|
+
if ratio > 20.0:
|
|
143
|
+
result.flags.append(TargetFlag.SevereImbalance)
|
|
144
|
+
elif ratio > 5.0:
|
|
145
|
+
result.flags.append(TargetFlag.HighImbalance)
|
|
145
146
|
|
|
146
147
|
def _profile_regression(
|
|
147
148
|
self, series: pl.Series, n_rows: int, result: TargetProfileResult
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_correlation_profiler.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_missingness_profiler.py
RENAMED
|
File without changes
|
{dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_nonlinearity_profiler.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_type_detection_config.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|