dataforge-ml 2.0.0__tar.gz → 2.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. {dataforge_ml-2.0.0/src/dataforge_ml.egg-info → dataforge_ml-2.0.1}/PKG-INFO +1 -1
  2. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/pyproject.toml +1 -1
  3. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_categorical.py +25 -13
  4. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_categorical_config.py +16 -16
  5. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_config.py +73 -0
  6. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_numeric_profiler.py +17 -1
  7. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_target_profiler.py +6 -5
  8. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1/src/dataforge_ml.egg-info}/PKG-INFO +1 -1
  9. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/LICENSE +0 -0
  10. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/README.md +0 -0
  11. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/setup.cfg +0 -0
  12. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/__init__.py +0 -0
  13. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/config.py +0 -0
  14. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/imputation/__init__.py +0 -0
  15. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/imputation/_config.py +0 -0
  16. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/imputation/_fitted_imputer.py +0 -0
  17. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/imputation/_numeric_imputer.py +0 -0
  18. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/imputation/_regression_estimator_factory.py +0 -0
  19. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/imputation/_strategy_router.py +0 -0
  20. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/imputation/_utils.py +0 -0
  21. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/imputation/orchestrator.py +0 -0
  22. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/models/__init__.py +0 -0
  23. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/models/_data_structure.py +0 -0
  24. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/models/_data_types.py +0 -0
  25. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/__init__.py +0 -0
  26. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_base.py +0 -0
  27. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_boolean_config.py +0 -0
  28. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_boolean_profiler.py +0 -0
  29. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_correlation_config.py +0 -0
  30. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_correlation_profiler.py +0 -0
  31. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_datetime_config.py +0 -0
  32. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_datetime_profiler.py +0 -0
  33. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_missingness_config.py +0 -0
  34. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_missingness_profiler.py +0 -0
  35. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_nonlinearity_profiler.py +0 -0
  36. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_numeric_config.py +0 -0
  37. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_tabular.py +0 -0
  38. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_target_config.py +0 -0
  39. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_text_config.py +0 -0
  40. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_text_profiler.py +0 -0
  41. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_type_detection_config.py +0 -0
  42. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/_type_detector.py +0 -0
  43. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/profiling/orchestrator.py +0 -0
  44. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/splitting/__init__.py +0 -0
  45. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/splitting/_config.py +0 -0
  46. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/splitting/_profile_signals.py +0 -0
  47. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/splitting/_splitter.py +0 -0
  48. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/utils/__init__.py +0 -0
  49. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/utils/_null_detection.py +0 -0
  50. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/utils/_null_normalization.py +0 -0
  51. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml/utils/data_loader.py +0 -0
  52. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml.egg-info/SOURCES.txt +0 -0
  53. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml.egg-info/dependency_links.txt +0 -0
  54. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml.egg-info/requires.txt +0 -0
  55. {dataforge_ml-2.0.0 → dataforge_ml-2.0.1}/src/dataforge_ml.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dataforge-ml
3
- Version: 2.0.0
3
+ Version: 2.0.1
4
4
  Summary: An automated feature engineering and designing pipeline library
5
5
  License: MIT
6
6
  Classifier: License :: OSI Approved :: MIT License
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "dataforge-ml"
7
- version = "2.0.0"
7
+ version = "2.0.1"
8
8
  description = "An automated feature engineering and designing pipeline library"
9
9
  readme = "README.md"
10
10
  requires-python = ">3.10"
@@ -11,9 +11,9 @@ Per-column metrics (opt-in via ProfileConfig.categorical_columns):
11
11
  7. Free-text / natural-language flag
12
12
  (avg word count >5 OR avg char length >50 OR avg token count >10)
13
13
  8. Imbalance metrics
14
- – class ratio (max_freq / min_freq)
15
- – Shannon entropy
16
- – Gini impurity
14
+ dominant class ratio (max_freq / second_max_freq)
15
+ normalized Shannon entropy
16
+ normalized Gini impurity
17
17
 
18
18
  Integration
19
19
  -----------
@@ -221,24 +221,36 @@ class CategoricalProfiler(ColumnBatchProfiler[CategoricalProfileResult]):
221
221
  )
222
222
 
223
223
  # --- Imbalance metrics ---
224
- # Class Ratio -> raw distribution
225
- # Entropy -> randomness / information content
226
- # Gini -> impurity / misclassification risk
224
+ # Dominant Class Ratio -> max_freq / second_max_freq
225
+ # Normalized Entropy -> scaled to [0, 1]
226
+ # Normalized Gini -> scaled to [0, 1]
227
227
  counts = vc["count"].cast(pl.Float64)
228
228
  total = float(counts.sum())
229
- if total > 0:
229
+ card = len(counts)
230
+ if total > 0 and card >= 2:
230
231
  probs = counts / total
231
- max_freq = float(probs.max()) # type: ignore[arg-type]
232
- min_freq = float(probs.min()) # type: ignore[arg-type]
232
+
233
+ max_freq = float(probs[0])
234
+ second_max_freq = float(probs[1])
233
235
 
234
- class_ratio = max_freq / min_freq if min_freq > 0 else float("inf")
236
+ dominant_class_ratio = max_freq / second_max_freq if second_max_freq > 0 else float("inf")
237
+
235
238
  entropy = float(-(probs * probs.log(base=2)).fill_nan(0.0).sum())
239
+ normalized_entropy = entropy / math.log2(card)
240
+
236
241
  gini = float(1.0 - (probs**2).sum())
242
+ normalized_gini = gini / (1.0 - 1.0 / card)
237
243
 
238
244
  profile.imbalance = ImbalanceMetrics(
239
- class_ratio=class_ratio,
240
- shannon_entropy=entropy,
241
- gini_impurity=gini,
245
+ dominant_class_ratio=dominant_class_ratio,
246
+ normalized_shannon_entropy=normalized_entropy,
247
+ normalized_gini=normalized_gini,
248
+ )
249
+ else:
250
+ profile.imbalance = ImbalanceMetrics(
251
+ dominant_class_ratio=None,
252
+ normalized_shannon_entropy=None,
253
+ normalized_gini=None,
242
254
  )
243
255
 
244
256
  return vc
@@ -194,20 +194,20 @@ class ImbalanceMetrics:
194
194
 
195
195
  Attributes
196
196
  ----------
197
- class_ratio : float
198
- Ratio of the least-frequent to most-frequent class, in [0, 1].
199
- A value of 1.0 indicates perfect balance.
200
- shannon_entropy : float
201
- Shannon entropy of the class distribution (nats). Higher values
202
- indicate more uniform distributions.
203
- gini_impurity : float
204
- Gini impurity of the class distribution, in [0, 1]. Zero means
205
- a single class dominates entirely.
197
+ dominant_class_ratio : float, optional
198
+ Ratio of the most-frequent to second-most-frequent class frequency.
199
+ None when cardinality < 2.
200
+ normalized_shannon_entropy : float, optional
201
+ Shannon entropy of the class distribution scaled to [0, 1] by dividing
202
+ by log2(cardinality). None when cardinality < 2.
203
+ normalized_gini : float, optional
204
+ Gini impurity of the class distribution scaled to [0, 1] by dividing
205
+ by (1 - 1/cardinality). None when cardinality < 2.
206
206
  """
207
207
 
208
- class_ratio: float = 0.0
209
- shannon_entropy: float = 0.0
210
- gini_impurity: float = 0.0
208
+ dominant_class_ratio: float | None = None
209
+ normalized_shannon_entropy: float | None = None
210
+ normalized_gini: float | None = None
211
211
 
212
212
  def to_dict(self) -> dict:
213
213
  """
@@ -216,12 +216,12 @@ class ImbalanceMetrics:
216
216
  Returns
217
217
  -------
218
218
  dict
219
- Keys: ``class_ratio``, ``shannon_entropy``, ``gini_impurity``.
219
+ Keys: ``dominant_class_ratio``, ``normalized_shannon_entropy``, ``normalized_gini``.
220
220
  """
221
221
  return {
222
- "class_ratio": self.class_ratio,
223
- "shannon_entropy": self.shannon_entropy,
224
- "gini_impurity": self.gini_impurity,
222
+ "dominant_class_ratio": self.dominant_class_ratio,
223
+ "normalized_shannon_entropy": self.normalized_shannon_entropy,
224
+ "normalized_gini": self.normalized_gini,
225
225
  }
226
226
 
227
227
 
@@ -236,6 +236,79 @@ class StructuralProfileResult:
236
236
  """
237
237
  return json.dumps(self.to_dict(), indent=indent, default=str)
238
238
 
239
+ def to_markdown(self) -> str:
240
+ """
241
+ Produce a complete, lossless Markdown representation of the profiling result.
242
+
243
+ Returns
244
+ -------
245
+ str
246
+ Markdown string containing a summary table followed by per-column detail
247
+ sections and dataset-level statistics.
248
+ """
249
+ data = self.to_dict()
250
+ lines = ["# Structural Profile Report\n"]
251
+
252
+ # 1. Summary navigation table
253
+ lines.append("## Summary\n")
254
+ lines.append("| Column | Semantic Type | Missing % | Severity | Key Flags |")
255
+ lines.append("|---|---|---|---|---|")
256
+
257
+ for col_name, col_data in data.get("columns", {}).items():
258
+ sem_type = col_data.get("semantic_type") or "None"
259
+ missingness = col_data.get("missingness") or {}
260
+ missing_pct = missingness.get("effective_null_ratio", 0.0) * 100
261
+ missing_str = f"{missing_pct:.2f}%"
262
+ severity = missingness.get("severity") or "None"
263
+ flags = ", ".join(col_data.get("type_flags", [])) or "None"
264
+ lines.append(f"| {col_name} | {sem_type} | {missing_str} | {severity} | {flags} |")
265
+
266
+ lines.append("\n## Column Details\n")
267
+
268
+ def _format_dict(d: dict, indent: int = 0) -> list[str]:
269
+ out = []
270
+ prefix = " " * indent
271
+ for k, v in d.items():
272
+ if isinstance(v, dict):
273
+ if not v:
274
+ out.append(f"{prefix}- **{k}**: (empty)")
275
+ else:
276
+ out.append(f"{prefix}- **{k}**:")
277
+ out.extend(_format_dict(v, indent + 1))
278
+ elif isinstance(v, list):
279
+ if not v:
280
+ out.append(f"{prefix}- **{k}**: (empty)")
281
+ else:
282
+ out.append(f"{prefix}- **{k}**:")
283
+ for item in v:
284
+ out.append(f"{prefix} - {item}")
285
+ else:
286
+ out.append(f"{prefix}- **{k}**: {v}")
287
+ return out
288
+
289
+ for col_name, col_data in data.get("columns", {}).items():
290
+ lines.append(f"### `{col_name}`\n")
291
+ lines.extend(_format_dict(col_data))
292
+ lines.append("")
293
+
294
+ lines.append("## Dataset\n")
295
+ lines.extend(_format_dict(data.get("dataset", {})))
296
+ lines.append("")
297
+
298
+ lines.append("## Targets\n")
299
+ lines.extend(_format_dict(data.get("targets", {})))
300
+ lines.append("")
301
+
302
+ lines.append("## Numeric Sentinels\n")
303
+ lines.extend(_format_dict(data.get("numeric_sentinels", {})))
304
+ lines.append("")
305
+
306
+ lines.append("## String Sentinels\n")
307
+ lines.extend(_format_dict(data.get("string_sentinels", {})))
308
+ lines.append("")
309
+
310
+ return "\n".join(lines).strip() + "\n"
311
+
239
312
 
240
313
  # ---------------------------------------------------------------------------
241
314
  # ProfileConfig — clean break from per-profiler column lists
@@ -74,6 +74,22 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
74
74
  data: pl.DataFrame,
75
75
  columns: list[str],
76
76
  ) -> NumericProfileResult:
77
+ """
78
+ Profile the specified numeric columns in a DataFrame.
79
+
80
+ Parameters
81
+ ----------
82
+ data : pl.DataFrame
83
+ The input Polars DataFrame containing the columns to profile.
84
+ columns : list[str]
85
+ A list of column names to profile. Non-numeric columns in this list
86
+ are skipped.
87
+
88
+ Returns
89
+ -------
90
+ NumericProfileResult
91
+ A result object containing distribution statistics for the profiled columns.
92
+ """
77
93
  return self._run(data, columns)
78
94
 
79
95
  # ------------------------------------------------------------------
@@ -128,7 +144,7 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
128
144
  profile.mean = mean
129
145
  profile.median = median
130
146
  if median == 0.0:
131
- profile.mean_median_ratio = float("inf") if mean != 0.0 else 1.0
147
+ profile.mean_median_ratio = None if mean != 0.0 else 1.0
132
148
  else:
133
149
  profile.mean_median_ratio = mean / median
134
150
 
@@ -137,11 +137,12 @@ class TargetProfiler(DatasetLevelProfiler[TargetProfileResult]):
137
137
  result.categorical_profile = cat_profile
138
138
 
139
139
  # Flag Imbalances
140
- ratio = cat_profile.imbalance.class_ratio
141
- if ratio > 20.0:
142
- result.flags.append(TargetFlag.SevereImbalance)
143
- elif ratio > 5.0:
144
- result.flags.append(TargetFlag.HighImbalance)
140
+ ratio = cat_profile.imbalance.dominant_class_ratio
141
+ if ratio is not None:
142
+ if ratio > 20.0:
143
+ result.flags.append(TargetFlag.SevereImbalance)
144
+ elif ratio > 5.0:
145
+ result.flags.append(TargetFlag.HighImbalance)
145
146
 
146
147
  def _profile_regression(
147
148
  self, series: pl.Series, n_rows: int, result: TargetProfileResult
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dataforge-ml
3
- Version: 2.0.0
3
+ Version: 2.0.1
4
4
  Summary: An automated feature engineering and designing pipeline library
5
5
  License: MIT
6
6
  Classifier: License :: OSI Approved :: MIT License
File without changes
File without changes
File without changes