dataforge-ml 2.0.0__tar.gz → 2.0.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. {dataforge_ml-2.0.0/src/dataforge_ml.egg-info → dataforge_ml-2.0.2}/PKG-INFO +1 -1
  2. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/pyproject.toml +1 -1
  3. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_categorical.py +25 -13
  4. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_categorical_config.py +16 -16
  5. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_config.py +307 -0
  6. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_numeric_profiler.py +17 -1
  7. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_target_profiler.py +6 -5
  8. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2/src/dataforge_ml.egg-info}/PKG-INFO +1 -1
  9. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/LICENSE +0 -0
  10. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/README.md +0 -0
  11. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/setup.cfg +0 -0
  12. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/src/dataforge_ml/__init__.py +0 -0
  13. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/src/dataforge_ml/config.py +0 -0
  14. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/src/dataforge_ml/imputation/__init__.py +0 -0
  15. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/src/dataforge_ml/imputation/_config.py +0 -0
  16. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/src/dataforge_ml/imputation/_fitted_imputer.py +0 -0
  17. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/src/dataforge_ml/imputation/_numeric_imputer.py +0 -0
  18. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/src/dataforge_ml/imputation/_regression_estimator_factory.py +0 -0
  19. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/src/dataforge_ml/imputation/_strategy_router.py +0 -0
  20. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/src/dataforge_ml/imputation/_utils.py +0 -0
  21. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/src/dataforge_ml/imputation/orchestrator.py +0 -0
  22. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/src/dataforge_ml/models/__init__.py +0 -0
  23. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/src/dataforge_ml/models/_data_structure.py +0 -0
  24. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/src/dataforge_ml/models/_data_types.py +0 -0
  25. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/__init__.py +0 -0
  26. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_base.py +0 -0
  27. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_boolean_config.py +0 -0
  28. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_boolean_profiler.py +0 -0
  29. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_correlation_config.py +0 -0
  30. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_correlation_profiler.py +0 -0
  31. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_datetime_config.py +0 -0
  32. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_datetime_profiler.py +0 -0
  33. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_missingness_config.py +0 -0
  34. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_missingness_profiler.py +0 -0
  35. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_nonlinearity_profiler.py +0 -0
  36. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_numeric_config.py +0 -0
  37. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_tabular.py +0 -0
  38. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_target_config.py +0 -0
  39. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_text_config.py +0 -0
  40. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_text_profiler.py +0 -0
  41. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_type_detection_config.py +0 -0
  42. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_type_detector.py +0 -0
  43. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/orchestrator.py +0 -0
  44. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/src/dataforge_ml/splitting/__init__.py +0 -0
  45. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/src/dataforge_ml/splitting/_config.py +0 -0
  46. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/src/dataforge_ml/splitting/_profile_signals.py +0 -0
  47. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/src/dataforge_ml/splitting/_splitter.py +0 -0
  48. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/src/dataforge_ml/utils/__init__.py +0 -0
  49. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/src/dataforge_ml/utils/_null_detection.py +0 -0
  50. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/src/dataforge_ml/utils/_null_normalization.py +0 -0
  51. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/src/dataforge_ml/utils/data_loader.py +0 -0
  52. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/src/dataforge_ml.egg-info/SOURCES.txt +0 -0
  53. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/src/dataforge_ml.egg-info/dependency_links.txt +0 -0
  54. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/src/dataforge_ml.egg-info/requires.txt +0 -0
  55. {dataforge_ml-2.0.0 → dataforge_ml-2.0.2}/src/dataforge_ml.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dataforge-ml
3
- Version: 2.0.0
3
+ Version: 2.0.2
4
4
  Summary: A automated feature engineering and designing pipeline library
5
5
  License: MIT
6
6
  Classifier: License :: OSI Approved :: MIT License
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "dataforge-ml"
7
- version = "2.0.0"
7
+ version = "2.0.2"
8
8
  description = "A automated feature engineering and designing pipeline library"
9
9
  readme = "README.md"
10
10
  requires-python = ">3.10"
@@ -11,9 +11,9 @@ Per-column metrics (opt-in via ProfileConfig.categorical_columns):
11
11
  7. Free-text / natural-language flag
12
12
  (avg word count >5 OR avg char length >50 OR avg token count >10)
13
13
  8. Imbalance metrics
14
- – class ratio (max_freq / min_freq)
15
- – Shannon entropy
16
- – Gini impurity
14
+ dominant class ratio (max_freq / second_max_freq)
15
+ normalized Shannon entropy
16
+ normalized Gini impurity
17
17
 
18
18
  Integration
19
19
  -----------
@@ -221,24 +221,36 @@ class CategoricalProfiler(ColumnBatchProfiler[CategoricalProfileResult]):
221
221
  )
222
222
 
223
223
  # --- Imbalance metrics ---
224
- # Class Ratio -> raw distribution
225
- # Entropy -> randomness / information content
226
- # Gini -> impurity / misclassification risk
224
+ # Dominant Class Ratio -> max_freq / second_max_freq
225
+ # Normalized Entropy -> scaled to [0, 1]
226
+ # Normalized Gini -> scaled to [0, 1]
227
227
  counts = vc["count"].cast(pl.Float64)
228
228
  total = float(counts.sum())
229
- if total > 0:
229
+ card = len(counts)
230
+ if total > 0 and card >= 2:
230
231
  probs = counts / total
231
- max_freq = float(probs.max()) # type: ignore[arg-type]
232
- min_freq = float(probs.min()) # type: ignore[arg-type]
232
+
233
+ max_freq = float(probs[0])
234
+ second_max_freq = float(probs[1])
233
235
 
234
- class_ratio = max_freq / min_freq if min_freq > 0 else float("inf")
236
+ dominant_class_ratio = max_freq / second_max_freq if second_max_freq > 0 else float("inf")
237
+
235
238
  entropy = float(-(probs * probs.log(base=2)).fill_nan(0.0).sum())
239
+ normalized_entropy = entropy / math.log2(card)
240
+
236
241
  gini = float(1.0 - (probs**2).sum())
242
+ normalized_gini = gini / (1.0 - 1.0 / card)
237
243
 
238
244
  profile.imbalance = ImbalanceMetrics(
239
- class_ratio=class_ratio,
240
- shannon_entropy=entropy,
241
- gini_impurity=gini,
245
+ dominant_class_ratio=dominant_class_ratio,
246
+ normalized_shannon_entropy=normalized_entropy,
247
+ normalized_gini=normalized_gini,
248
+ )
249
+ else:
250
+ profile.imbalance = ImbalanceMetrics(
251
+ dominant_class_ratio=None,
252
+ normalized_shannon_entropy=None,
253
+ normalized_gini=None,
242
254
  )
243
255
 
244
256
  return vc
@@ -194,20 +194,20 @@ class ImbalanceMetrics:
194
194
 
195
195
  Attributes
196
196
  ----------
197
- class_ratio : float
198
- Ratio of the least-frequent to most-frequent class, in [0, 1].
199
- A value of 1.0 indicates perfect balance.
200
- shannon_entropy : float
201
- Shannon entropy of the class distribution (nats). Higher values
202
- indicate more uniform distributions.
203
- gini_impurity : float
204
- Gini impurity of the class distribution, in [0, 1]. Zero means
205
- a single class dominates entirely.
197
+ dominant_class_ratio : float, optional
198
+ Ratio of the most-frequent to second-most-frequent class frequency.
199
+ None when cardinality < 2.
200
+ normalized_shannon_entropy : float, optional
201
+ Shannon entropy of the class distribution scaled to [0, 1] by dividing
202
+ by log2(cardinality). None when cardinality < 2.
203
+ normalized_gini : float, optional
204
+ Gini impurity of the class distribution scaled to [0, 1] by dividing
205
+ by (1 - 1/cardinality). None when cardinality < 2.
206
206
  """
207
207
 
208
- class_ratio: float = 0.0
209
- shannon_entropy: float = 0.0
210
- gini_impurity: float = 0.0
208
+ dominant_class_ratio: float | None = None
209
+ normalized_shannon_entropy: float | None = None
210
+ normalized_gini: float | None = None
211
211
 
212
212
  def to_dict(self) -> dict:
213
213
  """
@@ -216,12 +216,12 @@ class ImbalanceMetrics:
216
216
  Returns
217
217
  -------
218
218
  dict
219
- Keys: ``class_ratio``, ``shannon_entropy``, ``gini_impurity``.
219
+ Keys: ``dominant_class_ratio``, ``normalized_shannon_entropy``, ``normalized_gini``.
220
220
  """
221
221
  return {
222
- "class_ratio": self.class_ratio,
223
- "shannon_entropy": self.shannon_entropy,
224
- "gini_impurity": self.gini_impurity,
222
+ "dominant_class_ratio": self.dominant_class_ratio,
223
+ "normalized_shannon_entropy": self.normalized_shannon_entropy,
224
+ "normalized_gini": self.normalized_gini,
225
225
  }
226
226
 
227
227
 
@@ -15,7 +15,9 @@ from typing import Optional, Union
15
15
  from ..config import SemanticType, Modality
16
16
  from ._missingness_config import (
17
17
  ColumnMissingnessProfile,
18
+ MissingnessFlag,
18
19
  MissingnessProfileConfig,
20
+ MissingSeverity,
19
21
  RowMissingnessDistribution,
20
22
  )
21
23
  from ._correlation_config import (
@@ -27,6 +29,8 @@ from ._categorical_config import (
27
29
  CategoricalProfileConfig,
28
30
  )
29
31
  from ._numeric_config import (
32
+ NonlinearityTag,
33
+ NumericFlag,
30
34
  NumericStats,
31
35
  NumericProfileConfig,
32
36
  NonlinearityProfileConfig,
@@ -70,6 +74,121 @@ class TypeFlag(StrEnum):
70
74
  AnyStats = Union[NumericStats, CategoricalStats, DatetimeStats, BooleanStats, TextStats]
71
75
 
72
76
 
77
+ def _format_dict_lines(d: dict, indent: int = 0) -> list[str]:
78
+ out = []
79
+ prefix = " " * indent
80
+ for k, v in d.items():
81
+ if isinstance(v, dict):
82
+ if not v:
83
+ out.append(f"{prefix}- **{k}**: (empty)")
84
+ else:
85
+ out.append(f"{prefix}- **{k}**:")
86
+ out.extend(_format_dict_lines(v, indent + 1))
87
+ elif isinstance(v, list):
88
+ if not v:
89
+ out.append(f"{prefix}- **{k}**: (empty)")
90
+ else:
91
+ out.append(f"{prefix}- **{k}**:")
92
+ for item in v:
93
+ out.append(f"{prefix} - {item}")
94
+ else:
95
+ out.append(f"{prefix}- **{k}**: {v}")
96
+ return out
97
+
98
+
99
+ def _top_n_abs_correlations(
100
+ matrix: dict[str, dict[str, float]], col_name: str, n: int = 5
101
+ ) -> list[tuple[str, float]]:
102
+ row = matrix.get(col_name, {})
103
+ pairs = [(other, value) for other, value in row.items() if other != col_name]
104
+ pairs.sort(key=lambda item: abs(item[1]), reverse=True)
105
+ return pairs[:n]
106
+
107
+
108
+ def _is_clean_column(col: "ColumnProfile") -> bool:
109
+ missingness = col.missingness
110
+ if missingness is not None and missingness.flags:
111
+ return False
112
+ severity = missingness.severity if missingness is not None else None
113
+ if severity not in (None, MissingSeverity.Minor):
114
+ return False
115
+ if isinstance(col.stats, NumericStats):
116
+ if col.stats.flags:
117
+ return False
118
+ if col.stats.nonlinearity_tag not in (None, NonlinearityTag.Linear):
119
+ return False
120
+ return True
121
+
122
+
123
+ def _compact_column_detail(
124
+ col: "ColumnProfile",
125
+ feature_correlation: Optional[CorrelationProfileResult],
126
+ ) -> dict:
127
+ """
128
+ Build the per-column dict used in the Flagged Columns detail section.
129
+
130
+ Applies the compact-view field rules (ADR-0040) on top of
131
+ ``ColumnProfile.to_dict()``: drops ``total_rows`` from the missingness
132
+ subsection and ``histogram`` from the stats subsection, and caps
133
+ ``top_values`` (present on both ``NumericStats`` and ``CategoricalStats``)
134
+ to 3 entries. All other fields — including redundant scalar pairs,
135
+ percentiles, bimodal stats, and ``correlated_with`` — pass through
136
+ unchanged. When ``feature_correlation`` carries a row for this column,
137
+ a ``correlations`` entry is added with the top-5 highest absolute
138
+ Pearson and top-5 highest absolute Spearman correlations (descending
139
+ by absolute value) in place of the full N×N matrices.
140
+
141
+ Parameters
142
+ ----------
143
+ col : ColumnProfile
144
+ The column profile to render.
145
+ feature_correlation : CorrelationProfileResult or None
146
+ Dataset-level feature-feature correlation result, if computed.
147
+
148
+ Returns
149
+ -------
150
+ dict
151
+ Trimmed dictionary suitable for ``_format_dict_lines``.
152
+ """
153
+ data = col.to_dict()
154
+ missingness = data.get("missingness")
155
+ if missingness is not None:
156
+ missingness.pop("total_rows", None)
157
+ stats = data.get("stats")
158
+ if stats is not None:
159
+ stats.pop("histogram", None)
160
+ if "top_values" in stats:
161
+ stats["top_values"] = stats["top_values"][:3]
162
+
163
+ if feature_correlation is not None:
164
+ top_pearson = _top_n_abs_correlations(feature_correlation.pearson_matrix, col.name)
165
+ top_spearman = _top_n_abs_correlations(feature_correlation.spearman_matrix, col.name)
166
+ if top_pearson or top_spearman:
167
+ data["correlations"] = {
168
+ "top_pearson": [f"{name}: {value}" for name, value in top_pearson],
169
+ "top_spearman": [f"{name}: {value}" for name, value in top_spearman],
170
+ }
171
+ return data
172
+
173
+
174
+ def _flagged_column_tier(col: "ColumnProfile") -> int:
175
+ missingness = col.missingness
176
+ flags = missingness.flags if missingness is not None else []
177
+ severity = missingness.severity if missingness is not None else None
178
+
179
+ if MissingnessFlag.DropCandidate in flags or MissingnessFlag.FullyNull in flags:
180
+ return 0
181
+ if severity == MissingSeverity.Severe:
182
+ return 1
183
+ if severity == MissingSeverity.High:
184
+ return 2
185
+ if severity == MissingSeverity.Moderate:
186
+ return 3
187
+ if flags:
188
+ return 4
189
+ return 5
190
+
191
+
73
192
  @dataclass
74
193
  class ColumnProfile:
75
194
  """
@@ -236,6 +355,194 @@ class StructuralProfileResult:
236
355
  """
237
356
  return json.dumps(self.to_dict(), indent=indent, default=str)
238
357
 
358
+ def to_markdown(self) -> str:
359
+ """
360
+ Produce a compact, human-oriented Markdown view of the profiling result.
361
+
362
+ The document contains a Dataset Overview section (scalar
363
+ dataset-level fields only — ``memory_breakdown`` and
364
+ ``missingness_matrix`` are omitted), a Column Summary table with one
365
+ row per column, and a Flagged Columns section with a full detail
366
+ subsection for every column that exceeds the clean threshold (see
367
+ ``_is_clean_column``), ordered severity-first then alphabetically
368
+ (see ``_flagged_column_tier``). Within each flagged column's detail
369
+ section, ``histogram`` bins and the missingness ``total_rows`` field
370
+ are dropped, ``top_values`` is capped to 3 entries, and the full
371
+ Pearson/Spearman correlation matrices are replaced by the top-5
372
+ highest absolute correlations for that column (see
373
+ ``_compact_column_detail``); all other fields — including redundant
374
+ scalar pairs, percentiles, bimodal stats, and ``correlated_with`` —
375
+ are kept in full. A Target Analysis section follows with the top-5
376
+ absolute Pearson and Spearman correlations per feature column for
377
+ each declared target, and a Sentinels section renders
378
+ ``numeric_sentinels`` / ``string_sentinels`` unchanged. Use
379
+ ``to_full_markdown()`` for the complete lossless serialization.
380
+
381
+ Returns
382
+ -------
383
+ str
384
+ Markdown string containing the Dataset Overview, Column Summary,
385
+ Flagged Columns, Target Analysis, and Sentinels sections.
386
+ """
387
+ lines = ["# Structural Profile Report (Compact)\n"]
388
+
389
+ ds = self.dataset
390
+ lines.append("## Dataset Overview\n")
391
+ lines.append(f"- **modality**: {ds.modality}")
392
+ lines.append(f"- **row_count**: {ds.row_count}")
393
+ lines.append(f"- **column_count**: {ds.column_count}")
394
+ lines.append(f"- **memory_bytes**: {ds.memory_bytes}")
395
+ lines.append(f"- **duplicate_count**: {ds.duplicate_count}")
396
+ lines.append(f"- **duplicate_ratio**: {ds.duplicate_ratio}")
397
+ lines.append(f"- **overall_sparsity**: {ds.overall_sparsity}")
398
+ lines.append(f"- **was_chunked**: {ds.was_chunked}")
399
+ lines.append("- **row_distribution**:")
400
+ for key, value in ds.row_distribution.to_dict().items():
401
+ lines.append(f" - **{key}**: {value}")
402
+ lines.append("")
403
+
404
+ lines.append("## Column Summary\n")
405
+ lines.append(
406
+ "| Column | Semantic Type | Missing % | Severity | "
407
+ "Missingness Flags | Numeric Flags |"
408
+ )
409
+ lines.append("|---|---|---|---|---|---|")
410
+ for col_name, col in self.columns.items():
411
+ sem_type = col.semantic_type if col.semantic_type else "None"
412
+ missingness = col.missingness
413
+ if missingness is not None:
414
+ missing_str = f"{missingness.effective_null_ratio * 100:.2f}%"
415
+ severity = missingness.severity if missingness.severity else "None"
416
+ missingness_flags = (
417
+ ", ".join(str(f) for f in missingness.flags)
418
+ if missingness.flags
419
+ else "None"
420
+ )
421
+ else:
422
+ missing_str = "0.00%"
423
+ severity = "None"
424
+ missingness_flags = "None"
425
+ numeric_flags = "None"
426
+ if isinstance(col.stats, NumericStats) and col.stats.flags:
427
+ numeric_flags = ", ".join(str(f) for f in col.stats.flags)
428
+ lines.append(
429
+ f"| `{col_name}` | {sem_type} | {missing_str} | {severity} | "
430
+ f"{missingness_flags} | {numeric_flags} |"
431
+ )
432
+ lines.append("")
433
+
434
+ lines.append("## Flagged Columns\n")
435
+ flagged_columns = [
436
+ col for col in self.columns.values() if not _is_clean_column(col)
437
+ ]
438
+ flagged_columns.sort(key=lambda col: (_flagged_column_tier(col), col.name))
439
+ for col in flagged_columns:
440
+ lines.append(f"### `{col.name}`\n")
441
+ lines.extend(
442
+ _format_dict_lines(
443
+ _compact_column_detail(col, self.dataset.feature_correlation)
444
+ )
445
+ )
446
+ lines.append("")
447
+
448
+ if self.dataset.target_correlations:
449
+ lines.append("## Target Analysis\n")
450
+ for target_name, corr in self.dataset.target_correlations.items():
451
+ lines.append(f"### Target: `{target_name}`\n")
452
+ feature_cols = [
453
+ c for c in corr.analysed_numeric_columns if c != target_name
454
+ ]
455
+ for feat in feature_cols:
456
+ top_pearson = _top_n_abs_correlations(corr.pearson_matrix, feat)
457
+ top_spearman = _top_n_abs_correlations(corr.spearman_matrix, feat)
458
+ lines.append(f"#### `{feat}`\n")
459
+ lines.extend(
460
+ _format_dict_lines(
461
+ {
462
+ "top_pearson": [
463
+ f"{name}: {value}" for name, value in top_pearson
464
+ ],
465
+ "top_spearman": [
466
+ f"{name}: {value}" for name, value in top_spearman
467
+ ],
468
+ }
469
+ )
470
+ )
471
+ lines.append("")
472
+
473
+ lines.append("## Sentinels\n")
474
+ lines.extend(
475
+ _format_dict_lines(
476
+ {
477
+ "numeric_sentinels": dict(self.numeric_sentinels),
478
+ "string_sentinels": {
479
+ k: list(v) for k, v in self.string_sentinels.items()
480
+ },
481
+ }
482
+ )
483
+ )
484
+ lines.append("")
485
+
486
+ return "\n".join(lines).strip() + "\n"
487
+
488
+ def to_full_markdown(self) -> str:
489
+ """
490
+ Produce a complete, lossless Markdown serialization for debugging and archival use.
491
+
492
+ Every field present in ``to_dict()`` — including histogram bins, full
493
+ correlation matrices, memory breakdown, and all per-column fields —
494
+ is rendered as Markdown. For an 82-column dataset this produces
495
+ roughly 1 MB of text; for human inspection of large datasets prefer
496
+ ``to_markdown()``, which provides the compact view (ADR-0040).
497
+
498
+ Returns
499
+ -------
500
+ str
501
+ Markdown string containing a summary table followed by per-column detail
502
+ sections and dataset-level statistics.
503
+ """
504
+ data = self.to_dict()
505
+ lines = ["# Structural Profile Report\n"]
506
+
507
+ # 1. Summary navigation table
508
+ lines.append("## Summary\n")
509
+ lines.append("| Column | Semantic Type | Missing % | Severity | Key Flags |")
510
+ lines.append("|---|---|---|---|---|")
511
+
512
+ for col_name, col_data in data.get("columns", {}).items():
513
+ sem_type = col_data.get("semantic_type") or "None"
514
+ missingness = col_data.get("missingness") or {}
515
+ missing_pct = missingness.get("effective_null_ratio", 0.0) * 100
516
+ missing_str = f"{missing_pct:.2f}%"
517
+ severity = missingness.get("severity") or "None"
518
+ flags = ", ".join(col_data.get("type_flags", [])) or "None"
519
+ lines.append(f"| {col_name} | {sem_type} | {missing_str} | {severity} | {flags} |")
520
+
521
+ lines.append("\n## Column Details\n")
522
+
523
+ for col_name, col_data in data.get("columns", {}).items():
524
+ lines.append(f"### `{col_name}`\n")
525
+ lines.extend(_format_dict_lines(col_data))
526
+ lines.append("")
527
+
528
+ lines.append("## Dataset\n")
529
+ lines.extend(_format_dict_lines(data.get("dataset", {})))
530
+ lines.append("")
531
+
532
+ lines.append("## Targets\n")
533
+ lines.extend(_format_dict_lines(data.get("targets", {})))
534
+ lines.append("")
535
+
536
+ lines.append("## Numeric Sentinels\n")
537
+ lines.extend(_format_dict_lines(data.get("numeric_sentinels", {})))
538
+ lines.append("")
539
+
540
+ lines.append("## String Sentinels\n")
541
+ lines.extend(_format_dict_lines(data.get("string_sentinels", {})))
542
+ lines.append("")
543
+
544
+ return "\n".join(lines).strip() + "\n"
545
+
239
546
 
240
547
  # ---------------------------------------------------------------------------
241
548
  # ProfileConfig — clean break from per-profiler column lists
@@ -74,6 +74,22 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
74
74
  data: pl.DataFrame,
75
75
  columns: list[str],
76
76
  ) -> NumericProfileResult:
77
+ """
78
+ Profile the specified numeric columns in a DataFrame.
79
+
80
+ Parameters
81
+ ----------
82
+ data : pl.DataFrame
83
+ The input Polars DataFrame containing the columns to profile.
84
+ columns : list[str]
85
+ A list of column names to profile. Non-numeric columns in this list
86
+ are skipped.
87
+
88
+ Returns
89
+ -------
90
+ NumericProfileResult
91
+ A result object containing distribution statistics for the profiled columns.
92
+ """
77
93
  return self._run(data, columns)
78
94
 
79
95
  # ------------------------------------------------------------------
@@ -128,7 +144,7 @@ class NumericProfiler(ColumnBatchProfiler[NumericProfileResult]):
128
144
  profile.mean = mean
129
145
  profile.median = median
130
146
  if median == 0.0:
131
- profile.mean_median_ratio = float("inf") if mean != 0.0 else 1.0
147
+ profile.mean_median_ratio = None if mean != 0.0 else 1.0
132
148
  else:
133
149
  profile.mean_median_ratio = mean / median
134
150
 
@@ -137,11 +137,12 @@ class TargetProfiler(DatasetLevelProfiler[TargetProfileResult]):
137
137
  result.categorical_profile = cat_profile
138
138
 
139
139
  # Flag Imbalances
140
- ratio = cat_profile.imbalance.class_ratio
141
- if ratio > 20.0:
142
- result.flags.append(TargetFlag.SevereImbalance)
143
- elif ratio > 5.0:
144
- result.flags.append(TargetFlag.HighImbalance)
140
+ ratio = cat_profile.imbalance.dominant_class_ratio
141
+ if ratio is not None:
142
+ if ratio > 20.0:
143
+ result.flags.append(TargetFlag.SevereImbalance)
144
+ elif ratio > 5.0:
145
+ result.flags.append(TargetFlag.HighImbalance)
145
146
 
146
147
  def _profile_regression(
147
148
  self, series: pl.Series, n_rows: int, result: TargetProfileResult
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dataforge-ml
3
- Version: 2.0.0
3
+ Version: 2.0.2
4
4
  Summary: A automated feature engineering and designing pipeline library
5
5
  License: MIT
6
6
  Classifier: License :: OSI Approved :: MIT License
File without changes
File without changes
File without changes