dataforge-ml 2.0.1__tar.gz → 2.0.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. {dataforge_ml-2.0.1/src/dataforge_ml.egg-info → dataforge_ml-2.0.2}/PKG-INFO +1 -1
  2. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/pyproject.toml +1 -1
  3. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_config.py +262 -28
  4. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2/src/dataforge_ml.egg-info}/PKG-INFO +1 -1
  5. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/LICENSE +0 -0
  6. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/README.md +0 -0
  7. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/setup.cfg +0 -0
  8. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/__init__.py +0 -0
  9. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/config.py +0 -0
  10. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/imputation/__init__.py +0 -0
  11. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/imputation/_config.py +0 -0
  12. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/imputation/_fitted_imputer.py +0 -0
  13. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/imputation/_numeric_imputer.py +0 -0
  14. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/imputation/_regression_estimator_factory.py +0 -0
  15. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/imputation/_strategy_router.py +0 -0
  16. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/imputation/_utils.py +0 -0
  17. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/imputation/orchestrator.py +0 -0
  18. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/models/__init__.py +0 -0
  19. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/models/_data_structure.py +0 -0
  20. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/models/_data_types.py +0 -0
  21. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/__init__.py +0 -0
  22. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_base.py +0 -0
  23. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_boolean_config.py +0 -0
  24. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_boolean_profiler.py +0 -0
  25. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_categorical.py +0 -0
  26. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_categorical_config.py +0 -0
  27. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_correlation_config.py +0 -0
  28. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_correlation_profiler.py +0 -0
  29. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_datetime_config.py +0 -0
  30. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_datetime_profiler.py +0 -0
  31. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_missingness_config.py +0 -0
  32. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_missingness_profiler.py +0 -0
  33. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_nonlinearity_profiler.py +0 -0
  34. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_numeric_config.py +0 -0
  35. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_numeric_profiler.py +0 -0
  36. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_tabular.py +0 -0
  37. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_target_config.py +0 -0
  38. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_target_profiler.py +0 -0
  39. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_text_config.py +0 -0
  40. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_text_profiler.py +0 -0
  41. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_type_detection_config.py +0 -0
  42. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_type_detector.py +0 -0
  43. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/orchestrator.py +0 -0
  44. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/splitting/__init__.py +0 -0
  45. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/splitting/_config.py +0 -0
  46. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/splitting/_profile_signals.py +0 -0
  47. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/splitting/_splitter.py +0 -0
  48. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/utils/__init__.py +0 -0
  49. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/utils/_null_detection.py +0 -0
  50. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/utils/_null_normalization.py +0 -0
  51. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/utils/data_loader.py +0 -0
  52. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml.egg-info/SOURCES.txt +0 -0
  53. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml.egg-info/dependency_links.txt +0 -0
  54. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml.egg-info/requires.txt +0 -0
  55. {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dataforge-ml
3
- Version: 2.0.1
3
+ Version: 2.0.2
4
4
  Summary: A automated feature engineering and designing pipeline library
5
5
  License: MIT
6
6
  Classifier: License :: OSI Approved :: MIT License
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "dataforge-ml"
7
- version = "2.0.1"
7
+ version = "2.0.2"
8
8
  description = "A automated feature engineering and designing pipeline library"
9
9
  readme = "README.md"
10
10
  requires-python = ">3.10"
@@ -15,7 +15,9 @@ from typing import Optional, Union
15
15
  from ..config import SemanticType, Modality
16
16
  from ._missingness_config import (
17
17
  ColumnMissingnessProfile,
18
+ MissingnessFlag,
18
19
  MissingnessProfileConfig,
20
+ MissingSeverity,
19
21
  RowMissingnessDistribution,
20
22
  )
21
23
  from ._correlation_config import (
@@ -27,6 +29,8 @@ from ._categorical_config import (
27
29
  CategoricalProfileConfig,
28
30
  )
29
31
  from ._numeric_config import (
32
+ NonlinearityTag,
33
+ NumericFlag,
30
34
  NumericStats,
31
35
  NumericProfileConfig,
32
36
  NonlinearityProfileConfig,
@@ -70,6 +74,121 @@ class TypeFlag(StrEnum):
70
74
  AnyStats = Union[NumericStats, CategoricalStats, DatetimeStats, BooleanStats, TextStats]
71
75
 
72
76
 
77
+ def _format_dict_lines(d: dict, indent: int = 0) -> list[str]:
78
+ out = []
79
+ prefix = " " * indent
80
+ for k, v in d.items():
81
+ if isinstance(v, dict):
82
+ if not v:
83
+ out.append(f"{prefix}- **{k}**: (empty)")
84
+ else:
85
+ out.append(f"{prefix}- **{k}**:")
86
+ out.extend(_format_dict_lines(v, indent + 1))
87
+ elif isinstance(v, list):
88
+ if not v:
89
+ out.append(f"{prefix}- **{k}**: (empty)")
90
+ else:
91
+ out.append(f"{prefix}- **{k}**:")
92
+ for item in v:
93
+ out.append(f"{prefix} - {item}")
94
+ else:
95
+ out.append(f"{prefix}- **{k}**: {v}")
96
+ return out
97
+
98
+
99
+ def _top_n_abs_correlations(
100
+ matrix: dict[str, dict[str, float]], col_name: str, n: int = 5
101
+ ) -> list[tuple[str, float]]:
102
+ row = matrix.get(col_name, {})
103
+ pairs = [(other, value) for other, value in row.items() if other != col_name]
104
+ pairs.sort(key=lambda item: abs(item[1]), reverse=True)
105
+ return pairs[:n]
106
+
107
+
108
def _is_clean_column(col: "ColumnProfile") -> bool:
    """
    Decide whether a column needs no entry in the flagged-columns section.

    A column is clean when its missingness profile (if any) carries no
    flags and a severity of ``None`` or ``MissingSeverity.Minor``, and —
    for columns whose stats are ``NumericStats`` — the stats carry no flags
    and a nonlinearity tag of ``None`` or ``NonlinearityTag.Linear``.

    Parameters
    ----------
    col : ColumnProfile
        The column profile to inspect.

    Returns
    -------
    bool
        ``True`` if none of the checks above trips, ``False`` otherwise.
    """
    profile = col.missingness
    if profile is not None:
        if profile.flags:
            return False
        if profile.severity not in (None, MissingSeverity.Minor):
            return False

    stats = col.stats
    if not isinstance(stats, NumericStats):
        # Only numeric stats are inspected for flags / nonlinearity here.
        return True
    if stats.flags:
        return False
    return stats.nonlinearity_tag in (None, NonlinearityTag.Linear)
121
+
122
+
123
+ def _compact_column_detail(
124
+ col: "ColumnProfile",
125
+ feature_correlation: Optional[CorrelationProfileResult],
126
+ ) -> dict:
127
+ """
128
+ Build the per-column dict used in the Flagged Columns detail section.
129
+
130
+ Applies the compact-view field rules (ADR-0040) on top of
131
+ ``ColumnProfile.to_dict()``: drops ``total_rows`` from the missingness
132
+ subsection and ``histogram`` from the stats subsection, and caps
133
+ ``top_values`` (present on both ``NumericStats`` and ``CategoricalStats``)
134
+ to 3 entries. All other fields — including redundant scalar pairs,
135
+ percentiles, bimodal stats, and ``correlated_with`` — pass through
136
+ unchanged. When ``feature_correlation`` carries a row for this column,
137
+ a ``correlations`` entry is added with the top-5 highest absolute
138
+ Pearson and top-5 highest absolute Spearman correlations (descending
139
+ by absolute value) in place of the full N×N matrices.
140
+
141
+ Parameters
142
+ ----------
143
+ col : ColumnProfile
144
+ The column profile to render.
145
+ feature_correlation : CorrelationProfileResult or None
146
+ Dataset-level feature-feature correlation result, if computed.
147
+
148
+ Returns
149
+ -------
150
+ dict
151
+ Trimmed dictionary suitable for ``_format_dict_lines``.
152
+ """
153
+ data = col.to_dict()
154
+ missingness = data.get("missingness")
155
+ if missingness is not None:
156
+ missingness.pop("total_rows", None)
157
+ stats = data.get("stats")
158
+ if stats is not None:
159
+ stats.pop("histogram", None)
160
+ if "top_values" in stats:
161
+ stats["top_values"] = stats["top_values"][:3]
162
+
163
+ if feature_correlation is not None:
164
+ top_pearson = _top_n_abs_correlations(feature_correlation.pearson_matrix, col.name)
165
+ top_spearman = _top_n_abs_correlations(feature_correlation.spearman_matrix, col.name)
166
+ if top_pearson or top_spearman:
167
+ data["correlations"] = {
168
+ "top_pearson": [f"{name}: {value}" for name, value in top_pearson],
169
+ "top_spearman": [f"{name}: {value}" for name, value in top_spearman],
170
+ }
171
+ return data
172
+
173
+
174
def _flagged_column_tier(col: "ColumnProfile") -> int:
    """
    Rank a column for ordering within the flagged-columns section.

    Lower tiers sort first:

    * 0 — flagged ``DropCandidate`` or ``FullyNull``
    * 1 — missingness severity ``Severe``
    * 2 — missingness severity ``High``
    * 3 — missingness severity ``Moderate``
    * 4 — any other missingness flag present
    * 5 — everything else

    Parameters
    ----------
    col : ColumnProfile
        The column profile to rank.

    Returns
    -------
    int
        Tier in the range 0–5.
    """
    profile = col.missingness
    if profile is None:
        flags, severity = [], None
    else:
        flags, severity = profile.flags, profile.severity

    if MissingnessFlag.DropCandidate in flags or MissingnessFlag.FullyNull in flags:
        return 0

    # Checked in order, most severe first.
    severity_tiers = (
        (MissingSeverity.Severe, 1),
        (MissingSeverity.High, 2),
        (MissingSeverity.Moderate, 3),
    )
    for level, tier in severity_tiers:
        if severity == level:
            return tier

    return 4 if flags else 5
190
+
191
+
73
192
  @dataclass
74
193
  class ColumnProfile:
75
194
  """
@@ -238,7 +357,143 @@ class StructuralProfileResult:
238
357
 
239
358
  def to_markdown(self) -> str:
240
359
  """
241
- Produce a complete, lossless Markdown representation of the profiling result.
360
+ Produce a compact, human-oriented Markdown view of the profiling result.
361
+
362
+ The document contains a Dataset Overview section (scalar
363
+ dataset-level fields only — ``memory_breakdown`` and
364
+ ``missingness_matrix`` are omitted), a Column Summary table with one
365
+ row per column, and a Flagged Columns section with a full detail
366
+ subsection for every column that exceeds the clean threshold (see
367
+ ``_is_clean_column``), ordered severity-first then alphabetically
368
+ (see ``_flagged_column_tier``). Within each flagged column's detail
369
+ section, ``histogram`` bins and the missingness ``total_rows`` field
370
+ are dropped, ``top_values`` is capped to 3 entries, and the full
371
+ Pearson/Spearman correlation matrices are replaced by the top-5
372
+ highest absolute correlations for that column (see
373
+ ``_compact_column_detail``); all other fields — including redundant
374
+ scalar pairs, percentiles, bimodal stats, and ``correlated_with`` —
375
+ are kept in full. A Target Analysis section follows with the top-5
376
+ absolute Pearson and Spearman correlations per feature column for
377
+ each declared target, and a Sentinels section renders
378
+ ``numeric_sentinels`` / ``string_sentinels`` unchanged. Use
379
+ ``to_full_markdown()`` for the complete lossless serialization.
380
+
381
+ Returns
382
+ -------
383
+ str
384
+ Markdown string containing the Dataset Overview, Column Summary,
385
+ Flagged Columns, Target Analysis, and Sentinels sections.
386
+ """
387
+ lines = ["# Structural Profile Report (Compact)\n"]
388
+
389
+ ds = self.dataset
390
+ lines.append("## Dataset Overview\n")
391
+ lines.append(f"- **modality**: {ds.modality}")
392
+ lines.append(f"- **row_count**: {ds.row_count}")
393
+ lines.append(f"- **column_count**: {ds.column_count}")
394
+ lines.append(f"- **memory_bytes**: {ds.memory_bytes}")
395
+ lines.append(f"- **duplicate_count**: {ds.duplicate_count}")
396
+ lines.append(f"- **duplicate_ratio**: {ds.duplicate_ratio}")
397
+ lines.append(f"- **overall_sparsity**: {ds.overall_sparsity}")
398
+ lines.append(f"- **was_chunked**: {ds.was_chunked}")
399
+ lines.append("- **row_distribution**:")
400
+ for key, value in ds.row_distribution.to_dict().items():
401
+ lines.append(f" - **{key}**: {value}")
402
+ lines.append("")
403
+
404
+ lines.append("## Column Summary\n")
405
+ lines.append(
406
+ "| Column | Semantic Type | Missing % | Severity | "
407
+ "Missingness Flags | Numeric Flags |"
408
+ )
409
+ lines.append("|---|---|---|---|---|---|")
410
+ for col_name, col in self.columns.items():
411
+ sem_type = col.semantic_type if col.semantic_type else "None"
412
+ missingness = col.missingness
413
+ if missingness is not None:
414
+ missing_str = f"{missingness.effective_null_ratio * 100:.2f}%"
415
+ severity = missingness.severity if missingness.severity else "None"
416
+ missingness_flags = (
417
+ ", ".join(str(f) for f in missingness.flags)
418
+ if missingness.flags
419
+ else "None"
420
+ )
421
+ else:
422
+ missing_str = "0.00%"
423
+ severity = "None"
424
+ missingness_flags = "None"
425
+ numeric_flags = "None"
426
+ if isinstance(col.stats, NumericStats) and col.stats.flags:
427
+ numeric_flags = ", ".join(str(f) for f in col.stats.flags)
428
+ lines.append(
429
+ f"| `{col_name}` | {sem_type} | {missing_str} | {severity} | "
430
+ f"{missingness_flags} | {numeric_flags} |"
431
+ )
432
+ lines.append("")
433
+
434
+ lines.append("## Flagged Columns\n")
435
+ flagged_columns = [
436
+ col for col in self.columns.values() if not _is_clean_column(col)
437
+ ]
438
+ flagged_columns.sort(key=lambda col: (_flagged_column_tier(col), col.name))
439
+ for col in flagged_columns:
440
+ lines.append(f"### `{col.name}`\n")
441
+ lines.extend(
442
+ _format_dict_lines(
443
+ _compact_column_detail(col, self.dataset.feature_correlation)
444
+ )
445
+ )
446
+ lines.append("")
447
+
448
+ if self.dataset.target_correlations:
449
+ lines.append("## Target Analysis\n")
450
+ for target_name, corr in self.dataset.target_correlations.items():
451
+ lines.append(f"### Target: `{target_name}`\n")
452
+ feature_cols = [
453
+ c for c in corr.analysed_numeric_columns if c != target_name
454
+ ]
455
+ for feat in feature_cols:
456
+ top_pearson = _top_n_abs_correlations(corr.pearson_matrix, feat)
457
+ top_spearman = _top_n_abs_correlations(corr.spearman_matrix, feat)
458
+ lines.append(f"#### `{feat}`\n")
459
+ lines.extend(
460
+ _format_dict_lines(
461
+ {
462
+ "top_pearson": [
463
+ f"{name}: {value}" for name, value in top_pearson
464
+ ],
465
+ "top_spearman": [
466
+ f"{name}: {value}" for name, value in top_spearman
467
+ ],
468
+ }
469
+ )
470
+ )
471
+ lines.append("")
472
+
473
+ lines.append("## Sentinels\n")
474
+ lines.extend(
475
+ _format_dict_lines(
476
+ {
477
+ "numeric_sentinels": dict(self.numeric_sentinels),
478
+ "string_sentinels": {
479
+ k: list(v) for k, v in self.string_sentinels.items()
480
+ },
481
+ }
482
+ )
483
+ )
484
+ lines.append("")
485
+
486
+ return "\n".join(lines).strip() + "\n"
487
+
488
+ def to_full_markdown(self) -> str:
489
+ """
490
+ Produce a complete, lossless Markdown serialization for debugging and archival use.
491
+
492
+ Every field present in ``to_dict()`` — including histogram bins, full
493
+ correlation matrices, memory breakdown, and all per-column fields —
494
+ is rendered as Markdown. For an 82-column dataset this produces
495
+ roughly 1 MB of text; for human inspection of large datasets prefer
496
+ ``to_markdown()``, which provides the compact view (ADR-0040).
242
497
 
243
498
  Returns
244
499
  -------
@@ -264,47 +519,26 @@ class StructuralProfileResult:
264
519
  lines.append(f"| {col_name} | {sem_type} | {missing_str} | {severity} | {flags} |")
265
520
 
266
521
  lines.append("\n## Column Details\n")
267
-
268
- def _format_dict(d: dict, indent: int = 0) -> list[str]:
269
- out = []
270
- prefix = " " * indent
271
- for k, v in d.items():
272
- if isinstance(v, dict):
273
- if not v:
274
- out.append(f"{prefix}- **{k}**: (empty)")
275
- else:
276
- out.append(f"{prefix}- **{k}**:")
277
- out.extend(_format_dict(v, indent + 1))
278
- elif isinstance(v, list):
279
- if not v:
280
- out.append(f"{prefix}- **{k}**: (empty)")
281
- else:
282
- out.append(f"{prefix}- **{k}**:")
283
- for item in v:
284
- out.append(f"{prefix} - {item}")
285
- else:
286
- out.append(f"{prefix}- **{k}**: {v}")
287
- return out
288
522
 
289
523
  for col_name, col_data in data.get("columns", {}).items():
290
524
  lines.append(f"### `{col_name}`\n")
291
- lines.extend(_format_dict(col_data))
525
+ lines.extend(_format_dict_lines(col_data))
292
526
  lines.append("")
293
527
 
294
528
  lines.append("## Dataset\n")
295
- lines.extend(_format_dict(data.get("dataset", {})))
529
+ lines.extend(_format_dict_lines(data.get("dataset", {})))
296
530
  lines.append("")
297
-
531
+
298
532
  lines.append("## Targets\n")
299
- lines.extend(_format_dict(data.get("targets", {})))
533
+ lines.extend(_format_dict_lines(data.get("targets", {})))
300
534
  lines.append("")
301
535
 
302
536
  lines.append("## Numeric Sentinels\n")
303
- lines.extend(_format_dict(data.get("numeric_sentinels", {})))
537
+ lines.extend(_format_dict_lines(data.get("numeric_sentinels", {})))
304
538
  lines.append("")
305
539
 
306
540
  lines.append("## String Sentinels\n")
307
- lines.extend(_format_dict(data.get("string_sentinels", {})))
541
+ lines.extend(_format_dict_lines(data.get("string_sentinels", {})))
308
542
  lines.append("")
309
543
 
310
544
  return "\n".join(lines).strip() + "\n"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dataforge-ml
3
- Version: 2.0.1
3
+ Version: 2.0.2
4
4
  Summary: A automated feature engineering and designing pipeline library
5
5
  License: MIT
6
6
  Classifier: License :: OSI Approved :: MIT License
File without changes
File without changes
File without changes