dataforge-ml 2.0.1__tar.gz → 2.0.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dataforge_ml-2.0.1/src/dataforge_ml.egg-info → dataforge_ml-2.0.2}/PKG-INFO +1 -1
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/pyproject.toml +1 -1
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_config.py +262 -28
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2/src/dataforge_ml.egg-info}/PKG-INFO +1 -1
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/LICENSE +0 -0
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/README.md +0 -0
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/setup.cfg +0 -0
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/__init__.py +0 -0
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/config.py +0 -0
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/imputation/__init__.py +0 -0
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/imputation/_config.py +0 -0
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/imputation/_fitted_imputer.py +0 -0
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/imputation/_numeric_imputer.py +0 -0
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/imputation/_regression_estimator_factory.py +0 -0
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/imputation/_strategy_router.py +0 -0
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/imputation/_utils.py +0 -0
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/imputation/orchestrator.py +0 -0
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/models/__init__.py +0 -0
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/models/_data_structure.py +0 -0
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/models/_data_types.py +0 -0
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/__init__.py +0 -0
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_base.py +0 -0
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_boolean_config.py +0 -0
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_boolean_profiler.py +0 -0
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_categorical.py +0 -0
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_categorical_config.py +0 -0
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_correlation_config.py +0 -0
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_correlation_profiler.py +0 -0
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_datetime_config.py +0 -0
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_datetime_profiler.py +0 -0
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_missingness_config.py +0 -0
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_missingness_profiler.py +0 -0
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_nonlinearity_profiler.py +0 -0
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_numeric_config.py +0 -0
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_numeric_profiler.py +0 -0
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_tabular.py +0 -0
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_target_config.py +0 -0
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_target_profiler.py +0 -0
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_text_config.py +0 -0
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_text_profiler.py +0 -0
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_type_detection_config.py +0 -0
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_type_detector.py +0 -0
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/orchestrator.py +0 -0
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/splitting/__init__.py +0 -0
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/splitting/_config.py +0 -0
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/splitting/_profile_signals.py +0 -0
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/splitting/_splitter.py +0 -0
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/utils/__init__.py +0 -0
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/utils/_null_detection.py +0 -0
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/utils/_null_normalization.py +0 -0
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/utils/data_loader.py +0 -0
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml.egg-info/SOURCES.txt +0 -0
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml.egg-info/dependency_links.txt +0 -0
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml.egg-info/requires.txt +0 -0
- {dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml.egg-info/top_level.txt +0 -0
|
@@ -15,7 +15,9 @@ from typing import Optional, Union
|
|
|
15
15
|
from ..config import SemanticType, Modality
|
|
16
16
|
from ._missingness_config import (
|
|
17
17
|
ColumnMissingnessProfile,
|
|
18
|
+
MissingnessFlag,
|
|
18
19
|
MissingnessProfileConfig,
|
|
20
|
+
MissingSeverity,
|
|
19
21
|
RowMissingnessDistribution,
|
|
20
22
|
)
|
|
21
23
|
from ._correlation_config import (
|
|
@@ -27,6 +29,8 @@ from ._categorical_config import (
|
|
|
27
29
|
CategoricalProfileConfig,
|
|
28
30
|
)
|
|
29
31
|
from ._numeric_config import (
|
|
32
|
+
NonlinearityTag,
|
|
33
|
+
NumericFlag,
|
|
30
34
|
NumericStats,
|
|
31
35
|
NumericProfileConfig,
|
|
32
36
|
NonlinearityProfileConfig,
|
|
@@ -70,6 +74,121 @@ class TypeFlag(StrEnum):
|
|
|
70
74
|
AnyStats = Union[NumericStats, CategoricalStats, DatetimeStats, BooleanStats, TextStats]
|
|
71
75
|
|
|
72
76
|
|
|
77
|
+
def _format_dict_lines(d: dict, indent: int = 0) -> list[str]:
|
|
78
|
+
out = []
|
|
79
|
+
prefix = " " * indent
|
|
80
|
+
for k, v in d.items():
|
|
81
|
+
if isinstance(v, dict):
|
|
82
|
+
if not v:
|
|
83
|
+
out.append(f"{prefix}- **{k}**: (empty)")
|
|
84
|
+
else:
|
|
85
|
+
out.append(f"{prefix}- **{k}**:")
|
|
86
|
+
out.extend(_format_dict_lines(v, indent + 1))
|
|
87
|
+
elif isinstance(v, list):
|
|
88
|
+
if not v:
|
|
89
|
+
out.append(f"{prefix}- **{k}**: (empty)")
|
|
90
|
+
else:
|
|
91
|
+
out.append(f"{prefix}- **{k}**:")
|
|
92
|
+
for item in v:
|
|
93
|
+
out.append(f"{prefix} - {item}")
|
|
94
|
+
else:
|
|
95
|
+
out.append(f"{prefix}- **{k}**: {v}")
|
|
96
|
+
return out
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _top_n_abs_correlations(
|
|
100
|
+
matrix: dict[str, dict[str, float]], col_name: str, n: int = 5
|
|
101
|
+
) -> list[tuple[str, float]]:
|
|
102
|
+
row = matrix.get(col_name, {})
|
|
103
|
+
pairs = [(other, value) for other, value in row.items() if other != col_name]
|
|
104
|
+
pairs.sort(key=lambda item: abs(item[1]), reverse=True)
|
|
105
|
+
return pairs[:n]
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _is_clean_column(col: "ColumnProfile") -> bool:
    """
    Decide whether a column is clean, i.e. needs no Flagged Columns entry.

    A column is clean when all of the following hold:

    * its missingness profile (if any) carries no flags and its severity is
      either unset or ``MissingSeverity.Minor``;
    * if its stats are ``NumericStats``, they carry no flags and the
      nonlinearity tag is either unset or ``NonlinearityTag.Linear``.

    Parameters
    ----------
    col : ColumnProfile
        The column profile to inspect.

    Returns
    -------
    bool
        ``True`` when the column is clean, ``False`` otherwise.
    """
    profile = col.missingness
    if profile is not None:
        if profile.flags:
            return False
        if profile.severity not in (None, MissingSeverity.Minor):
            return False

    stats = col.stats
    if not isinstance(stats, NumericStats):
        # Only numeric stats contribute further checks.
        return True
    if stats.flags:
        return False
    return stats.nonlinearity_tag in (None, NonlinearityTag.Linear)
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def _compact_column_detail(
    col: "ColumnProfile",
    feature_correlation: Optional[CorrelationProfileResult],
) -> dict:
    """
    Assemble the trimmed per-column dict shown under Flagged Columns.

    Starts from ``ColumnProfile.to_dict()`` and applies the compact-view
    field rules (ADR-0040):

    * ``total_rows`` is removed from the ``missingness`` subsection;
    * ``histogram`` is removed from the ``stats`` subsection;
    * ``top_values`` in the ``stats`` subsection, when present, is cut to
      its first 3 entries.

    Every other field is passed through untouched. If
    ``feature_correlation`` is given and yields at least one entry for this
    column, a ``correlations`` key is added holding the top-5 Pearson and
    top-5 Spearman correlations by absolute value, each formatted as
    ``"<column>: <value>"``, instead of the full matrices.

    Parameters
    ----------
    col : ColumnProfile
        The column profile to render.
    feature_correlation : CorrelationProfileResult or None
        Dataset-level feature-feature correlation result, if computed.

    Returns
    -------
    dict
        Trimmed dictionary suitable for ``_format_dict_lines``.
    """
    detail = col.to_dict()

    missing_section = detail.get("missingness")
    if missing_section is not None:
        missing_section.pop("total_rows", None)

    stats_section = detail.get("stats")
    if stats_section is not None:
        stats_section.pop("histogram", None)
        if "top_values" in stats_section:
            stats_section["top_values"] = stats_section["top_values"][:3]

    if feature_correlation is None:
        return detail

    ranked = {
        "top_pearson": _top_n_abs_correlations(
            feature_correlation.pearson_matrix, col.name
        ),
        "top_spearman": _top_n_abs_correlations(
            feature_correlation.spearman_matrix, col.name
        ),
    }
    # Only attach the section when at least one ranking is non-empty.
    if any(ranked.values()):
        detail["correlations"] = {
            label: [f"{name}: {value}" for name, value in pairs]
            for label, pairs in ranked.items()
        }
    return detail
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def _flagged_column_tier(col: "ColumnProfile") -> int:
    """
    Map a column to its sort tier for the Flagged Columns section.

    Lower tiers sort first. The first matching rule wins:

    * 0 — flagged ``DropCandidate`` or ``FullyNull``;
    * 1 — severity ``Severe``;
    * 2 — severity ``High``;
    * 3 — severity ``Moderate``;
    * 4 — any other missingness flag is present;
    * 5 — none of the above.

    A column without a missingness profile is treated as having no flags
    and no severity.

    Parameters
    ----------
    col : ColumnProfile
        The column profile to rank.

    Returns
    -------
    int
        Tier in the range 0-5.
    """
    profile = col.missingness
    if profile is None:
        active_flags, level = [], None
    else:
        active_flags, level = profile.flags, profile.severity

    critical = (MissingnessFlag.DropCandidate, MissingnessFlag.FullyNull)
    if any(flag in active_flags for flag in critical):
        return 0

    severity_tiers = (
        (MissingSeverity.Severe, 1),
        (MissingSeverity.High, 2),
        (MissingSeverity.Moderate, 3),
    )
    for candidate, tier in severity_tiers:
        if level == candidate:
            return tier

    return 4 if active_flags else 5
|
|
190
|
+
|
|
191
|
+
|
|
73
192
|
@dataclass
|
|
74
193
|
class ColumnProfile:
|
|
75
194
|
"""
|
|
@@ -238,7 +357,143 @@ class StructuralProfileResult:
|
|
|
238
357
|
|
|
239
358
|
def to_markdown(self) -> str:
    """
    Produce a compact, human-oriented Markdown view of the profiling result.

    The document is made of these sections, in order:

    * **Dataset Overview** — scalar dataset-level fields plus the row
      distribution (``memory_breakdown`` and ``missingness_matrix`` are
      not rendered).
    * **Column Summary** — a table with one row per column. Pipe
      characters in column names are escaped so they cannot break the
      table layout.
    * **Flagged Columns** — a detail subsection for every column that is
      not clean (see ``_is_clean_column``), ordered by tier and then by
      name (see ``_flagged_column_tier``). Each subsection is the trimmed
      dict built by ``_compact_column_detail``.
    * **Target Analysis** — only when target correlations exist: for each
      target, the top-5 absolute Pearson and Spearman correlations of
      every analysed numeric column other than the target itself.
    * **Sentinels** — ``numeric_sentinels`` and ``string_sentinels``.

    Use ``to_full_markdown()`` for the complete serialization.

    Returns
    -------
    str
        Markdown string containing the sections above, ending with a
        single trailing newline.
    """
    ds = self.dataset
    lines = ["# Structural Profile Report (Compact)\n", "## Dataset Overview\n"]
    for field_name in (
        "modality",
        "row_count",
        "column_count",
        "memory_bytes",
        "duplicate_count",
        "duplicate_ratio",
        "overall_sparsity",
        "was_chunked",
    ):
        lines.append(f"- **{field_name}**: {getattr(ds, field_name)}")
    lines.append("- **row_distribution**:")
    for key, value in ds.row_distribution.to_dict().items():
        # Two-space indent so the entry nests under the bullet above.
        lines.append(f"  - **{key}**: {value}")
    lines.append("")

    lines.append("## Column Summary\n")
    lines.append(
        "| Column | Semantic Type | Missing % | Severity | "
        "Missingness Flags | Numeric Flags |"
    )
    lines.append("|---|---|---|---|---|---|")
    for col_name, col in self.columns.items():
        sem_type = col.semantic_type if col.semantic_type else "None"
        missingness = col.missingness
        if missingness is not None:
            missing_str = f"{missingness.effective_null_ratio * 100:.2f}%"
            severity = missingness.severity if missingness.severity else "None"
            missingness_flags = (
                ", ".join(str(f) for f in missingness.flags)
                if missingness.flags
                else "None"
            )
        else:
            missing_str = "0.00%"
            severity = "None"
            missingness_flags = "None"
        numeric_flags = "None"
        if isinstance(col.stats, NumericStats) and col.stats.flags:
            numeric_flags = ", ".join(str(f) for f in col.stats.flags)
        # A raw "|" in a cell (even inside a code span) ends the cell in
        # GFM tables, so escape it in user-supplied column names.
        safe_name = str(col_name).replace("|", "\\|")
        lines.append(
            f"| `{safe_name}` | {sem_type} | {missing_str} | {severity} | "
            f"{missingness_flags} | {numeric_flags} |"
        )
    lines.append("")

    lines.append("## Flagged Columns\n")
    flagged_columns = [
        col for col in self.columns.values() if not _is_clean_column(col)
    ]
    flagged_columns.sort(key=lambda col: (_flagged_column_tier(col), col.name))
    for col in flagged_columns:
        lines.append(f"### `{col.name}`\n")
        lines.extend(
            _format_dict_lines(_compact_column_detail(col, ds.feature_correlation))
        )
        lines.append("")

    if ds.target_correlations:
        lines.append("## Target Analysis\n")
        for target_name, corr in ds.target_correlations.items():
            lines.append(f"### Target: `{target_name}`\n")
            for feat in corr.analysed_numeric_columns:
                if feat == target_name:
                    continue
                top_pearson = _top_n_abs_correlations(corr.pearson_matrix, feat)
                top_spearman = _top_n_abs_correlations(corr.spearman_matrix, feat)
                lines.append(f"#### `{feat}`\n")
                lines.extend(
                    _format_dict_lines(
                        {
                            "top_pearson": [
                                f"{name}: {value}" for name, value in top_pearson
                            ],
                            "top_spearman": [
                                f"{name}: {value}" for name, value in top_spearman
                            ],
                        }
                    )
                )
                lines.append("")

    lines.append("## Sentinels\n")
    lines.extend(
        _format_dict_lines(
            {
                "numeric_sentinels": dict(self.numeric_sentinels),
                "string_sentinels": {
                    k: list(v) for k, v in self.string_sentinels.items()
                },
            }
        )
    )
    lines.append("")

    return "\n".join(lines).strip() + "\n"
|
|
487
|
+
|
|
488
|
+
def to_full_markdown(self) -> str:
|
|
489
|
+
"""
|
|
490
|
+
Produce a complete, lossless Markdown serialization for debugging and archival use.
|
|
491
|
+
|
|
492
|
+
Every field present in ``to_dict()`` — including histogram bins, full
|
|
493
|
+
correlation matrices, memory breakdown, and all per-column fields —
|
|
494
|
+
is rendered as Markdown. For an 82-column dataset this produces
|
|
495
|
+
roughly 1 MB of text; for human inspection of large datasets prefer
|
|
496
|
+
``to_markdown()``, which renders the compact view (ADR-0040).
|
|
242
497
|
|
|
243
498
|
Returns
|
|
244
499
|
-------
|
|
@@ -264,47 +519,26 @@ class StructuralProfileResult:
|
|
|
264
519
|
lines.append(f"| {col_name} | {sem_type} | {missing_str} | {severity} | {flags} |")
|
|
265
520
|
|
|
266
521
|
lines.append("\n## Column Details\n")
|
|
267
|
-
|
|
268
|
-
def _format_dict(d: dict, indent: int = 0) -> list[str]:
|
|
269
|
-
out = []
|
|
270
|
-
prefix = " " * indent
|
|
271
|
-
for k, v in d.items():
|
|
272
|
-
if isinstance(v, dict):
|
|
273
|
-
if not v:
|
|
274
|
-
out.append(f"{prefix}- **{k}**: (empty)")
|
|
275
|
-
else:
|
|
276
|
-
out.append(f"{prefix}- **{k}**:")
|
|
277
|
-
out.extend(_format_dict(v, indent + 1))
|
|
278
|
-
elif isinstance(v, list):
|
|
279
|
-
if not v:
|
|
280
|
-
out.append(f"{prefix}- **{k}**: (empty)")
|
|
281
|
-
else:
|
|
282
|
-
out.append(f"{prefix}- **{k}**:")
|
|
283
|
-
for item in v:
|
|
284
|
-
out.append(f"{prefix} - {item}")
|
|
285
|
-
else:
|
|
286
|
-
out.append(f"{prefix}- **{k}**: {v}")
|
|
287
|
-
return out
|
|
288
522
|
|
|
289
523
|
for col_name, col_data in data.get("columns", {}).items():
|
|
290
524
|
lines.append(f"### `{col_name}`\n")
|
|
291
|
-
lines.extend(
|
|
525
|
+
lines.extend(_format_dict_lines(col_data))
|
|
292
526
|
lines.append("")
|
|
293
527
|
|
|
294
528
|
lines.append("## Dataset\n")
|
|
295
|
-
lines.extend(
|
|
529
|
+
lines.extend(_format_dict_lines(data.get("dataset", {})))
|
|
296
530
|
lines.append("")
|
|
297
|
-
|
|
531
|
+
|
|
298
532
|
lines.append("## Targets\n")
|
|
299
|
-
lines.extend(
|
|
533
|
+
lines.extend(_format_dict_lines(data.get("targets", {})))
|
|
300
534
|
lines.append("")
|
|
301
535
|
|
|
302
536
|
lines.append("## Numeric Sentinels\n")
|
|
303
|
-
lines.extend(
|
|
537
|
+
lines.extend(_format_dict_lines(data.get("numeric_sentinels", {})))
|
|
304
538
|
lines.append("")
|
|
305
539
|
|
|
306
540
|
lines.append("## String Sentinels\n")
|
|
307
|
-
lines.extend(
|
|
541
|
+
lines.extend(_format_dict_lines(data.get("string_sentinels", {})))
|
|
308
542
|
lines.append("")
|
|
309
543
|
|
|
310
544
|
return "\n".join(lines).strip() + "\n"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_correlation_profiler.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_missingness_profiler.py
RENAMED
|
File without changes
|
{dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_nonlinearity_profiler.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dataforge_ml-2.0.1 → dataforge_ml-2.0.2}/src/dataforge_ml/profiling/_type_detection_config.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|