dataforge-ml 0.3.0__tar.gz → 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/PKG-INFO +3 -1
- {dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/pyproject.toml +3 -1
- {dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/src/dataforge_ml/profiling/_boolean_config.py +9 -0
- {dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/src/dataforge_ml/profiling/_categorical_config.py +29 -0
- {dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/src/dataforge_ml/profiling/_correlation_config.py +37 -0
- {dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/src/dataforge_ml/profiling/_datetime_config.py +25 -0
- {dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/src/dataforge_ml/profiling/_missingness_config.py +13 -0
- {dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/src/dataforge_ml/profiling/_numeric_config.py +36 -0
- {dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/src/dataforge_ml/profiling/_target_config.py +11 -0
- {dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/src/dataforge_ml/profiling/_text_config.py +13 -0
- {dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/src/dataforge_ml/profiling/config.py +52 -0
- {dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/src/dataforge_ml.egg-info/PKG-INFO +3 -1
- {dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/src/dataforge_ml.egg-info/requires.txt +2 -0
- {dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/LICENSE +0 -0
- {dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/README.md +0 -0
- {dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/setup.cfg +0 -0
- {dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/src/dataforge_ml/__init__.py +0 -0
- {dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/src/dataforge_ml/models/__init__.py +0 -0
- {dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/src/dataforge_ml/models/_data_structure.py +0 -0
- {dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/src/dataforge_ml/models/_data_types.py +0 -0
- {dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/src/dataforge_ml/profiling/__init__.py +0 -0
- {dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/src/dataforge_ml/profiling/_base.py +0 -0
- {dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/src/dataforge_ml/profiling/_boolean_profiler.py +0 -0
- {dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/src/dataforge_ml/profiling/_categorical.py +0 -0
- {dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/src/dataforge_ml/profiling/_correlation_profiler.py +0 -0
- {dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/src/dataforge_ml/profiling/_datetime_profiler.py +0 -0
- {dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/src/dataforge_ml/profiling/_missingness_profiler.py +0 -0
- {dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/src/dataforge_ml/profiling/_numeric_profiler.py +0 -0
- {dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/src/dataforge_ml/profiling/_tabular.py +0 -0
- {dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/src/dataforge_ml/profiling/_target_profiler.py +0 -0
- {dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/src/dataforge_ml/profiling/_text_profiler.py +0 -0
- {dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/src/dataforge_ml/profiling/_type_detector.py +0 -0
- {dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/src/dataforge_ml/profiling/structural.py +0 -0
- {dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/src/dataforge_ml/splitting/__init__.py +0 -0
- {dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/src/dataforge_ml/splitting/_config.py +0 -0
- {dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/src/dataforge_ml/splitting/_splitter.py +0 -0
- {dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/src/dataforge_ml/utils/__init__.py +0 -0
- {dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/src/dataforge_ml/utils/data_loader.py +0 -0
- {dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/src/dataforge_ml.egg-info/SOURCES.txt +0 -0
- {dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/src/dataforge_ml.egg-info/dependency_links.txt +0 -0
- {dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/src/dataforge_ml.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dataforge-ml
|
|
3
|
-
Version: 0.3.0
|
|
3
|
+
Version: 0.5.0
|
|
4
4
|
Summary: A automated feature engineering and designing pipeline library
|
|
5
5
|
License: MIT
|
|
6
6
|
Classifier: License :: OSI Approved :: MIT License
|
|
@@ -15,6 +15,8 @@ Requires-Dist: polars>=1.0.0
|
|
|
15
15
|
Requires-Dist: scikit-learn>=1.0.0
|
|
16
16
|
Requires-Dist: scipy>=1.10.0
|
|
17
17
|
Requires-Dist: numpy>=2.0.0
|
|
18
|
+
Requires-Dist: pandas>=2.0.0
|
|
19
|
+
Requires-Dist: chardet>=5.0.0
|
|
18
20
|
Provides-Extra: dev
|
|
19
21
|
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
20
22
|
Dynamic: license-file
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "dataforge-ml"
|
|
7
|
-
version = "0.3.0"
|
|
7
|
+
version = "0.5.0"
|
|
8
8
|
description = "A automated feature engineering and designing pipeline library"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.10"
|
|
@@ -21,6 +21,8 @@ dependencies = [
|
|
|
21
21
|
"scikit-learn>=1.0.0",
|
|
22
22
|
"scipy>=1.10.0",
|
|
23
23
|
"numpy>=2.0.0",
|
|
24
|
+
"pandas>=2.0.0",
|
|
25
|
+
"chardet>=5.0.0",
|
|
24
26
|
]
|
|
25
27
|
|
|
26
28
|
[project.optional-dependencies]
|
|
@@ -18,6 +18,15 @@ class BooleanStats:
|
|
|
18
18
|
false_ratio: float = 0.0
|
|
19
19
|
mode: Optional[bool] = None
|
|
20
20
|
|
|
21
|
+
def to_dict(self) -> dict:
|
|
22
|
+
return {
|
|
23
|
+
"true_count": self.true_count,
|
|
24
|
+
"false_count": self.false_count,
|
|
25
|
+
"true_ratio": self.true_ratio,
|
|
26
|
+
"false_ratio": self.false_ratio,
|
|
27
|
+
"mode": self.mode,
|
|
28
|
+
}
|
|
29
|
+
|
|
21
30
|
|
|
22
31
|
@dataclass
|
|
23
32
|
class BooleanProfileResult:
|
|
@@ -27,6 +27,9 @@ class TopValueEntry:
|
|
|
27
27
|
count: int
|
|
28
28
|
percentage: float
|
|
29
29
|
|
|
30
|
+
def to_dict(self) -> dict:
|
|
31
|
+
return {"value": self.value, "count": self.count, "percentage": self.percentage}
|
|
32
|
+
|
|
30
33
|
|
|
31
34
|
@dataclass
|
|
32
35
|
class RareCategoryStats:
|
|
@@ -35,6 +38,14 @@ class RareCategoryStats:
|
|
|
35
38
|
total_rare_rows: int = 0
|
|
36
39
|
rare_row_percentage: float = 0.0
|
|
37
40
|
|
|
41
|
+
def to_dict(self) -> dict:
|
|
42
|
+
return {
|
|
43
|
+
"threshold_pct": self.threshold_pct,
|
|
44
|
+
"rare_category_count": self.rare_category_count,
|
|
45
|
+
"total_rare_rows": self.total_rare_rows,
|
|
46
|
+
"rare_row_percentage": self.rare_row_percentage,
|
|
47
|
+
}
|
|
48
|
+
|
|
38
49
|
|
|
39
50
|
@dataclass
|
|
40
51
|
class ImbalanceMetrics:
|
|
@@ -42,6 +53,13 @@ class ImbalanceMetrics:
|
|
|
42
53
|
shannon_entropy: float = 0.0
|
|
43
54
|
gini_impurity: float = 0.0
|
|
44
55
|
|
|
56
|
+
def to_dict(self) -> dict:
|
|
57
|
+
return {
|
|
58
|
+
"class_ratio": self.class_ratio,
|
|
59
|
+
"shannon_entropy": self.shannon_entropy,
|
|
60
|
+
"gini_impurity": self.gini_impurity,
|
|
61
|
+
}
|
|
62
|
+
|
|
45
63
|
|
|
46
64
|
@dataclass
|
|
47
65
|
class CategoricalStats:
|
|
@@ -55,6 +73,17 @@ class CategoricalStats:
|
|
|
55
73
|
imbalance: ImbalanceMetrics = field(default_factory=ImbalanceMetrics)
|
|
56
74
|
flags: list[CategoricalFlag] = field(default_factory=list)
|
|
57
75
|
|
|
76
|
+
def to_dict(self) -> dict:
|
|
77
|
+
return {
|
|
78
|
+
"cardinality": self.cardinality,
|
|
79
|
+
"unique_ratio": self.unique_ratio,
|
|
80
|
+
"mode_frequency": self.mode_frequency,
|
|
81
|
+
"top_values": [v.to_dict() for v in self.top_values],
|
|
82
|
+
"rare_categories": self.rare_categories.to_dict(),
|
|
83
|
+
"imbalance": self.imbalance.to_dict(),
|
|
84
|
+
"flags": [str(f) for f in self.flags],
|
|
85
|
+
}
|
|
86
|
+
|
|
58
87
|
|
|
59
88
|
CategoricalColumnProfile = CategoricalStats
|
|
60
89
|
|
|
@@ -66,6 +66,13 @@ class CorrelationPair:
|
|
|
66
66
|
spearman_r: Optional[float] = None
|
|
67
67
|
near_redundant: bool = False
|
|
68
68
|
|
|
69
|
+
def to_dict(self) -> dict:
|
|
70
|
+
return {
|
|
71
|
+
"col_a": self.col_a, "col_b": self.col_b,
|
|
72
|
+
"pearson_r": self.pearson_r, "spearman_r": self.spearman_r,
|
|
73
|
+
"near_redundant": self.near_redundant,
|
|
74
|
+
}
|
|
75
|
+
|
|
69
76
|
|
|
70
77
|
# ---------------------------------------------------------------------------
|
|
71
78
|
# Feature–target entries
|
|
@@ -84,6 +91,9 @@ class NumericTargetCorrelation:
|
|
|
84
91
|
feature: str
|
|
85
92
|
pearson_r: Optional[float] = None
|
|
86
93
|
|
|
94
|
+
def to_dict(self) -> dict:
|
|
95
|
+
return {"feature": self.feature, "pearson_r": self.pearson_r}
|
|
96
|
+
|
|
87
97
|
|
|
88
98
|
@dataclass
|
|
89
99
|
class CategoricalTargetCorrelation:
|
|
@@ -108,6 +118,12 @@ class CategoricalTargetCorrelation:
|
|
|
108
118
|
p_value: Optional[float] = None
|
|
109
119
|
eta_squared: Optional[float] = None
|
|
110
120
|
|
|
121
|
+
def to_dict(self) -> dict:
|
|
122
|
+
return {
|
|
123
|
+
"feature": self.feature, "f_statistic": self.f_statistic,
|
|
124
|
+
"p_value": self.p_value, "eta_squared": self.eta_squared,
|
|
125
|
+
}
|
|
126
|
+
|
|
111
127
|
|
|
112
128
|
# ---------------------------------------------------------------------------
|
|
113
129
|
# Mutual information
|
|
@@ -131,6 +147,9 @@ class MutualInformationEntry:
|
|
|
131
147
|
mi_score: float = 0.0
|
|
132
148
|
rank: int = 0
|
|
133
149
|
|
|
150
|
+
def to_dict(self) -> dict:
|
|
151
|
+
return {"feature": self.feature, "mi_score": self.mi_score, "rank": self.rank}
|
|
152
|
+
|
|
134
153
|
|
|
135
154
|
# ---------------------------------------------------------------------------
|
|
136
155
|
# Near-redundancy summary
|
|
@@ -148,6 +167,9 @@ class NearRedundancyGroup:
|
|
|
148
167
|
columns: list[str] = field(default_factory=list)
|
|
149
168
|
suggested_drop: list[str] = field(default_factory=list)
|
|
150
169
|
|
|
170
|
+
def to_dict(self) -> dict:
|
|
171
|
+
return {"columns": list(self.columns), "suggested_drop": list(self.suggested_drop)}
|
|
172
|
+
|
|
151
173
|
|
|
152
174
|
# ---------------------------------------------------------------------------
|
|
153
175
|
# Top-level result
|
|
@@ -223,3 +245,18 @@ class CorrelationProfileResult:
|
|
|
223
245
|
|
|
224
246
|
def get_spearman(self, col_a: str, col_b: str) -> Optional[float]:
|
|
225
247
|
return self.spearman_matrix.get(col_a, {}).get(col_b)
|
|
248
|
+
|
|
249
|
+
def to_dict(self) -> dict:
|
|
250
|
+
return {
|
|
251
|
+
"analysed_numeric_columns": list(self.analysed_numeric_columns),
|
|
252
|
+
"pearson_matrix": {k: dict(v) for k, v in self.pearson_matrix.items()},
|
|
253
|
+
"spearman_matrix": {k: dict(v) for k, v in self.spearman_matrix.items()},
|
|
254
|
+
"pairwise": [p.to_dict() for p in self.pairwise],
|
|
255
|
+
"near_redundant_pairs": [p.to_dict() for p in self.near_redundant_pairs],
|
|
256
|
+
"near_redundancy_groups": [g.to_dict() for g in self.near_redundancy_groups],
|
|
257
|
+
"target_column": self.target_column,
|
|
258
|
+
"target_type": str(self.target_type) if self.target_type else None,
|
|
259
|
+
"feature_target_numeric": [f.to_dict() for f in self.feature_target_numeric],
|
|
260
|
+
"feature_target_categorical": [f.to_dict() for f in self.feature_target_categorical],
|
|
261
|
+
"mutual_information": [m.to_dict() for m in self.mutual_information],
|
|
262
|
+
}
|
|
@@ -58,6 +58,18 @@ class TemporalSignals:
|
|
|
58
58
|
features.append("is_month_end")
|
|
59
59
|
return features
|
|
60
60
|
|
|
61
|
+
def to_dict(self) -> dict:
|
|
62
|
+
return {
|
|
63
|
+
"has_year": self.has_year,
|
|
64
|
+
"has_month": self.has_month,
|
|
65
|
+
"has_day": self.has_day,
|
|
66
|
+
"has_day_of_week": self.has_day_of_week,
|
|
67
|
+
"has_hour": self.has_hour,
|
|
68
|
+
"has_is_weekend": self.has_is_weekend,
|
|
69
|
+
"has_is_month_end": self.has_is_month_end,
|
|
70
|
+
"extractable_features": self.extractable_features(),
|
|
71
|
+
}
|
|
72
|
+
|
|
61
73
|
|
|
62
74
|
@dataclass
|
|
63
75
|
class DatetimeStats:
|
|
@@ -74,6 +86,19 @@ class DatetimeStats:
|
|
|
74
86
|
def has_flag(self, flag: DatetimeFlag) -> bool:
|
|
75
87
|
return flag in self.flags
|
|
76
88
|
|
|
89
|
+
def to_dict(self) -> dict:
|
|
90
|
+
return {
|
|
91
|
+
"min_date": self.min_date,
|
|
92
|
+
"max_date": self.max_date,
|
|
93
|
+
"date_range_days": self.date_range_days,
|
|
94
|
+
"future_date_count": self.future_date_count,
|
|
95
|
+
"inferred_granularity": str(self.inferred_granularity) if self.inferred_granularity else None,
|
|
96
|
+
"median_gap_seconds": self.median_gap_seconds,
|
|
97
|
+
"gap_cv": self.gap_cv,
|
|
98
|
+
"signals": self.signals.to_dict(),
|
|
99
|
+
"flags": [str(f) for f in self.flags],
|
|
100
|
+
}
|
|
101
|
+
|
|
77
102
|
|
|
78
103
|
@dataclass
|
|
79
104
|
class DatetimeProfileResult:
|
|
@@ -81,6 +81,19 @@ class ColumnMissingnessProfile:
|
|
|
81
81
|
def has_flag(self, flag: MissingnessFlag) -> bool:
|
|
82
82
|
return flag in self.flags
|
|
83
83
|
|
|
84
|
+
def to_dict(self) -> dict:
|
|
85
|
+
return {
|
|
86
|
+
"column": self.column,
|
|
87
|
+
"total_rows": self.total_rows,
|
|
88
|
+
"standard_null_count": self.standard_null_count,
|
|
89
|
+
"effective_null_count": self.effective_null_count,
|
|
90
|
+
"standard_null_ratio": self.standard_null_ratio,
|
|
91
|
+
"effective_null_ratio": self.effective_null_ratio,
|
|
92
|
+
"severity": str(self.severity) if self.severity else None,
|
|
93
|
+
"flags": [str(f) for f in self.flags],
|
|
94
|
+
"correlated_with": list(self.correlated_with),
|
|
95
|
+
}
|
|
96
|
+
|
|
84
97
|
def __str__(self) -> str: # pragma: no cover
|
|
85
98
|
lines = [
|
|
86
99
|
f" Column : {self.column}",
|
|
@@ -28,6 +28,12 @@ class PercentileSnapshot:
|
|
|
28
28
|
return self.p75 - self.p25
|
|
29
29
|
return None
|
|
30
30
|
|
|
31
|
+
def to_dict(self) -> dict:
|
|
32
|
+
return {
|
|
33
|
+
"p1": self.p1, "p5": self.p5, "p25": self.p25, "p50": self.p50,
|
|
34
|
+
"p75": self.p75, "p95": self.p95, "p99": self.p99,
|
|
35
|
+
}
|
|
36
|
+
|
|
31
37
|
|
|
32
38
|
class SkewSeverity(StrEnum):
|
|
33
39
|
Normal = "normal"
|
|
@@ -53,6 +59,9 @@ class NumericTopValueEntry:
|
|
|
53
59
|
count: int
|
|
54
60
|
percentage: float
|
|
55
61
|
|
|
62
|
+
def to_dict(self) -> dict:
|
|
63
|
+
return {"value": self.value, "count": self.count, "percentage": self.percentage}
|
|
64
|
+
|
|
56
65
|
|
|
57
66
|
@dataclass
|
|
58
67
|
class HistogramBin:
|
|
@@ -61,6 +70,12 @@ class HistogramBin:
|
|
|
61
70
|
count: int
|
|
62
71
|
percentage: float
|
|
63
72
|
|
|
73
|
+
def to_dict(self) -> dict:
|
|
74
|
+
return {
|
|
75
|
+
"lower_bound": self.lower_bound, "upper_bound": self.upper_bound,
|
|
76
|
+
"count": self.count, "percentage": self.percentage,
|
|
77
|
+
}
|
|
78
|
+
|
|
64
79
|
|
|
65
80
|
@dataclass
|
|
66
81
|
class NumericStats:
|
|
@@ -89,6 +104,27 @@ class NumericStats:
|
|
|
89
104
|
def has_flag(self, flag: NumericFlag) -> bool:
|
|
90
105
|
return flag in self.flags
|
|
91
106
|
|
|
107
|
+
def to_dict(self) -> dict:
|
|
108
|
+
return {
|
|
109
|
+
"mean": self.mean,
|
|
110
|
+
"median": self.median,
|
|
111
|
+
"mean_median_ratio": self.mean_median_ratio,
|
|
112
|
+
"mode": self.mode,
|
|
113
|
+
"mode_frequency": self.mode_frequency,
|
|
114
|
+
"top_values": [v.to_dict() for v in self.top_values],
|
|
115
|
+
"histogram": [b.to_dict() for b in self.histogram],
|
|
116
|
+
"std": self.std,
|
|
117
|
+
"variance": self.variance,
|
|
118
|
+
"min": self.min,
|
|
119
|
+
"max": self.max,
|
|
120
|
+
"percentiles": self.percentiles.to_dict(),
|
|
121
|
+
"skewness": self.skewness,
|
|
122
|
+
"kurtosis": self.kurtosis,
|
|
123
|
+
"skewness_severity": str(self.skewness_severity) if self.skewness_severity else None,
|
|
124
|
+
"kurtosis_tag": str(self.kurtosis_tag) if self.kurtosis_tag else None,
|
|
125
|
+
"flags": [str(f) for f in self.flags],
|
|
126
|
+
}
|
|
127
|
+
|
|
92
128
|
|
|
93
129
|
ColumnNumericProfile = NumericStats
|
|
94
130
|
|
|
@@ -47,6 +47,17 @@ class TargetProfileResult:
|
|
|
47
47
|
def has_flag(self, flag: TargetFlag) -> bool:
|
|
48
48
|
return flag in self.flags
|
|
49
49
|
|
|
50
|
+
def to_dict(self) -> dict:
|
|
51
|
+
return {
|
|
52
|
+
"column": self.column,
|
|
53
|
+
"problem_type": str(self.problem_type),
|
|
54
|
+
"missing_count": self.missing_count,
|
|
55
|
+
"missing_ratio": self.missing_ratio,
|
|
56
|
+
"numeric_profile": self.numeric_profile.to_dict() if self.numeric_profile else None,
|
|
57
|
+
"categorical_profile": self.categorical_profile.to_dict() if self.categorical_profile else None,
|
|
58
|
+
"flags": [str(f) for f in self.flags],
|
|
59
|
+
}
|
|
60
|
+
|
|
50
61
|
def __str__(self) -> str:
|
|
51
62
|
lines = [
|
|
52
63
|
"=== Target Variable Profile ===",
|
|
@@ -21,6 +21,19 @@ class TextStats:
|
|
|
21
21
|
empty_ratio: float = 0.0
|
|
22
22
|
whitespace_ratio: float = 0.0
|
|
23
23
|
|
|
24
|
+
def to_dict(self) -> dict:
|
|
25
|
+
return {
|
|
26
|
+
"avg_token_count": self.avg_token_count,
|
|
27
|
+
"median_token_count": self.median_token_count,
|
|
28
|
+
"vocabulary_size": self.vocabulary_size,
|
|
29
|
+
"char_length_min": self.char_length_min,
|
|
30
|
+
"char_length_max": self.char_length_max,
|
|
31
|
+
"char_length_mean": self.char_length_mean,
|
|
32
|
+
"char_length_median": self.char_length_median,
|
|
33
|
+
"empty_ratio": self.empty_ratio,
|
|
34
|
+
"whitespace_ratio": self.whitespace_ratio,
|
|
35
|
+
}
|
|
36
|
+
|
|
24
37
|
|
|
25
38
|
@dataclass
|
|
26
39
|
class TextProfileResult:
|
|
@@ -91,6 +91,18 @@ class ColumnProfile:
|
|
|
91
91
|
is_target: bool = False
|
|
92
92
|
stats: Optional[AnyStats] = None
|
|
93
93
|
|
|
94
|
+
def to_dict(self) -> dict:
|
|
95
|
+
return {
|
|
96
|
+
"name": self.name,
|
|
97
|
+
"semantic_type": str(self.semantic_type) if self.semantic_type else None,
|
|
98
|
+
"type_flags": [str(f) for f in self.type_flags],
|
|
99
|
+
"original_dtype": self.original_dtype,
|
|
100
|
+
"inferred_dtype": self.inferred_dtype,
|
|
101
|
+
"missingness": self.missingness.to_dict() if self.missingness else None,
|
|
102
|
+
"is_target": self.is_target,
|
|
103
|
+
"stats": self.stats.to_dict() if self.stats else None,
|
|
104
|
+
}
|
|
105
|
+
|
|
94
106
|
|
|
95
107
|
@dataclass
|
|
96
108
|
class RowMissingnessDistribution:
|
|
@@ -106,6 +118,16 @@ class RowMissingnessDistribution:
|
|
|
106
118
|
pct_over_half_missing: float = 0.0
|
|
107
119
|
drop_candidate_row_count: int = 0
|
|
108
120
|
|
|
121
|
+
def to_dict(self) -> dict:
|
|
122
|
+
return {
|
|
123
|
+
"pct_zero_missing": self.pct_zero_missing,
|
|
124
|
+
"pct_one_to_two": self.pct_one_to_two,
|
|
125
|
+
"pct_three_to_five": self.pct_three_to_five,
|
|
126
|
+
"pct_over_five": self.pct_over_five,
|
|
127
|
+
"pct_over_half_missing": self.pct_over_half_missing,
|
|
128
|
+
"drop_candidate_row_count": self.drop_candidate_row_count,
|
|
129
|
+
}
|
|
130
|
+
|
|
109
131
|
|
|
110
132
|
@dataclass
|
|
111
133
|
class MemoryBreakdown:
|
|
@@ -118,6 +140,9 @@ class MemoryBreakdown:
|
|
|
118
140
|
def top_consumers(self, n: int = 10) -> list[tuple[str, int]]:
|
|
119
141
|
return self.sorted_by_usage[:n]
|
|
120
142
|
|
|
143
|
+
def to_dict(self) -> dict:
|
|
144
|
+
return {"column_bytes": dict(self.column_bytes)}
|
|
145
|
+
|
|
121
146
|
|
|
122
147
|
@dataclass
|
|
123
148
|
class DatasetStats:
|
|
@@ -141,6 +166,23 @@ class DatasetStats:
|
|
|
141
166
|
default_factory=dict,
|
|
142
167
|
)
|
|
143
168
|
|
|
169
|
+
def to_dict(self) -> dict:
|
|
170
|
+
return {
|
|
171
|
+
"modality": str(self.modality),
|
|
172
|
+
"row_count": self.row_count,
|
|
173
|
+
"column_count": self.column_count,
|
|
174
|
+
"memory_bytes": self.memory_bytes,
|
|
175
|
+
"memory_breakdown": self.memory_breakdown.to_dict() if self.memory_breakdown else None,
|
|
176
|
+
"duplicate_count": self.duplicate_count,
|
|
177
|
+
"duplicate_ratio": self.duplicate_ratio,
|
|
178
|
+
"overall_sparsity": self.overall_sparsity,
|
|
179
|
+
"was_chunked": self.was_chunked,
|
|
180
|
+
"missingness_matrix": self.missingness_matrix,
|
|
181
|
+
"row_distribution": self.row_distribution.to_dict(),
|
|
182
|
+
"feature_correlation": self.feature_correlation.to_dict() if self.feature_correlation else None,
|
|
183
|
+
"target_correlations": {k: v.to_dict() for k, v in self.target_correlations.items()},
|
|
184
|
+
}
|
|
185
|
+
|
|
144
186
|
|
|
145
187
|
@dataclass
|
|
146
188
|
class StructuralProfileResult:
|
|
@@ -148,6 +190,16 @@ class StructuralProfileResult:
|
|
|
148
190
|
dataset: DatasetStats = field(default_factory=DatasetStats)
|
|
149
191
|
targets: dict[str, TargetProfileResult] = field(default_factory=dict)
|
|
150
192
|
|
|
193
|
+
def to_dict(self) -> dict:
|
|
194
|
+
return {
|
|
195
|
+
"columns": {k: v.to_dict() for k, v in self.columns.items()},
|
|
196
|
+
"dataset": self.dataset.to_dict(),
|
|
197
|
+
"targets": {k: v.to_dict() for k, v in self.targets.items()},
|
|
198
|
+
}
|
|
199
|
+
|
|
200
|
+
def to_json(self, indent: int = 2) -> str:
|
|
201
|
+
return json.dumps(self.to_dict(), indent=indent, default=str)
|
|
202
|
+
|
|
151
203
|
|
|
152
204
|
# ---------------------------------------------------------------------------
|
|
153
205
|
# ProfileConfig — clean break from per-profiler column lists
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dataforge-ml
|
|
3
|
-
Version: 0.3.0
|
|
3
|
+
Version: 0.5.0
|
|
4
4
|
Summary: A automated feature engineering and designing pipeline library
|
|
5
5
|
License: MIT
|
|
6
6
|
Classifier: License :: OSI Approved :: MIT License
|
|
@@ -15,6 +15,8 @@ Requires-Dist: polars>=1.0.0
|
|
|
15
15
|
Requires-Dist: scikit-learn>=1.0.0
|
|
16
16
|
Requires-Dist: scipy>=1.10.0
|
|
17
17
|
Requires-Dist: numpy>=2.0.0
|
|
18
|
+
Requires-Dist: pandas>=2.0.0
|
|
19
|
+
Requires-Dist: chardet>=5.0.0
|
|
18
20
|
Provides-Extra: dev
|
|
19
21
|
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
20
22
|
Dynamic: license-file
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/src/dataforge_ml/profiling/_correlation_profiler.py
RENAMED
|
File without changes
|
|
File without changes
|
{dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/src/dataforge_ml/profiling/_missingness_profiler.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|