PyPI - dataforge-ml - Versions diffs - 0.3.0__tar.gz → 0.5.0__tar.gz - Mend

dataforge-ml 0.3.0.tar.gz → 0.5.0.tar.gz

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (41) hide show

{dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dataforge-ml
-Version: 0.3.0
+Version: 0.5.0
 Summary: A automated feature engineering and designing pipeline library
 License: MIT
 Classifier: License :: OSI Approved :: MIT License
@@ -15,6 +15,8 @@ Requires-Dist: polars>=1.0.0
 Requires-Dist: scikit-learn>=1.0.0
 Requires-Dist: scipy>=1.10.0
 Requires-Dist: numpy>=2.0.0
+Requires-Dist: pandas>=2.0.0
+Requires-Dist: chardet>=5.0.0
 Provides-Extra: dev
 Requires-Dist: pytest>=8.0; extra == "dev"
 Dynamic: license-file

{dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "dataforge-ml"
-version = "0.3.0"
+version = "0.5.0"
 description = "A automated feature engineering and designing pipeline library"
 readme = "README.md"
 requires-python = ">=3.10"
@@ -21,6 +21,8 @@ dependencies = [
     "scikit-learn>=1.0.0",
     "scipy>=1.10.0",
     "numpy>=2.0.0",
+    "pandas>=2.0.0",
+    "chardet>=5.0.0",
 ]
 [project.optional-dependencies]

{dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/src/dataforge_ml/profiling/_boolean_config.py RENAMED Viewed

@@ -18,6 +18,15 @@ class BooleanStats:
     false_ratio: float = 0.0
     mode: Optional[bool] = None
+    def to_dict(self) -> dict:
+        return {
+            "true_count": self.true_count,
+            "false_count": self.false_count,
+            "true_ratio": self.true_ratio,
+            "false_ratio": self.false_ratio,
+            "mode": self.mode,
+        }
 @dataclass
 class BooleanProfileResult:

{dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/src/dataforge_ml/profiling/_categorical_config.py RENAMED Viewed

@@ -27,6 +27,9 @@ class TopValueEntry:
     count: int
     percentage: float
+    def to_dict(self) -> dict:
+        return {"value": self.value, "count": self.count, "percentage": self.percentage}
 @dataclass
 class RareCategoryStats:
@@ -35,6 +38,14 @@ class RareCategoryStats:
     total_rare_rows: int = 0
     rare_row_percentage: float = 0.0
+    def to_dict(self) -> dict:
+        return {
+            "threshold_pct": self.threshold_pct,
+            "rare_category_count": self.rare_category_count,
+            "total_rare_rows": self.total_rare_rows,
+            "rare_row_percentage": self.rare_row_percentage,
+        }
 @dataclass
 class ImbalanceMetrics:
@@ -42,6 +53,13 @@ class ImbalanceMetrics:
     shannon_entropy: float = 0.0
     gini_impurity: float = 0.0
+    def to_dict(self) -> dict:
+        return {
+            "class_ratio": self.class_ratio,
+            "shannon_entropy": self.shannon_entropy,
+            "gini_impurity": self.gini_impurity,
+        }
 @dataclass
 class CategoricalStats:
@@ -55,6 +73,17 @@ class CategoricalStats:
     imbalance: ImbalanceMetrics = field(default_factory=ImbalanceMetrics)
     flags: list[CategoricalFlag] = field(default_factory=list)
+    def to_dict(self) -> dict:
+        return {
+            "cardinality": self.cardinality,
+            "unique_ratio": self.unique_ratio,
+            "mode_frequency": self.mode_frequency,
+            "top_values": [v.to_dict() for v in self.top_values],
+            "rare_categories": self.rare_categories.to_dict(),
+            "imbalance": self.imbalance.to_dict(),
+            "flags": [str(f) for f in self.flags],
+        }
 CategoricalColumnProfile = CategoricalStats

{dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/src/dataforge_ml/profiling/_correlation_config.py RENAMED Viewed

@@ -66,6 +66,13 @@ class CorrelationPair:
     spearman_r: Optional[float] = None
     near_redundant: bool = False
+    def to_dict(self) -> dict:
+        return {
+            "col_a": self.col_a, "col_b": self.col_b,
+            "pearson_r": self.pearson_r, "spearman_r": self.spearman_r,
+            "near_redundant": self.near_redundant,
+        }
 # ---------------------------------------------------------------------------
 # Feature–target entries
@@ -84,6 +91,9 @@ class NumericTargetCorrelation:
     feature:   str
     pearson_r: Optional[float] = None
+    def to_dict(self) -> dict:
+        return {"feature": self.feature, "pearson_r": self.pearson_r}
 @dataclass
 class CategoricalTargetCorrelation:
@@ -108,6 +118,12 @@ class CategoricalTargetCorrelation:
     p_value:     Optional[float] = None
     eta_squared: Optional[float] = None
+    def to_dict(self) -> dict:
+        return {
+            "feature": self.feature, "f_statistic": self.f_statistic,
+            "p_value": self.p_value, "eta_squared": self.eta_squared,
+        }
 # ---------------------------------------------------------------------------
 # Mutual information
@@ -131,6 +147,9 @@ class MutualInformationEntry:
     mi_score: float = 0.0
     rank:     int   = 0
+    def to_dict(self) -> dict:
+        return {"feature": self.feature, "mi_score": self.mi_score, "rank": self.rank}
 # ---------------------------------------------------------------------------
 # Near-redundancy summary
@@ -148,6 +167,9 @@ class NearRedundancyGroup:
     columns:       list[str] = field(default_factory=list)
     suggested_drop: list[str] = field(default_factory=list)
+    def to_dict(self) -> dict:
+        return {"columns": list(self.columns), "suggested_drop": list(self.suggested_drop)}
 # ---------------------------------------------------------------------------
 # Top-level result
@@ -223,3 +245,18 @@ class CorrelationProfileResult:
     def get_spearman(self, col_a: str, col_b: str) -> Optional[float]:
         return self.spearman_matrix.get(col_a, {}).get(col_b)
+    def to_dict(self) -> dict:
+        return {
+            "analysed_numeric_columns": list(self.analysed_numeric_columns),
+            "pearson_matrix": {k: dict(v) for k, v in self.pearson_matrix.items()},
+            "spearman_matrix": {k: dict(v) for k, v in self.spearman_matrix.items()},
+            "pairwise": [p.to_dict() for p in self.pairwise],
+            "near_redundant_pairs": [p.to_dict() for p in self.near_redundant_pairs],
+            "near_redundancy_groups": [g.to_dict() for g in self.near_redundancy_groups],
+            "target_column": self.target_column,
+            "target_type": str(self.target_type) if self.target_type else None,
+            "feature_target_numeric": [f.to_dict() for f in self.feature_target_numeric],
+            "feature_target_categorical": [f.to_dict() for f in self.feature_target_categorical],
+            "mutual_information": [m.to_dict() for m in self.mutual_information],
+        }

{dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/src/dataforge_ml/profiling/_datetime_config.py RENAMED Viewed

@@ -58,6 +58,18 @@ class TemporalSignals:
             features.append("is_month_end")
         return features
+    def to_dict(self) -> dict:
+        return {
+            "has_year": self.has_year,
+            "has_month": self.has_month,
+            "has_day": self.has_day,
+            "has_day_of_week": self.has_day_of_week,
+            "has_hour": self.has_hour,
+            "has_is_weekend": self.has_is_weekend,
+            "has_is_month_end": self.has_is_month_end,
+            "extractable_features": self.extractable_features(),
+        }
 @dataclass
 class DatetimeStats:
@@ -74,6 +86,19 @@ class DatetimeStats:
     def has_flag(self, flag: DatetimeFlag) -> bool:
         return flag in self.flags
+    def to_dict(self) -> dict:
+        return {
+            "min_date": self.min_date,
+            "max_date": self.max_date,
+            "date_range_days": self.date_range_days,
+            "future_date_count": self.future_date_count,
+            "inferred_granularity": str(self.inferred_granularity) if self.inferred_granularity else None,
+            "median_gap_seconds": self.median_gap_seconds,
+            "gap_cv": self.gap_cv,
+            "signals": self.signals.to_dict(),
+            "flags": [str(f) for f in self.flags],
+        }
 @dataclass
 class DatetimeProfileResult:

{dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/src/dataforge_ml/profiling/_missingness_config.py RENAMED Viewed

@@ -81,6 +81,19 @@ class ColumnMissingnessProfile:
     def has_flag(self, flag: MissingnessFlag) -> bool:
         return flag in self.flags
+    def to_dict(self) -> dict:
+        return {
+            "column": self.column,
+            "total_rows": self.total_rows,
+            "standard_null_count": self.standard_null_count,
+            "effective_null_count": self.effective_null_count,
+            "standard_null_ratio": self.standard_null_ratio,
+            "effective_null_ratio": self.effective_null_ratio,
+            "severity": str(self.severity) if self.severity else None,
+            "flags": [str(f) for f in self.flags],
+            "correlated_with": list(self.correlated_with),
+        }
     def __str__(self) -> str:  # pragma: no cover
         lines = [
             f"  Column : {self.column}",

{dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/src/dataforge_ml/profiling/_numeric_config.py RENAMED Viewed

@@ -28,6 +28,12 @@ class PercentileSnapshot:
             return self.p75 - self.p25
         return None
+    def to_dict(self) -> dict:
+        return {
+            "p1": self.p1, "p5": self.p5, "p25": self.p25, "p50": self.p50,
+            "p75": self.p75, "p95": self.p95, "p99": self.p99,
+        }
 class SkewSeverity(StrEnum):
     Normal = "normal"
@@ -53,6 +59,9 @@ class NumericTopValueEntry:
     count: int
     percentage: float
+    def to_dict(self) -> dict:
+        return {"value": self.value, "count": self.count, "percentage": self.percentage}
 @dataclass
 class HistogramBin:
@@ -61,6 +70,12 @@ class HistogramBin:
     count: int
     percentage: float
+    def to_dict(self) -> dict:
+        return {
+            "lower_bound": self.lower_bound, "upper_bound": self.upper_bound,
+            "count": self.count, "percentage": self.percentage,
+        }
 @dataclass
 class NumericStats:
@@ -89,6 +104,27 @@ class NumericStats:
     def has_flag(self, flag: NumericFlag) -> bool:
         return flag in self.flags
+    def to_dict(self) -> dict:
+        return {
+            "mean": self.mean,
+            "median": self.median,
+            "mean_median_ratio": self.mean_median_ratio,
+            "mode": self.mode,
+            "mode_frequency": self.mode_frequency,
+            "top_values": [v.to_dict() for v in self.top_values],
+            "histogram": [b.to_dict() for b in self.histogram],
+            "std": self.std,
+            "variance": self.variance,
+            "min": self.min,
+            "max": self.max,
+            "percentiles": self.percentiles.to_dict(),
+            "skewness": self.skewness,
+            "kurtosis": self.kurtosis,
+            "skewness_severity": str(self.skewness_severity) if self.skewness_severity else None,
+            "kurtosis_tag": str(self.kurtosis_tag) if self.kurtosis_tag else None,
+            "flags": [str(f) for f in self.flags],
+        }
 ColumnNumericProfile = NumericStats

{dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/src/dataforge_ml/profiling/_target_config.py RENAMED Viewed

@@ -47,6 +47,17 @@ class TargetProfileResult:
     def has_flag(self, flag: TargetFlag) -> bool:
         return flag in self.flags
+    def to_dict(self) -> dict:
+        return {
+            "column": self.column,
+            "problem_type": str(self.problem_type),
+            "missing_count": self.missing_count,
+            "missing_ratio": self.missing_ratio,
+            "numeric_profile": self.numeric_profile.to_dict() if self.numeric_profile else None,
+            "categorical_profile": self.categorical_profile.to_dict() if self.categorical_profile else None,
+            "flags": [str(f) for f in self.flags],
+        }
     def __str__(self) -> str:
         lines = [
             "=== Target Variable Profile ===",

{dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/src/dataforge_ml/profiling/_text_config.py RENAMED Viewed

@@ -21,6 +21,19 @@ class TextStats:
     empty_ratio: float = 0.0
     whitespace_ratio: float = 0.0
+    def to_dict(self) -> dict:
+        return {
+            "avg_token_count": self.avg_token_count,
+            "median_token_count": self.median_token_count,
+            "vocabulary_size": self.vocabulary_size,
+            "char_length_min": self.char_length_min,
+            "char_length_max": self.char_length_max,
+            "char_length_mean": self.char_length_mean,
+            "char_length_median": self.char_length_median,
+            "empty_ratio": self.empty_ratio,
+            "whitespace_ratio": self.whitespace_ratio,
+        }
 @dataclass
 class TextProfileResult:

{dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/src/dataforge_ml/profiling/config.py RENAMED Viewed

@@ -91,6 +91,18 @@ class ColumnProfile:
     is_target: bool = False
     stats: Optional[AnyStats] = None
+    def to_dict(self) -> dict:
+        return {
+            "name": self.name,
+            "semantic_type": str(self.semantic_type) if self.semantic_type else None,
+            "type_flags": [str(f) for f in self.type_flags],
+            "original_dtype": self.original_dtype,
+            "inferred_dtype": self.inferred_dtype,
+            "missingness": self.missingness.to_dict() if self.missingness else None,
+            "is_target": self.is_target,
+            "stats": self.stats.to_dict() if self.stats else None,
+        }
 @dataclass
 class RowMissingnessDistribution:
@@ -106,6 +118,16 @@ class RowMissingnessDistribution:
     pct_over_half_missing: float = 0.0
     drop_candidate_row_count: int = 0
+    def to_dict(self) -> dict:
+        return {
+            "pct_zero_missing": self.pct_zero_missing,
+            "pct_one_to_two": self.pct_one_to_two,
+            "pct_three_to_five": self.pct_three_to_five,
+            "pct_over_five": self.pct_over_five,
+            "pct_over_half_missing": self.pct_over_half_missing,
+            "drop_candidate_row_count": self.drop_candidate_row_count,
+        }
 @dataclass
 class MemoryBreakdown:
@@ -118,6 +140,9 @@ class MemoryBreakdown:
     def top_consumers(self, n: int = 10) -> list[tuple[str, int]]:
         return self.sorted_by_usage[:n]
+    def to_dict(self) -> dict:
+        return {"column_bytes": dict(self.column_bytes)}
 @dataclass
 class DatasetStats:
@@ -141,6 +166,23 @@ class DatasetStats:
         default_factory=dict,
     )
+    def to_dict(self) -> dict:
+        return {
+            "modality": str(self.modality),
+            "row_count": self.row_count,
+            "column_count": self.column_count,
+            "memory_bytes": self.memory_bytes,
+            "memory_breakdown": self.memory_breakdown.to_dict() if self.memory_breakdown else None,
+            "duplicate_count": self.duplicate_count,
+            "duplicate_ratio": self.duplicate_ratio,
+            "overall_sparsity": self.overall_sparsity,
+            "was_chunked": self.was_chunked,
+            "missingness_matrix": self.missingness_matrix,
+            "row_distribution": self.row_distribution.to_dict(),
+            "feature_correlation": self.feature_correlation.to_dict() if self.feature_correlation else None,
+            "target_correlations": {k: v.to_dict() for k, v in self.target_correlations.items()},
+        }
 @dataclass
 class StructuralProfileResult:
@@ -148,6 +190,16 @@ class StructuralProfileResult:
     dataset: DatasetStats = field(default_factory=DatasetStats)
     targets: dict[str, TargetProfileResult] = field(default_factory=dict)
+    def to_dict(self) -> dict:
+        return {
+            "columns": {k: v.to_dict() for k, v in self.columns.items()},
+            "dataset": self.dataset.to_dict(),
+            "targets": {k: v.to_dict() for k, v in self.targets.items()},
+        }
+    def to_json(self, indent: int = 2) -> str:
+        return json.dumps(self.to_dict(), indent=indent, default=str)
 # ---------------------------------------------------------------------------
 # ProfileConfig — clean break from per-profiler column lists

{dataforge_ml-0.3.0 → dataforge_ml-0.5.0}/src/dataforge_ml.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dataforge-ml
-Version: 0.3.0
+Version: 0.5.0
 Summary: A automated feature engineering and designing pipeline library
 License: MIT
 Classifier: License :: OSI Approved :: MIT License
@@ -15,6 +15,8 @@ Requires-Dist: polars>=1.0.0
 Requires-Dist: scikit-learn>=1.0.0
 Requires-Dist: scipy>=1.10.0
 Requires-Dist: numpy>=2.0.0
+Requires-Dist: pandas>=2.0.0
+Requires-Dist: chardet>=5.0.0
 Provides-Extra: dev
 Requires-Dist: pytest>=8.0; extra == "dev"
 Dynamic: license-file