dataforge-ml 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. dataforge_ml-0.1.0.dist-info/METADATA +34 -0
  2. dataforge_ml-0.1.0.dist-info/RECORD +54 -0
  3. dataforge_ml-0.1.0.dist-info/WHEEL +5 -0
  4. dataforge_ml-0.1.0.dist-info/licenses/LICENSE +21 -0
  5. dataforge_ml-0.1.0.dist-info/top_level.txt +5 -0
  6. models/__init__.py +0 -0
  7. models/_data_structure.py +7 -0
  8. models/_data_types.py +12 -0
  9. profiling/__init__.py +35 -0
  10. profiling/_base.py +101 -0
  11. profiling/_boolean_config.py +37 -0
  12. profiling/_boolean_profiler.py +191 -0
  13. profiling/_categorical.py +315 -0
  14. profiling/_categorical_config.py +87 -0
  15. profiling/_correlation_config.py +225 -0
  16. profiling/_correlation_profiler.py +544 -0
  17. profiling/_datetime_config.py +98 -0
  18. profiling/_datetime_profiler.py +406 -0
  19. profiling/_missingness_config.py +137 -0
  20. profiling/_missingness_profiler.py +252 -0
  21. profiling/_numeric_config.py +116 -0
  22. profiling/_numeric_profiler.py +403 -0
  23. profiling/_tabular.py +249 -0
  24. profiling/_target_config.py +74 -0
  25. profiling/_target_profiler.py +156 -0
  26. profiling/_text_config.py +40 -0
  27. profiling/_text_profiler.py +194 -0
  28. profiling/_type_detector.py +463 -0
  29. profiling/config.py +236 -0
  30. profiling/structural.py +280 -0
  31. splitting/__init__.py +4 -0
  32. splitting/_config.py +56 -0
  33. splitting/_splitter.py +202 -0
  34. tests/__init__.py +0 -0
  35. tests/conftest.py +7 -0
  36. tests/integration/__init__.py +0 -0
  37. tests/integration/conftest.py +82 -0
  38. tests/integration/test_structural_end_to_end.py +219 -0
  39. tests/unit/__init__.py +0 -0
  40. tests/unit/profiling/__init__.py +0 -0
  41. tests/unit/profiling/conftest.py +81 -0
  42. tests/unit/profiling/test_boolean_profiler.py +91 -0
  43. tests/unit/profiling/test_categorical_profiler.py +182 -0
  44. tests/unit/profiling/test_correlation_profiler.py +124 -0
  45. tests/unit/profiling/test_datetime_profiler.py +133 -0
  46. tests/unit/profiling/test_missingness_profiler.py +51 -0
  47. tests/unit/profiling/test_numeric_profiler.py +212 -0
  48. tests/unit/profiling/test_target_profiler.py +44 -0
  49. tests/unit/profiling/test_text_profiler.py +61 -0
  50. tests/unit/profiling/test_type_detector.py +32 -0
  51. tests/unit/splitting/__init__.py +0 -0
  52. tests/unit/splitting/test_data_splitter.py +417 -0
  53. utils/__init__.py +0 -0
  54. utils/data_loader.py +110 -0
@@ -0,0 +1,74 @@
1
+ """
2
+ Configuration and result dataclasses for Target Variable profiling.
3
+
4
+ Determines the nature of the predictive task (Regression vs Classification)
5
+ and flags critical issues like missing labels or severe imbalances.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ from dataclasses import dataclass, field
10
+ from enum import StrEnum
11
+ from typing import Optional
12
+
13
+ from ._categorical_config import CategoricalColumnProfile
14
+ from ._numeric_config import ColumnNumericProfile
15
+
16
class TargetProblemType(StrEnum):
    """ML task type inferred from the target column's dtype and cardinality."""

    Regression = "regression"
    BinaryClassification = "binary_classification"
    MulticlassClassification = "multiclass_classification"
    # Fallback when the target is unusable as-is (identifier column,
    # over-cardinal string, constant column, etc.).
    Unknown = "unknown"
21
+
22
class TargetFlag(StrEnum):
    """Health warnings attached to a profiled target variable."""

    ContainsMissing = "contains_missing"  # Target has >0 missing values; must drop or reframe
    HighImbalance = "high_imbalance"  # Class ratio > 5 (requires handling in Phase 5)
    SevereImbalance = "severe_imbalance"  # Class ratio > 20 (accuracy metric is meaningless)
    HighlySkewed = "highly_skewed"  # Numeric target is severely skewed (consider log transform)
    IsIdentifier = "is_identifier"  # Target looks like an ID column (useless for modeling)
28
+
29
@dataclass
class TargetProfileResult:
    """
    Profile specific to the designated target variable.

    Carries the inferred problem type, target missingness, the underlying
    numeric or categorical sub-profile (whichever applies), and any raised
    health flags.
    """

    column: str
    problem_type: TargetProblemType

    # Missingness (Critical for targets)
    missing_count: int = 0
    missing_ratio: float = 0.0

    # Underlying profile data depending on the problem type
    numeric_profile: Optional[ColumnNumericProfile] = None
    categorical_profile: Optional[CategoricalColumnProfile] = None

    flags: list[TargetFlag] = field(default_factory=list)

    def has_flag(self, flag: TargetFlag) -> bool:
        """Return True when *flag* has been raised on this target."""
        return self.flags.count(flag) > 0

    def __str__(self) -> str:
        is_classification = self.problem_type in (
            TargetProblemType.BinaryClassification,
            TargetProblemType.MulticlassClassification,
        )

        out: list[str] = [
            "=== Target Variable Profile ===",
            f" Column : {self.column}",
            f" Problem Type : {self.problem_type}",
            f" Missingness : {self.missing_count:,} rows ({self.missing_ratio:.2%})",
        ]

        if self.has_flag(TargetFlag.ContainsMissing):
            out.append(" [!] WARNING: Target contains missing values. Imputation is not recommended.")

        if self.categorical_profile and is_classification:
            imb = self.categorical_profile.imbalance
            out.extend(
                [
                    f" Classes : {self.categorical_profile.cardinality:,}",
                    f" Class Ratio : {imb.class_ratio:.2f}",
                    f" Gini Impurity : {imb.gini_impurity:.4f}",
                ]
            )

        if self.numeric_profile and self.problem_type == TargetProblemType.Regression:
            out.append(f" Mean / Median : {self.numeric_profile.mean:.4f} / {self.numeric_profile.median:.4f}")
            # NOTE(review): TargetProfiler reads `skewness_severity` on the numeric
            # profile while this line reads `skew_severity` — confirm the actual
            # attribute name on ColumnNumericProfile; one of the two is likely stale.
            out.append(f" Skewness : {self.numeric_profile.skewness:.4f} [{self.numeric_profile.skew_severity}]")

        if self.flags:
            out.append(f" Flags : {', '.join(self.flags)}")

        return "\n".join(out)
@@ -0,0 +1,156 @@
1
+ """
2
+ TargetProfiler – Phase 1 extension: Target Variable Profiling.
3
+
4
+ Performs robust dtype detection to determine the problem framework
5
+ (Regression vs Classification) and assesses critical target health metrics:
6
+ 1. Target Missingness (Any missingness flags the dataset for row-dropping)
7
+ 2. Class Imbalance (For Classification tasks)
8
+ 3. Skewness / Normalcy (For Regression tasks)
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from typing import Any
14
+
15
+ import polars as pl
16
+
17
+ from ._base import DatasetLevelProfiler
18
+ from .config import ProfileConfig
19
+ from ._target_config import (
20
+ TargetFlag,
21
+ TargetProblemType,
22
+ TargetProfileResult,
23
+ )
24
+
25
+ # Reuse your internal profilers to prevent duplication
26
+ from ._type_detector import TypeDetector, TypeFlag, NumericKind
27
+ from ._missingness_profiler import MissingnessProfiler
28
+ from ._categorical import CategoricalProfiler
29
+ from ._numeric_profiler import NumericProfiler
30
+ from ._numeric_config import SkewSeverity
31
+
32
+
33
class TargetProfiler(DatasetLevelProfiler[TargetProfileResult]):
    """
    Analyzes the target variable to set up downstream ML behavior.

    Pipeline:
      1. Detect the column type and map it to an ML problem type.
      2. Measure target missingness (any missingness raises a flag).
      3. Run the categorical or numeric sub-profiler depending on the task.
    """

    def __init__(self, target_column: str, config: ProfileConfig | None = None) -> None:
        super().__init__(config)
        self.target_column = target_column

    def profile(self, data: pl.DataFrame, **kwargs) -> TargetProfileResult:
        """Profile the configured target column of *data*.

        Raises
        ------
        ValueError
            If the target column is absent from the DataFrame.
        """
        if self.target_column in data.columns:
            return self._run(data)
        raise ValueError(
            f"Target column '{self.target_column}' not found in the DataFrame."
        )

    def _run(self, df: pl.DataFrame) -> TargetProfileResult:
        target = df[self.target_column]
        total_rows = df.height

        # Step 1: type detection drives the problem-type decision.
        detected = TypeDetector(columns=[self.target_column]).detect(df)[
            self.target_column
        ]
        task = self._determine_problem_type(target, detected, total_rows)

        result = TargetProfileResult(column=self.target_column, problem_type=task)

        if detected.has_flag(TypeFlag.IdentifierColumn):
            result.flags.append(TargetFlag.IsIdentifier)

        # Step 2: missingness. Reuse MissingnessProfiler's per-column logic
        # so both standard and "effective" nulls are counted.
        miss, _ = MissingnessProfiler._profile_column(
            target, self.target_column, total_rows
        )
        result.missing_count = miss.effective_null_count
        result.missing_ratio = miss.effective_null_ratio
        if result.missing_count > 0:
            result.flags.append(TargetFlag.ContainsMissing)

        # Step 3: task-specific metrics.
        if task == TargetProblemType.Regression:
            self._profile_regression(target, total_rows, result)
        elif task in (
            TargetProblemType.BinaryClassification,
            TargetProblemType.MulticlassClassification,
        ):
            self._profile_classification(target, total_rows, result)

        return result

    def _determine_problem_type(
        self, series: pl.Series, type_info: Any, n_rows: int
    ) -> TargetProblemType:
        """Map TypeDetector results to an ML Problem Type with cardinality safety."""

        # Identifiers are never a valid prediction target.
        if type_info.has_flag(TypeFlag.IdentifierColumn):
            return TargetProblemType.Unknown

        # Booleans are binary classification by construction.
        if type_info.has_flag(TypeFlag.BooleanCandidate):
            return TargetProblemType.BinaryClassification

        # Strings, or integers flagged as encoded categories, are candidate
        # classification targets — subject to a cardinality safeguard.
        looks_categorical = series.dtype in (pl.Utf8, pl.String) or type_info.has_flag(
            TypeFlag.EncodedCategory
        )
        if looks_categorical:
            n_unique = series.drop_nulls().n_unique()

            # SAFEGUARD: free text, high-cardinality codes, or raw JSON
            # strings are not classification targets. > 100 classes is
            # usually beyond standard ML classification scope.
            too_many_classes = n_unique > 100
            too_unique = n_rows > 1000 and n_unique / max(n_rows, 1) > 0.05
            if too_many_classes or too_unique:
                return TargetProblemType.Unknown

            if n_unique == 2:
                return TargetProblemType.BinaryClassification
            if n_unique > 2:
                return TargetProblemType.MulticlassClassification
            # n_unique < 2 (constant / all-null column): fall through.

        # Genuine numerics regress. TypeDetector strips 'NumericKind' on
        # encoded categories, so class-coded ints like [0, 1, 2] never
        # land here by accident.
        if type_info.numeric_kind in (NumericKind.Continuous, NumericKind.Discrete):
            return TargetProblemType.Regression

        # String with too many unique values, unparsed datetime, etc.
        return TargetProblemType.Unknown

    def _profile_classification(
        self, series: pl.Series, n_rows: int, result: TargetProfileResult
    ) -> None:
        """Attach categorical metrics and raise imbalance flags."""
        # Internally computes cardinality, top values, and imbalance metrics.
        cat_profile = CategoricalProfiler(config=self.config)._profile_column(
            series, self.target_column, n_rows
        )
        result.categorical_profile = cat_profile

        class_ratio = cat_profile.imbalance.class_ratio
        if class_ratio > 20.0:
            result.flags.append(TargetFlag.SevereImbalance)
        elif class_ratio > 5.0:
            result.flags.append(TargetFlag.HighImbalance)

    def _profile_regression(
        self, series: pl.Series, n_rows: int, result: TargetProfileResult
    ) -> None:
        """Attach numeric metrics and raise a skewness flag when severe."""
        num_profile = NumericProfiler(config=self.config)._profile_column(series, n_rows)
        result.numeric_profile = num_profile

        # Highly skewed targets often require Log/Yeo-Johnson transforms.
        if num_profile.skewness_severity in (SkewSeverity.High, SkewSeverity.Severe):
            result.flags.append(TargetFlag.HighlySkewed)
@@ -0,0 +1,40 @@
1
+ """
2
+ Result dataclass for free-text column profiling.
3
+
4
+ Populated by TextProfiler.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from dataclasses import dataclass, field
10
+
11
+
12
@dataclass
class TextStats:
    """Per-column free-text statistics, populated by TextProfiler.

    Token/character metrics are computed over non-null rows only; the two
    ratios are computed over the total row count (nulls contribute 0).
    """

    avg_token_count: float = 0.0  # mean whitespace-delimited token count (non-null rows)
    median_token_count: float = 0.0  # median whitespace-delimited token count (non-null rows)
    vocabulary_size: int = 0  # distinct tokens across all non-null values
    char_length_min: int = 0  # shortest non-null string, in characters
    char_length_max: int = 0  # longest non-null string, in characters
    char_length_mean: float = 0.0  # mean character length over non-null strings
    char_length_median: float = 0.0  # median character length over non-null strings
    empty_ratio: float = 0.0  # fraction of ALL rows that are "" (nulls excluded)
    whitespace_ratio: float = 0.0  # fraction of ALL rows that strip to "" (includes empties)
23
+
24
+
25
@dataclass
class TextProfileResult:
    """
    Text profile for all eligible columns.

    Attributes
    ----------
    columns : dict[str, TextStats]
        Per-column text profiles, keyed by column name.
    analysed_columns : list[str]
        Columns that were actually profiled (after schema intersection
        and eligibility check).
    """

    # Both containers default to empty so an empty/no-eligible-column run
    # yields a valid, empty result object.
    columns: dict[str, TextStats] = field(default_factory=dict)
    analysed_columns: list[str] = field(default_factory=list)
@@ -0,0 +1,194 @@
1
+ """
2
+ TextProfiler – Phase 1 extension: Free-Text Column Profiling.
3
+
4
+ Handles columns classified as SemanticType.Text (free-text string columns).
5
+ All computation is Polars-native — no external NLP libraries, no language
6
+ detection.
7
+
8
+ Per-column metrics
9
+ ------------------
10
+ 1. avg_token_count – mean whitespace-split token count across non-null rows
11
+ 2. median_token_count – median whitespace-split token count across non-null rows
12
+ 3. vocabulary_size – count of distinct tokens across all non-null values
13
+ 4. char_length_min – shortest non-null string (characters)
14
+ 5. char_length_max – longest non-null string (characters)
15
+ 6. char_length_mean – mean character length across non-null strings
16
+ 7. char_length_median – median character length across non-null strings
17
+ 8. empty_ratio – fraction of total rows that are empty strings ("")
18
+ 9. whitespace_ratio – fraction of total rows that are whitespace-only
19
+ (includes empty strings, since strip → "")
20
+
21
+ Definitions
22
+ -----------
23
+ - "token" : any run of non-whitespace characters produced by
24
+ str.split_whitespace() semantics, i.e.
25
+ ``pl.col(c).str.split(" ")`` with empty-string elements
26
+ filtered out. We use Polars ``str.count_matches`` on
27
+ ``r"\\S+"`` which counts exactly these tokens in a single
28
+ vectorised pass.
29
+ - "empty string" : len == 0 after no stripping.
30
+ - "whitespace-only": len == 0 after str.strip_chars().
31
+ - Null values are excluded from all per-row metrics and from ratio
32
+ denominators **except** empty_ratio / whitespace_ratio, which are
33
+ computed over total row count (nulls contribute 0, not counted as empty).
34
+
35
+ Eligibility
36
+ -----------
37
+ A column is eligible when:
38
+ - It has a SemanticType.Text override in ProfileConfig.column_overrides, OR
39
+ - Its Polars dtype is pl.Utf8 (alias pl.String) and no other override is set.
40
+
41
+ Integration
42
+ -----------
43
+ Drop ``TextProfiler`` into the profiler loop in ``structural.py`` alongside
44
+ ``NumericProfiler``, ``CategoricalProfiler``, ``DatetimeProfiler``, and
45
+ ``BooleanProfiler``::
46
+
47
+ sub_result = TextProfiler(config=self.config).profile(data, columns=active_cols)
48
+ for col_name, col_stats in sub_result.columns.items():
49
+ result.columns.setdefault(col_name, ColumnProfile(name=col_name)).stats = col_stats
50
+ """
51
+
52
+ from __future__ import annotations
53
+
54
+ import polars as pl
55
+
56
+ from ._base import ColumnBatchProfiler
57
+ from .config import (
58
+ ProfileConfig,
59
+ TextStats,
60
+ SemanticType,
61
+ )
62
+ from ._text_config import TextProfileResult
63
+
64
+ # Regex that counts non-whitespace token runs — used with str.count_matches.
65
+ _TOKEN_PATTERN: str = r"\S+"
66
+
67
+
68
class TextProfiler(ColumnBatchProfiler[TextProfileResult]):
    """
    Free-text column profiler for Polars DataFrames.

    A column is eligible when:
    - It has a ``SemanticType.Text`` override in
      ``ProfileConfig.column_overrides``, OR
    - Its Polars dtype is ``pl.Utf8`` / ``pl.String`` and no override is set.

    Non-eligible columns are silently skipped.

    Parameters
    ----------
    config : ProfileConfig | None
        Shared profiling configuration.
    """

    def __init__(self, config: ProfileConfig | None = None) -> None:
        super().__init__(config)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def profile(
        self,
        data: pl.DataFrame,
        columns: list[str],
    ) -> TextProfileResult:
        """Profile every eligible free-text column among ``columns``."""
        return self._run(data, columns)

    # ------------------------------------------------------------------
    # Eligibility
    # ------------------------------------------------------------------

    def _eligible(self, series: pl.Series) -> bool:
        """Return True when ``series`` should be profiled as free text."""
        override = self.config.column_overrides.get(series.name)

        if override == SemanticType.Text:
            return True

        # Any other explicit override takes precedence over dtype sniffing.
        if override is not None:
            return False

        # Native string dtype (pl.Utf8 is the canonical name; pl.String is
        # an alias in newer Polars — check both for cross-version safety)
        return series.dtype in (pl.Utf8, pl.String)

    # ------------------------------------------------------------------
    # Orchestration
    # ------------------------------------------------------------------

    def _run(
        self,
        df: pl.DataFrame,
        columns: list[str],
    ) -> TextProfileResult:
        """Resolve requested columns, filter to eligible ones, profile each."""
        result = TextProfileResult()

        available = [
            c
            for c in self._resolve_columns(df.columns, columns)
            if self._eligible(df[c])
        ]
        result.analysed_columns = available

        for col_name in available:
            result.columns[col_name] = self._profile_column(
                df[col_name], df.height
            )

        return result

    # ------------------------------------------------------------------
    # Per-column driver
    # ------------------------------------------------------------------

    def _profile_column(
        self,
        series: pl.Series,
        n_rows: int,
    ) -> TextStats:
        """Compute all TextStats metrics for a single string column.

        Parameters
        ----------
        series : pl.Series
            The string column to profile.
        n_rows : int
            Total row count of the parent DataFrame; denominator for the
            empty/whitespace ratios.
        """
        profile = TextStats()

        if n_rows == 0:
            return profile

        # ── 1. Empty / whitespace ratios (computed over ALL rows, nulls → 0) ──
        # null rows do not count as empty or whitespace-only.
        non_null_mask = series.is_not_null()
        empty_mask = non_null_mask & (series.str.len_chars() == 0)
        stripped = series.str.strip_chars()
        whitespace_mask = non_null_mask & (stripped.str.len_chars() == 0)

        profile.empty_ratio = float(empty_mask.sum()) / n_rows
        profile.whitespace_ratio = float(whitespace_mask.sum()) / n_rows

        # ── 2. Work on non-null values only from here on ─────────────────────
        non_null = series.drop_nulls()
        n_non_null = non_null.len()

        if n_non_null == 0:
            return profile

        # ── 3. Token counts (whitespace-split, Polars regex count) ────────────
        # str.count_matches counts non-overlapping matches of r"\S+",
        # which is exactly the set of whitespace-delimited tokens.
        token_counts: pl.Series = non_null.str.count_matches(_TOKEN_PATTERN)

        profile.avg_token_count = float(token_counts.mean())  # type: ignore[arg-type]
        profile.median_token_count = float(token_counts.median())  # type: ignore[arg-type]

        # ── 4. Vocabulary size ───────────────────────────────────────────────
        # BUG FIX: this previously used str.split(" "), which splits on literal
        # spaces only. Tokens separated by tabs or newlines were glued into a
        # single vocabulary entry, contradicting the r"\S+" token definition
        # used for the counts above. extract_all(r"\S+") yields exactly the
        # documented tokens (empty strings never match, so no filtering needed).
        tokens = non_null.str.extract_all(_TOKEN_PATTERN).explode().drop_nulls()
        profile.vocabulary_size = tokens.n_unique()

        # ── 5. Character-length distribution ─────────────────────────────────
        char_lengths: pl.Series = non_null.str.len_chars().cast(pl.Float64)

        profile.char_length_min = int(char_lengths.min())  # type: ignore[arg-type]
        profile.char_length_max = int(char_lengths.max())  # type: ignore[arg-type]
        profile.char_length_mean = float(char_lengths.mean())  # type: ignore[arg-type]
        profile.char_length_median = float(char_lengths.median())  # type: ignore[arg-type]

        return profile