dataforge-ml 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataforge_ml-0.1.0.dist-info/METADATA +34 -0
- dataforge_ml-0.1.0.dist-info/RECORD +54 -0
- dataforge_ml-0.1.0.dist-info/WHEEL +5 -0
- dataforge_ml-0.1.0.dist-info/licenses/LICENSE +21 -0
- dataforge_ml-0.1.0.dist-info/top_level.txt +5 -0
- models/__init__.py +0 -0
- models/_data_structure.py +7 -0
- models/_data_types.py +12 -0
- profiling/__init__.py +35 -0
- profiling/_base.py +101 -0
- profiling/_boolean_config.py +37 -0
- profiling/_boolean_profiler.py +191 -0
- profiling/_categorical.py +315 -0
- profiling/_categorical_config.py +87 -0
- profiling/_correlation_config.py +225 -0
- profiling/_correlation_profiler.py +544 -0
- profiling/_datetime_config.py +98 -0
- profiling/_datetime_profiler.py +406 -0
- profiling/_missingness_config.py +137 -0
- profiling/_missingness_profiler.py +252 -0
- profiling/_numeric_config.py +116 -0
- profiling/_numeric_profiler.py +403 -0
- profiling/_tabular.py +249 -0
- profiling/_target_config.py +74 -0
- profiling/_target_profiler.py +156 -0
- profiling/_text_config.py +40 -0
- profiling/_text_profiler.py +194 -0
- profiling/_type_detector.py +463 -0
- profiling/config.py +236 -0
- profiling/structural.py +280 -0
- splitting/__init__.py +4 -0
- splitting/_config.py +56 -0
- splitting/_splitter.py +202 -0
- tests/__init__.py +0 -0
- tests/conftest.py +7 -0
- tests/integration/__init__.py +0 -0
- tests/integration/conftest.py +82 -0
- tests/integration/test_structural_end_to_end.py +219 -0
- tests/unit/__init__.py +0 -0
- tests/unit/profiling/__init__.py +0 -0
- tests/unit/profiling/conftest.py +81 -0
- tests/unit/profiling/test_boolean_profiler.py +91 -0
- tests/unit/profiling/test_categorical_profiler.py +182 -0
- tests/unit/profiling/test_correlation_profiler.py +124 -0
- tests/unit/profiling/test_datetime_profiler.py +133 -0
- tests/unit/profiling/test_missingness_profiler.py +51 -0
- tests/unit/profiling/test_numeric_profiler.py +212 -0
- tests/unit/profiling/test_target_profiler.py +44 -0
- tests/unit/profiling/test_text_profiler.py +61 -0
- tests/unit/profiling/test_type_detector.py +32 -0
- tests/unit/splitting/__init__.py +0 -0
- tests/unit/splitting/test_data_splitter.py +417 -0
- utils/__init__.py +0 -0
- utils/data_loader.py +110 -0
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Configuration and result dataclasses for Target Variable profiling.
|
|
3
|
+
|
|
4
|
+
Determines the nature of the predictive task (Regression vs Classification)
|
|
5
|
+
and flags critical issues like missing labels or severe imbalances.
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from dataclasses import dataclass, field
|
|
10
|
+
from enum import StrEnum
|
|
11
|
+
from typing import Optional
|
|
12
|
+
|
|
13
|
+
from ._categorical_config import CategoricalColumnProfile
|
|
14
|
+
from ._numeric_config import ColumnNumericProfile
|
|
15
|
+
|
|
16
|
+
class TargetProblemType(StrEnum):
|
|
17
|
+
Regression = "regression"
|
|
18
|
+
BinaryClassification = "binary_classification"
|
|
19
|
+
MulticlassClassification = "multiclass_classification"
|
|
20
|
+
Unknown = "unknown"
|
|
21
|
+
|
|
22
|
+
class TargetFlag(StrEnum):
|
|
23
|
+
ContainsMissing = "contains_missing" # Target has >0 missing values; must drop or reframe
|
|
24
|
+
HighImbalance = "high_imbalance" # Class ratio > 5 (requires handling in Phase 5)
|
|
25
|
+
SevereImbalance = "severe_imbalance" # Class ratio > 20 (accuracy metric is meaningless)
|
|
26
|
+
HighlySkewed = "highly_skewed" # Numeric target is severely skewed (consider log transform)
|
|
27
|
+
IsIdentifier = "is_identifier" # Target looks like an ID column (useless for modeling)
|
|
28
|
+
|
|
29
|
+
@dataclass
class TargetProfileResult:
    """
    Profile specific to the designated target variable.
    """

    # Identity and detected problem framing.
    column: str
    problem_type: TargetProblemType

    # Missingness (Critical for targets)
    missing_count: int = 0
    missing_ratio: float = 0.0

    # Underlying profile data depending on the problem type
    numeric_profile: Optional[ColumnNumericProfile] = None
    categorical_profile: Optional[CategoricalColumnProfile] = None

    flags: list[TargetFlag] = field(default_factory=list)

    def has_flag(self, flag: TargetFlag) -> bool:
        """Return True when *flag* was raised for this target."""
        return flag in self.flags

    def __str__(self) -> str:
        report: list[str] = [
            "=== Target Variable Profile ===",
            f"  Column : {self.column}",
            f"  Problem Type : {self.problem_type}",
            f"  Missingness : {self.missing_count:,} rows ({self.missing_ratio:.2%})",
        ]

        if TargetFlag.ContainsMissing in self.flags:
            report.append(" [!] WARNING: Target contains missing values. Imputation is not recommended.")

        is_classification = self.problem_type in (
            TargetProblemType.BinaryClassification,
            TargetProblemType.MulticlassClassification,
        )
        if self.categorical_profile and is_classification:
            imbalance = self.categorical_profile.imbalance
            report.extend(
                [
                    f"  Classes : {self.categorical_profile.cardinality:,}",
                    f"  Class Ratio : {imbalance.class_ratio:.2f}",
                    f"  Gini Impurity : {imbalance.gini_impurity:.4f}",
                ]
            )

        if self.numeric_profile and self.problem_type == TargetProblemType.Regression:
            # NOTE(review): this reads `skew_severity`, while TargetProfiler
            # reads `skewness_severity` when flagging skew — confirm which
            # attribute ColumnNumericProfile actually defines.
            report.extend(
                [
                    f"  Mean / Median : {self.numeric_profile.mean:.4f} / {self.numeric_profile.median:.4f}",
                    f"  Skewness : {self.numeric_profile.skewness:.4f} [{self.numeric_profile.skew_severity}]",
                ]
            )

        if self.flags:
            report.append(f"  Flags : {', '.join(self.flags)}")

        return "\n".join(report)
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
"""
|
|
2
|
+
TargetProfiler – Phase 1 extension: Target Variable Profiling.
|
|
3
|
+
|
|
4
|
+
Performs robust dtype detection to determine the problem framework
|
|
5
|
+
(Regression vs Classification) and assesses critical target health metrics:
|
|
6
|
+
1. Target Missingness (Any missingness flags the dataset for row-dropping)
|
|
7
|
+
2. Class Imbalance (For Classification tasks)
|
|
8
|
+
3. Skewness / Normalcy (For Regression tasks)
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
import polars as pl
|
|
16
|
+
|
|
17
|
+
from ._base import DatasetLevelProfiler
|
|
18
|
+
from .config import ProfileConfig
|
|
19
|
+
from ._target_config import (
|
|
20
|
+
TargetFlag,
|
|
21
|
+
TargetProblemType,
|
|
22
|
+
TargetProfileResult,
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
# Reuse your internal profilers to prevent duplication
|
|
26
|
+
from ._type_detector import TypeDetector, TypeFlag, NumericKind
|
|
27
|
+
from ._missingness_profiler import MissingnessProfiler
|
|
28
|
+
from ._categorical import CategoricalProfiler
|
|
29
|
+
from ._numeric_profiler import NumericProfiler
|
|
30
|
+
from ._numeric_config import SkewSeverity
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class TargetProfiler(DatasetLevelProfiler[TargetProfileResult]):
    """
    Analyzes the target variable to set up downstream ML behavior.

    Maps the detected column type to an ML problem framing, measures
    target missingness, and raises imbalance / skewness flags.
    """

    def __init__(self, target_column: str, config: ProfileConfig | None = None) -> None:
        super().__init__(config)
        self.target_column = target_column

    def profile(self, data: pl.DataFrame, **kwargs) -> TargetProfileResult:
        """Profile the configured target column of *data*.

        Raises
        ------
        ValueError
            If the target column is absent from the DataFrame.
        """
        if self.target_column not in data.columns:
            raise ValueError(
                f"Target column '{self.target_column}' not found in the DataFrame."
            )
        return self._run(data)

    def _run(self, df: pl.DataFrame) -> TargetProfileResult:
        target = df[self.target_column]
        total_rows = df.height

        # Detect the semantic type of the target, then map it to a framing.
        type_info = TypeDetector(columns=[self.target_column]).detect(df)[
            self.target_column
        ]
        problem_type = self._determine_problem_type(target, type_info, total_rows)

        result = TargetProfileResult(
            column=self.target_column, problem_type=problem_type
        )

        if type_info.has_flag(TypeFlag.IdentifierColumn):
            result.flags.append(TargetFlag.IsIdentifier)

        # Missingness: reuse MissingnessProfiler's column routine so the
        # "effective null" definition lives in exactly one place.
        missing_profile, _ = MissingnessProfiler._profile_column(
            target, self.target_column, total_rows
        )
        result.missing_count = missing_profile.effective_null_count
        result.missing_ratio = missing_profile.effective_null_ratio
        if result.missing_count > 0:
            result.flags.append(TargetFlag.ContainsMissing)

        # Problem-specific deep dive.
        classification_types = (
            TargetProblemType.BinaryClassification,
            TargetProblemType.MulticlassClassification,
        )
        if problem_type in classification_types:
            self._profile_classification(target, total_rows, result)
        elif problem_type == TargetProblemType.Regression:
            self._profile_regression(target, total_rows, result)

        return result

    def _determine_problem_type(
        self, series: pl.Series, type_info: Any, n_rows: int
    ) -> TargetProblemType:
        """Map TypeDetector results to an ML Problem Type with cardinality safety."""
        # Identifier-like targets carry no signal — reject outright.
        if type_info.has_flag(TypeFlag.IdentifierColumn):
            return TargetProblemType.Unknown

        # Boolean targets are binary classification by definition.
        if type_info.has_flag(TypeFlag.BooleanCandidate):
            return TargetProblemType.BinaryClassification

        # Strings, or integers acting as category codes.
        looks_categorical = (
            series.dtype in (pl.Utf8, pl.String)
            or type_info.has_flag(TypeFlag.EncodedCategory)
        )
        if looks_categorical:
            distinct = series.drop_nulls().n_unique()

            # SAFEGUARD: too many distinct labels (free text, high-cardinality
            # IDs, raw JSON) is beyond standard classification scope.
            # Threshold: > 100 classes, or > 5% unique on frames > 1000 rows.
            too_many_classes = distinct > 100
            too_sparse = distinct / max(n_rows, 1) > 0.05 and n_rows > 1000
            if too_many_classes or too_sparse:
                return TargetProblemType.Unknown
            if distinct == 2:
                return TargetProblemType.BinaryClassification
            if distinct > 2:
                return TargetProblemType.MulticlassClassification
            # 0 or 1 distinct values (empty / constant target): fall through.

        # Genuine numerics regress. TypeDetector strips NumericKind when a
        # column was flagged EncodedCategory, so [0, 1, 2] class codes never
        # land here as a regression target.
        if type_info.numeric_kind in (NumericKind.Continuous, NumericKind.Discrete):
            return TargetProblemType.Regression

        # Unparsed datetimes, over-cardinal strings, etc.
        return TargetProblemType.Unknown

    def _profile_classification(
        self, series: pl.Series, n_rows: int, result: TargetProfileResult
    ) -> None:
        """Generates categorical metrics and checks for class imbalance."""
        # Delegate cardinality, top values, and imbalance computation.
        profile = CategoricalProfiler(config=self.config)._profile_column(
            series, self.target_column, n_rows
        )
        result.categorical_profile = profile

        ratio = profile.imbalance.class_ratio
        if ratio > 20.0:
            result.flags.append(TargetFlag.SevereImbalance)
        elif ratio > 5.0:
            result.flags.append(TargetFlag.HighImbalance)

    def _profile_regression(
        self, series: pl.Series, n_rows: int, result: TargetProfileResult
    ) -> None:
        """Generates numeric metrics and checks for target skewness."""
        profile = NumericProfiler(config=self.config)._profile_column(series, n_rows)
        result.numeric_profile = profile

        # Highly skewed targets often want a Log / Yeo-Johnson transform.
        # NOTE(review): this reads `skewness_severity`, while
        # TargetProfileResult.__str__ formats `skew_severity` — confirm which
        # attribute ColumnNumericProfile actually defines.
        if profile.skewness_severity in (SkewSeverity.High, SkewSeverity.Severe):
            result.flags.append(TargetFlag.HighlySkewed)
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Result dataclass for free-text column profiling.
|
|
3
|
+
|
|
4
|
+
Populated by TextProfiler.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from dataclasses import dataclass, field
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class TextStats:
    """Per-column free-text metrics produced by TextProfiler."""

    # Token-level metrics (whitespace-delimited tokens).
    avg_token_count: float = 0.0
    median_token_count: float = 0.0
    vocabulary_size: int = 0

    # Character-length distribution over non-null values.
    char_length_min: int = 0
    char_length_max: int = 0
    char_length_mean: float = 0.0
    char_length_median: float = 0.0

    # Ratios computed over ALL rows (nulls never count as empty/whitespace).
    empty_ratio: float = 0.0
    whitespace_ratio: float = 0.0
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass
class TextProfileResult:
    """
    Text profile for all eligible columns.

    Attributes
    ----------
    columns : dict[str, TextStats]
        Per-column text profiles, keyed by column name.
    analysed_columns : list[str]
        Columns that were actually profiled (after schema intersection
        and eligibility check).
    """

    columns: dict[str, TextStats] = field(default_factory=dict)
    analysed_columns: list[str] = field(default_factory=list)
|
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
"""
|
|
2
|
+
TextProfiler – Phase 1 extension: Free-Text Column Profiling.
|
|
3
|
+
|
|
4
|
+
Handles columns classified as SemanticType.Text (free-text string columns).
|
|
5
|
+
All computation is Polars-native — no external NLP libraries, no language
|
|
6
|
+
detection.
|
|
7
|
+
|
|
8
|
+
Per-column metrics
|
|
9
|
+
------------------
|
|
10
|
+
1. avg_token_count – mean whitespace-split token count across non-null rows
|
|
11
|
+
2. median_token_count – median whitespace-split token count across non-null rows
|
|
12
|
+
3. vocabulary_size – count of distinct tokens across all non-null values
|
|
13
|
+
4. char_length_min – shortest non-null string (characters)
|
|
14
|
+
5. char_length_max – longest non-null string (characters)
|
|
15
|
+
6. char_length_mean – mean character length across non-null strings
|
|
16
|
+
7. char_length_median – median character length across non-null strings
|
|
17
|
+
8. empty_ratio – fraction of total rows that are empty strings ("")
|
|
18
|
+
9. whitespace_ratio – fraction of total rows that are whitespace-only
|
|
19
|
+
(includes empty strings, since strip → "")
|
|
20
|
+
|
|
21
|
+
Definitions
|
|
22
|
+
-----------
|
|
23
|
+
- "token" : any run of non-whitespace characters produced by
|
|
24
|
+
str.split_whitespace() semantics, i.e.
|
|
25
|
+
``pl.col(c).str.split(" ")`` with empty-string elements
|
|
26
|
+
filtered out. We use Polars ``str.count_matches`` on
|
|
27
|
+
``r"\\S+"`` which counts exactly these tokens in a single
|
|
28
|
+
vectorised pass.
|
|
29
|
+
- "empty string" : len == 0 after no stripping.
|
|
30
|
+
- "whitespace-only": len == 0 after str.strip_chars().
|
|
31
|
+
- Null values are excluded from all per-row metrics and from ratio
|
|
32
|
+
denominators **except** empty_ratio / whitespace_ratio, which are
|
|
33
|
+
computed over total row count (nulls contribute 0, not counted as empty).
|
|
34
|
+
|
|
35
|
+
Eligibility
|
|
36
|
+
-----------
|
|
37
|
+
A column is eligible when:
|
|
38
|
+
- It has a SemanticType.Text override in ProfileConfig.column_overrides, OR
|
|
39
|
+
- Its Polars dtype is pl.Utf8 (alias pl.String) and no other override is set.
|
|
40
|
+
|
|
41
|
+
Integration
|
|
42
|
+
-----------
|
|
43
|
+
Drop ``TextProfiler`` into the profiler loop in ``structural.py`` alongside
|
|
44
|
+
``NumericProfiler``, ``CategoricalProfiler``, ``DatetimeProfiler``, and
|
|
45
|
+
``BooleanProfiler``::
|
|
46
|
+
|
|
47
|
+
sub_result = TextProfiler(config=self.config).profile(data, columns=active_cols)
|
|
48
|
+
for col_name, col_stats in sub_result.columns.items():
|
|
49
|
+
result.columns.setdefault(col_name, ColumnProfile(name=col_name)).stats = col_stats
|
|
50
|
+
"""
|
|
51
|
+
|
|
52
|
+
from __future__ import annotations
|
|
53
|
+
|
|
54
|
+
import polars as pl
|
|
55
|
+
|
|
56
|
+
from ._base import ColumnBatchProfiler
|
|
57
|
+
from .config import (
|
|
58
|
+
ProfileConfig,
|
|
59
|
+
TextStats,
|
|
60
|
+
SemanticType,
|
|
61
|
+
)
|
|
62
|
+
from ._text_config import TextProfileResult
|
|
63
|
+
|
|
64
|
+
# Regex that counts non-whitespace token runs — used with str.count_matches.
|
|
65
|
+
_TOKEN_PATTERN: str = r"\S+"
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class TextProfiler(ColumnBatchProfiler[TextProfileResult]):
    """
    Free-text column profiler for Polars DataFrames.

    A column is eligible when:
    - It has a ``SemanticType.Text`` override in
      ``ProfileConfig.column_overrides``, OR
    - Its Polars dtype is ``pl.Utf8`` / ``pl.String`` and no override is set.

    Non-eligible columns are silently skipped.

    Parameters
    ----------
    config : ProfileConfig | None
        Shared profiling configuration.
    """

    def __init__(self, config: ProfileConfig | None = None) -> None:
        super().__init__(config)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def profile(
        self,
        data: pl.DataFrame,
        columns: list[str],
    ) -> TextProfileResult:
        """Profile every eligible free-text column among *columns*."""
        return self._run(data, columns)

    # ------------------------------------------------------------------
    # Eligibility
    # ------------------------------------------------------------------

    def _eligible(self, series: pl.Series) -> bool:
        """Return True when *series* should be treated as free text."""
        override = self.config.column_overrides.get(series.name)

        if override == SemanticType.Text:
            return True

        # Any other explicit override takes precedence
        if override is not None:
            return False

        # Native string dtype (pl.Utf8 is the canonical name; pl.String is
        # an alias in newer Polars — check both for cross-version safety)
        return series.dtype in (pl.Utf8, pl.String)

    # ------------------------------------------------------------------
    # Orchestration
    # ------------------------------------------------------------------

    def _run(
        self,
        df: pl.DataFrame,
        columns: list[str],
    ) -> TextProfileResult:
        """Resolve + filter the requested columns, then profile each one."""
        result = TextProfileResult()

        available = [
            c
            for c in self._resolve_columns(df.columns, columns)
            if self._eligible(df[c])
        ]
        result.analysed_columns = available

        for col_name in available:
            result.columns[col_name] = self._profile_column(
                df[col_name], df.height
            )

        return result

    # ------------------------------------------------------------------
    # Per-column driver
    # ------------------------------------------------------------------

    def _profile_column(
        self,
        series: pl.Series,
        n_rows: int,
    ) -> TextStats:
        """Compute all TextStats metrics for one string column.

        Parameters
        ----------
        series : pl.Series
            The string column to profile (may contain nulls).
        n_rows : int
            Total row count of the parent frame — the denominator for
            ``empty_ratio`` / ``whitespace_ratio``.
        """
        profile = TextStats()

        if n_rows == 0:
            return profile

        # ── 1. Empty / whitespace ratios (computed over ALL rows, nulls → 0) ──
        # null rows do not count as empty or whitespace-only.
        non_null_mask = series.is_not_null()
        empty_mask = non_null_mask & (series.str.len_chars() == 0)
        stripped = series.str.strip_chars()
        whitespace_mask = non_null_mask & (stripped.str.len_chars() == 0)

        profile.empty_ratio = float(empty_mask.sum()) / n_rows
        profile.whitespace_ratio = float(whitespace_mask.sum()) / n_rows

        # ── 2. Work on non-null values only from here on ─────────────────────
        non_null = series.drop_nulls()
        if non_null.len() == 0:
            return profile

        # ── 3. Token counts (whitespace-split, Polars regex count) ────────────
        # str.count_matches counts non-overlapping matches of r"\S+",
        # which is exactly the set of whitespace-delimited tokens.
        token_counts: pl.Series = non_null.str.count_matches(_TOKEN_PATTERN)

        profile.avg_token_count = float(token_counts.mean())  # type: ignore[arg-type]
        profile.median_token_count = float(token_counts.median())  # type: ignore[arg-type]

        # ── 4. Vocabulary size ───────────────────────────────────────────────
        # BUGFIX: tokens were previously derived via str.split(" "), which
        # splits only on literal spaces and disagrees with the r"\S+" token
        # definition used above (tabs/newlines produced composite "tokens").
        # extract_all(r"\S+") yields exactly the token set that count_matches
        # counts, keeping vocabulary_size consistent with the token counts.
        tokens = non_null.str.extract_all(_TOKEN_PATTERN).explode().drop_nulls()
        profile.vocabulary_size = tokens.n_unique()

        # ── 5. Character-length distribution ─────────────────────────────────
        char_lengths: pl.Series = non_null.str.len_chars().cast(pl.Float64)

        profile.char_length_min = int(char_lengths.min())  # type: ignore[arg-type]
        profile.char_length_max = int(char_lengths.max())  # type: ignore[arg-type]
        profile.char_length_mean = float(char_lengths.mean())  # type: ignore[arg-type]
        profile.char_length_median = float(char_lengths.median())  # type: ignore[arg-type]

        return profile
|