dataforge-ml 0.6.0__tar.gz → 0.7.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/PKG-INFO +1 -1
- {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/pyproject.toml +1 -1
- {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_correlation_profiler.py +7 -1
- {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_type_detector.py +71 -96
- {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml.egg-info/PKG-INFO +1 -1
- {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/LICENSE +0 -0
- {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/README.md +0 -0
- {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/setup.cfg +0 -0
- {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/__init__.py +0 -0
- {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/models/__init__.py +0 -0
- {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/models/_data_structure.py +0 -0
- {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/models/_data_types.py +0 -0
- {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/__init__.py +0 -0
- {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_base.py +0 -0
- {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_boolean_config.py +0 -0
- {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_boolean_profiler.py +0 -0
- {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_categorical.py +0 -0
- {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_categorical_config.py +0 -0
- {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_correlation_config.py +0 -0
- {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_datetime_config.py +0 -0
- {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_datetime_profiler.py +0 -0
- {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_missingness_config.py +0 -0
- {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_missingness_profiler.py +0 -0
- {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_numeric_config.py +0 -0
- {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_numeric_profiler.py +0 -0
- {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_tabular.py +0 -0
- {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_target_config.py +0 -0
- {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_target_profiler.py +0 -0
- {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_text_config.py +0 -0
- {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_text_profiler.py +0 -0
- {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/config.py +0 -0
- {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/structural.py +0 -0
- {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/splitting/__init__.py +0 -0
- {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/splitting/_config.py +0 -0
- {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/splitting/_splitter.py +0 -0
- {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/utils/__init__.py +0 -0
- {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/utils/data_loader.py +0 -0
- {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml.egg-info/SOURCES.txt +0 -0
- {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml.egg-info/dependency_links.txt +0 -0
- {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml.egg-info/requires.txt +0 -0
- {dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml.egg-info/top_level.txt +0 -0
{dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_correlation_profiler.py
RENAMED
|
@@ -400,7 +400,13 @@ class CorrelationProfiler(DatasetLevelProfiler[CorrelationProfileResult]):
|
|
|
400
400
|
phi2_corr = max(0.0, phi2 - (r - 1) * (c - 1) / (n - 1))
|
|
401
401
|
r_corr = r - (r - 1) ** 2 / (n - 1)
|
|
402
402
|
c_corr = c - (c - 1) ** 2 / (n - 1)
|
|
403
|
-
|
|
403
|
+
denom = min(r_corr - 1, c_corr - 1)
|
|
404
|
+
if denom <= 0:
|
|
405
|
+
# Near-saturated contingency table (n_unique ≈ n_rows):
|
|
406
|
+
# bias correction collapses denominator; skip the pair.
|
|
407
|
+
pairs.append(CramerVPair(col_a=col_a, col_b=col_b))
|
|
408
|
+
continue
|
|
409
|
+
v = float(np.sqrt(phi2_corr / denom))
|
|
404
410
|
v = max(0.0, min(1.0, v))
|
|
405
411
|
except Exception as exc:
|
|
406
412
|
warnings.warn(
|
|
@@ -35,10 +35,11 @@ _IDENTIFIER_UNIQUE_RATIO = 0.99 # >99 % unique → identifier
|
|
|
35
35
|
_IDENTIFIER_MAX_MEDIAN_LENGTH = 40
|
|
36
36
|
_DISCRETE_NUNIQUE_THRESHOLD = 20 # numeric with <20 unique values → discrete
|
|
37
37
|
|
|
38
|
-
_FREE_TEXT_AVG_WORDS: int =
|
|
39
|
-
_FREE_TEXT_MEDIAN_CHARS: int =
|
|
40
|
-
_FREE_TEXT_P90_CHARS: int =
|
|
38
|
+
_FREE_TEXT_AVG_WORDS: int = 3
|
|
39
|
+
_FREE_TEXT_MEDIAN_CHARS: int = 20
|
|
40
|
+
_FREE_TEXT_P90_CHARS: int = 35
|
|
41
41
|
_FREE_TEXT_MIN_UNIQUE_RATIO: float = 0.40
|
|
42
|
+
_FREE_TEXT_HIGH_UNIQUE_WITH_SPACES: float = 0.70 # unique ratio above which multi-token strings → Text
|
|
42
43
|
|
|
43
44
|
|
|
44
45
|
# Common boolean string values (lowercased)
|
|
@@ -77,115 +78,87 @@ class TypeDetector:
|
|
|
77
78
|
original_dtype=original_dtype,
|
|
78
79
|
inferred_dtype=original_dtype,
|
|
79
80
|
)
|
|
80
|
-
|
|
81
|
-
# Work with a copy that may be re-assigned after coercion
|
|
82
81
|
working = series
|
|
83
82
|
|
|
84
83
|
# 1 & 2: Coercion for string columns
|
|
85
|
-
if series.dtype
|
|
84
|
+
if series.dtype in (pl.Utf8, pl.String):
|
|
86
85
|
coerced, flag = self._try_numeric_coerce(series, n_rows)
|
|
87
86
|
if coerced is not None:
|
|
88
87
|
info.inferred_dtype = str(coerced.dtype)
|
|
89
88
|
info.flags.append(flag) # type: ignore[arg-type]
|
|
90
89
|
working = coerced
|
|
91
|
-
|
|
92
|
-
self._check_coerced_encoded_category(working, info, n_rows)
|
|
90
|
+
self._check_coerced_encoded_category(working, info)
|
|
93
91
|
else:
|
|
94
92
|
coerced_dt, flag_dt = self._try_datetime_coerce(
|
|
95
|
-
series,
|
|
93
|
+
series, n_rows
|
|
96
94
|
)
|
|
97
95
|
if coerced_dt is not None:
|
|
98
96
|
info.inferred_dtype = str(coerced_dt.dtype)
|
|
99
97
|
info.flags.append(flag_dt) # type: ignore[arg-type]
|
|
100
|
-
working = coerced_dt
|
|
101
|
-
|
|
102
98
|
info.semantic_type = SemanticType.Datetime
|
|
103
99
|
results[col_name] = info
|
|
104
100
|
continue
|
|
105
101
|
|
|
106
102
|
# 3: Boolean candidate
|
|
107
103
|
self._check_boolean_candidate(working, info)
|
|
108
|
-
|
|
109
|
-
|
|
104
|
+
if TypeFlag.BooleanCandidate in info.flags:
|
|
105
|
+
info.semantic_type = SemanticType.Boolean
|
|
106
|
+
results[col_name] = info
|
|
107
|
+
continue
|
|
108
|
+
|
|
109
|
+
# Native datetime types
|
|
110
|
+
if working.dtype in (pl.Date, pl.Datetime, pl.Duration, pl.Time) or isinstance(
|
|
111
|
+
working.dtype, pl.Datetime
|
|
112
|
+
):
|
|
113
|
+
info.semantic_type = SemanticType.Datetime
|
|
114
|
+
results[col_name] = info
|
|
115
|
+
continue
|
|
116
|
+
|
|
117
|
+
# 4–7: Numeric path
|
|
110
118
|
if working.dtype in _NUMERIC_DTYPES:
|
|
111
|
-
# 4 & 5: Encoded category and identifier checks — integers only.
|
|
112
|
-
# Continuous floats have high cardinality by nature and are never
|
|
113
|
-
# identifiers; restricting these checks prevents false Identifier
|
|
114
|
-
# classification of genuine numeric features.
|
|
115
119
|
if working.dtype in _INT_DTYPES:
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
info.
|
|
129
|
-
|
|
130
|
-
TypeFlag.IdentifierColumn,
|
|
131
|
-
TypeFlag.SequentialIndex,
|
|
132
|
-
TypeFlag.FloatSequentialIndex,
|
|
133
|
-
)
|
|
134
|
-
):
|
|
120
|
+
# EncodedCategory and IdentifierColumn are mutually exclusive:
|
|
121
|
+
# low-cardinality and near-unique cannot both be true.
|
|
122
|
+
# Check encoded category first; skip identifier if it matches.
|
|
123
|
+
self._check_encoded_category(working, info)
|
|
124
|
+
if TypeFlag.EncodedCategory not in info.flags:
|
|
125
|
+
self._check_identifier(working, info, n_rows)
|
|
126
|
+
if TypeFlag.IdentifierColumn in info.flags:
|
|
127
|
+
self._check_sequential_index(working, info, n_rows)
|
|
128
|
+
|
|
129
|
+
if TypeFlag.EncodedCategory in info.flags:
|
|
130
|
+
info.semantic_type = SemanticType.Categorical
|
|
131
|
+
elif TypeFlag.IdentifierColumn in info.flags:
|
|
132
|
+
info.semantic_type = SemanticType.Identifier
|
|
133
|
+
else:
|
|
135
134
|
self._classify_numeric_kind(working, info)
|
|
135
|
+
info.semantic_type = SemanticType.Numeric
|
|
136
136
|
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
self._check_identifier(working, info, n_rows)
|
|
137
|
+
results[col_name] = info
|
|
138
|
+
continue
|
|
140
139
|
|
|
140
|
+
# String path
|
|
141
|
+
if working.dtype in (pl.Utf8, pl.String):
|
|
141
142
|
self._check_free_text(working, info, n_rows)
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
n_rows
|
|
147
|
-
|
|
148
|
-
|
|
143
|
+
if TypeFlag.FreeTextCandidate in info.flags:
|
|
144
|
+
info.semantic_type = SemanticType.Text
|
|
145
|
+
results[col_name] = info
|
|
146
|
+
continue
|
|
147
|
+
self._check_identifier(working, info, n_rows)
|
|
148
|
+
info.semantic_type = (
|
|
149
|
+
SemanticType.Identifier
|
|
150
|
+
if TypeFlag.IdentifierColumn in info.flags
|
|
151
|
+
else SemanticType.Categorical
|
|
152
|
+
)
|
|
153
|
+
results[col_name] = info
|
|
154
|
+
continue
|
|
155
|
+
|
|
156
|
+
# Fallback
|
|
157
|
+
info.semantic_type = SemanticType.Text
|
|
149
158
|
results[col_name] = info
|
|
150
159
|
|
|
151
160
|
return results
|
|
152
161
|
|
|
153
|
-
@staticmethod
|
|
154
|
-
def _derive_semantic_type(
|
|
155
|
-
info: ColumnTypeInfo,
|
|
156
|
-
working: pl.Series,
|
|
157
|
-
n_rows: int,
|
|
158
|
-
) -> SemanticType:
|
|
159
|
-
if TypeFlag.IdentifierColumn in info.flags:
|
|
160
|
-
return SemanticType.Identifier
|
|
161
|
-
|
|
162
|
-
if TypeFlag.BooleanCandidate in info.flags:
|
|
163
|
-
return SemanticType.Boolean
|
|
164
|
-
|
|
165
|
-
is_native_datetime = working.dtype in (
|
|
166
|
-
pl.Date,
|
|
167
|
-
pl.Datetime,
|
|
168
|
-
pl.Duration,
|
|
169
|
-
pl.Time,
|
|
170
|
-
) or (hasattr(pl, "Datetime") and isinstance(working.dtype, pl.Datetime))
|
|
171
|
-
|
|
172
|
-
if is_native_datetime or TypeFlag.DatetimeCoerced in info.flags:
|
|
173
|
-
return SemanticType.Datetime
|
|
174
|
-
|
|
175
|
-
if TypeFlag.EncodedCategory in info.flags:
|
|
176
|
-
return SemanticType.Categorical
|
|
177
|
-
|
|
178
|
-
if working.dtype in (pl.Utf8, pl.String):
|
|
179
|
-
if TypeFlag.FreeTextCandidate in info.flags:
|
|
180
|
-
return SemanticType.Text
|
|
181
|
-
|
|
182
|
-
return SemanticType.Categorical
|
|
183
|
-
|
|
184
|
-
if working.dtype in _NUMERIC_DTYPES:
|
|
185
|
-
return SemanticType.Numeric
|
|
186
|
-
|
|
187
|
-
return SemanticType.Categorical
|
|
188
|
-
|
|
189
162
|
# ------------------------------------------------------------------
|
|
190
163
|
# Step 1: Numeric coercion
|
|
191
164
|
# ------------------------------------------------------------------
|
|
@@ -221,7 +194,7 @@ class TypeDetector:
|
|
|
221
194
|
|
|
222
195
|
@staticmethod
|
|
223
196
|
def _try_datetime_coerce(
|
|
224
|
-
series: pl.Series,
|
|
197
|
+
series: pl.Series, n_rows: int
|
|
225
198
|
) -> tuple[pl.Series, TypeFlag] | tuple[None, None]:
|
|
226
199
|
"""
|
|
227
200
|
Attempt datetime coercion if the column name looks date-like.
|
|
@@ -269,7 +242,7 @@ class TypeDetector:
|
|
|
269
242
|
|
|
270
243
|
@staticmethod
|
|
271
244
|
def _check_coerced_encoded_category(
|
|
272
|
-
series: pl.Series, info: ColumnTypeInfo
|
|
245
|
+
series: pl.Series, info: ColumnTypeInfo
|
|
273
246
|
) -> None:
|
|
274
247
|
"""
|
|
275
248
|
Post-coercion low-cardinality check for Float64 series that originated
|
|
@@ -312,9 +285,8 @@ class TypeDetector:
|
|
|
312
285
|
|
|
313
286
|
@staticmethod
|
|
314
287
|
def _check_encoded_category(
|
|
315
|
-
series: pl.Series, info: ColumnTypeInfo
|
|
288
|
+
series: pl.Series, info: ColumnTypeInfo
|
|
316
289
|
) -> None:
|
|
317
|
-
# Skip if already flagged as boolean candidate (subset of {0,1})
|
|
318
290
|
if TypeFlag.BooleanCandidate in info.flags:
|
|
319
291
|
return
|
|
320
292
|
|
|
@@ -441,24 +413,27 @@ class TypeDetector:
|
|
|
441
413
|
|
|
442
414
|
char_lengths = non_null.str.len_chars()
|
|
443
415
|
median_chars = float(char_lengths.median() or 0.0)
|
|
416
|
+
space_counts = non_null.str.count_matches(r"\s+")
|
|
417
|
+
median_spaces = float(space_counts.median() or 0.0)
|
|
418
|
+
median_words = median_spaces + 1.0
|
|
419
|
+
unique_ratio = series.n_unique() / n_rows if n_rows > 0 else 0.0
|
|
444
420
|
|
|
445
|
-
|
|
421
|
+
# Multi-word strings of medium length: names, addresses, short descriptions
|
|
422
|
+
if median_chars > _FREE_TEXT_MEDIAN_CHARS and median_spaces >= 1.0:
|
|
446
423
|
info.flags.append(TypeFlag.FreeTextCandidate)
|
|
447
424
|
return
|
|
448
425
|
|
|
449
|
-
|
|
450
|
-
median_words = float(space_counts.median() or 0.0) + 1.0
|
|
451
|
-
|
|
426
|
+
# Long average word count: sentences, paragraphs
|
|
452
427
|
if median_words > _FREE_TEXT_AVG_WORDS:
|
|
453
428
|
info.flags.append(TypeFlag.FreeTextCandidate)
|
|
454
429
|
return
|
|
455
430
|
|
|
456
431
|
p90_chars = float(char_lengths.quantile(0.9) or 0.0)
|
|
432
|
+
if p90_chars > _FREE_TEXT_P90_CHARS and unique_ratio > _FREE_TEXT_MIN_UNIQUE_RATIO:
|
|
433
|
+
info.flags.append(TypeFlag.FreeTextCandidate)
|
|
434
|
+
return
|
|
457
435
|
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
if
|
|
461
|
-
p90_chars > _FREE_TEXT_P90_CHARS
|
|
462
|
-
and unique_ratio > _FREE_TEXT_MIN_UNIQUE_RATIO
|
|
463
|
-
):
|
|
436
|
+
# High-cardinality multi-token strings that don't meet char thresholds:
|
|
437
|
+
# e.g. short full names like "John Smith", compound tokens
|
|
438
|
+
if unique_ratio >= _FREE_TEXT_HIGH_UNIQUE_WITH_SPACES and median_spaces >= 1.0:
|
|
464
439
|
info.flags.append(TypeFlag.FreeTextCandidate)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dataforge_ml-0.6.0 → dataforge_ml-0.7.0}/src/dataforge_ml/profiling/_missingness_profiler.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|