valediction 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff shows the content of publicly released package versions as they appear in their public registries, and is provided for informational purposes only.
- valediction/__init__.py +8 -8
- valediction/convenience.py +45 -50
- valediction/data_types/data_type_helpers.py +75 -75
- valediction/data_types/data_types.py +58 -58
- valediction/data_types/type_inference.py +541 -541
- valediction/datasets/datasets.py +870 -870
- valediction/datasets/datasets_helpers.py +46 -46
- valediction/demo/DEMOGRAPHICS.csv +101 -101
- valediction/demo/DIAGNOSES.csv +650 -650
- valediction/demo/LAB_TESTS.csv +1001 -1001
- valediction/demo/VITALS.csv +1001 -1001
- valediction/demo/__init__.py +6 -6
- valediction/demo/demo_dictionary.py +129 -129
- valediction/dictionary/exporting.py +501 -501
- valediction/dictionary/exporting_helpers.py +371 -371
- valediction/dictionary/generation.py +357 -357
- valediction/dictionary/helpers.py +174 -174
- valediction/dictionary/importing.py +494 -494
- valediction/dictionary/integrity.py +37 -37
- valediction/dictionary/model.py +582 -582
- valediction/exceptions.py +22 -22
- valediction/integrity.py +97 -97
- valediction/io/csv_readers.py +307 -307
- valediction/progress.py +206 -206
- valediction/support.py +72 -72
- valediction/validation/helpers.py +315 -315
- valediction/validation/issues.py +280 -280
- valediction/validation/validation.py +598 -598
- {valediction-1.0.0.dist-info → valediction-1.1.0.dist-info}/METADATA +1 -1
- valediction-1.1.0.dist-info/RECORD +38 -0
- {valediction-1.0.0.dist-info → valediction-1.1.0.dist-info}/WHEEL +1 -1
- valediction-1.0.0.dist-info/RECORD +0 -38
@@ -1,541 +1,541 @@
Every one of the 541 removed lines is re-added verbatim: the hunk rewrites valediction/data_types/type_inference.py without changing its content (likely a rebuild artifact, matching the version-only METADATA/WHEEL/RECORD changes above). The file's source, shown once:

from __future__ import annotations

import re
import warnings

import pandas as pd

from valediction.data_types.data_type_helpers import infer_datetime_format
from valediction.data_types.data_types import DataType
from valediction.integrity import get_config
from valediction.progress import Progress

# ---------- compiled patterns ----------
_INT_RE = re.compile(r"^[+-]?\d+$")
# FLOAT: allow decimals OR integers, plus optional scientific notation
_FLOAT_RE = re.compile(r"^[+-]?(?:\d+\.\d*|\.\d+|\d+)(?:[eE][+-]?\d+)?$")
# integers written as 123, 123.0, 123.
_INT_EQ_RE = re.compile(r"^[+-]?\d+(?:\.0*)?$")
_LEAD0_RE = re.compile(r"^[+-]?0\d+$")
_DATE_HINT_RE = re.compile(r"[-/T]")  # cheap prefilter
COLUMN_STEPS = 8


class ColumnState:
    def __init__(self, name: str) -> None:
        self.name = name
        self.data_type: DataType = DataType.TEXT
        self.nullable: bool = False
        self.max_length: int = 0

        # Locks / disqualifiers
        self.lock_text_due_to_leading_zero: bool = False
        self.lock_text_permanent: bool = False
        self.disqualify_numeric: bool = False
        self.disqualify_datetime: bool = False

        # Datetime speed hint
        self.cached_datetime_format: str | None = None
        self.prefer_date_first: bool = False

    def final_data_type_and_length(self) -> tuple[DataType, int | None]:
        def _len1() -> int:
            return max(1, self.max_length or 0)

        if self.lock_text_due_to_leading_zero or self.lock_text_permanent:
            return DataType.TEXT, _len1()
        if self.data_type == DataType.TEXT:
            return DataType.TEXT, _len1()

        if self.data_type == DataType.INTEGER:
            return DataType.INTEGER, None
        if self.data_type == DataType.FLOAT:
            return DataType.FLOAT, None
        if self.data_type == DataType.DATE:
            return DataType.DATE, None
        if self.data_type == DataType.DATETIME:
            return DataType.DATETIME, None

        return DataType.TEXT, _len1()


class TypeInferer:
    """
    Chunk-friendly type inference with:
    - compiled regex reuse
    - cached datetime formats
    - sticky TEXT on contradictions
    - unified debug logging via __say()
    """

    def __init__(
        self,
        *,
        dayfirst: bool,
        debug: bool = False,
        progress: Progress = None,
    ) -> None:
        config = get_config()
        self.dayfirst = dayfirst
        self.datetime_formats = config.date_formats
        self.null_tokens = {v.strip().lower() for v in config.null_values}
        self.states: dict[str, ColumnState] = {}
        self.debug = debug
        self.progress: Progress = progress
        self.__current_column: str | None = None

    # Inference
    def update_with_chunk(self, df: pd.DataFrame) -> None:
        if df.empty:
            return

        for col in df.columns:
            self.__current_column = col
            self.__begin_step(step="Preparing column")
            series = self._ensure_string_series(df[col])
            state = self.states.setdefault(col, ColumnState(name=col))
            self.__complete_step()  # 1 step

            trimmed, nulls, nonnull_mask, max_len = self._preprocess_column(
                series
            )  # 4 steps
            state.nullable |= bool(nulls.any())
            if max_len is not None and max_len > state.max_length:
                state.max_length = max_len

            if not bool(nonnull_mask.any()):
                self.__complete_step(n=3, save_as="Skipped")
                continue  # nothing to learn in this chunk

            non_nulls = trimmed[nonnull_mask]

            # Hard TEXT locks
            if self._apply_hard_text_locks(state, non_nulls):  # 1 step
                self.__complete_step(n=2, save_as="Skipped")
                continue

            # Datetime fast path
            if self._apply_datetime_fast_path(state, non_nulls):  # 1 step
                self.__complete_step(n=1, save_as="Skipped")
                continue

            # State-specific handling
            _handling_function: callable = {
                DataType.TEXT: self._handle_state_text,
                DataType.DATE: self._handle_state_date,
                DataType.DATETIME: self._handle_state_datetime,
                DataType.INTEGER: self._handle_state_integer,
                DataType.FLOAT: self._handle_state_float,
            }.get(state.data_type, self._handle_state_text)

            _handling_function(state, non_nulls)  # 1 of 5 steps

    # Inference Helpers
    @staticmethod
    def _ensure_string_series(s: pd.Series) -> pd.Series:
        if not pd.api.types.is_string_dtype(s.dtype):
            return s.astype("string")
        return s

    def _preprocess_column(
        self, s: pd.Series
    ) -> tuple[pd.Series, pd.Series, pd.Series, int | None]:
        self.__begin_step(step="Trimming whitespace")
        trimmed = s.str.strip()
        self.__complete_step()

        self.__begin_step(step="Checking nulls")
        nulls = trimmed.isna() | trimmed.str.lower().isin(self.null_tokens)
        self.__complete_step()

        self.__begin_step(step="Checking max length")
        lengths = s.str.len()
        max_len = int(lengths.max(skipna=True)) if lengths.notna().any() else None
        self.__complete_step()

        self.__begin_step(step="Setting non-null mask")
        nonnull_mask = (~nulls) & s.notna()
        self.__complete_step()

        return trimmed, nulls, nonnull_mask, max_len

    # Early Locks
    @staticmethod
    def _looks_dateish(nn: pd.Series) -> bool:
        return bool(nn.str.contains(_DATE_HINT_RE).any())

    @staticmethod
    def _has_leading_zero(nn: pd.Series) -> bool:
        return bool(nn.str.match(_LEAD0_RE, na=False).any())

    def _apply_hard_text_locks(self, st: ColumnState, nn: pd.Series) -> bool:
        if st.lock_text_due_to_leading_zero or st.lock_text_permanent:
            self._transition(st, DataType.TEXT, "locked to TEXT")
            self.__complete_step()
            return True

        if self._has_leading_zero(nn):
            self._debug_leading_zero_examples(st, nn)
            st.lock_text_due_to_leading_zero = True
            self._transition(st, DataType.TEXT, "leading-zero integer tokens")
            self.__complete_step()
            return True

        self.__complete_step()
        return False

    def _apply_datetime_fast_path(self, st: ColumnState, nn: pd.Series) -> bool:
        self.__begin_step(step="Applying datetime locks")

        # Cached single format
        if st.cached_datetime_format is not None:
            ok, has_time = self._parse_with_cached_format(nn, st.cached_datetime_format)
            if ok.all():
                self._transition(
                    st,
                    DataType.DATETIME if has_time.any() else DataType.DATE,
                    f"cached datetime format={st.cached_datetime_format!r}",
                )
                self.__complete_step()
                return True

            st.cached_datetime_format = None
            st.prefer_date_first = False

        # Date-first hint (explicit formats)
        if st.prefer_date_first and not st.disqualify_datetime:
            for fmt in self.datetime_formats:
                ok, has_time = self._parse_with_cached_format(nn, fmt)
                if ok.all():
                    st.cached_datetime_format = fmt
                    self._transition(
                        st,
                        DataType.DATETIME if has_time.any() else DataType.DATE,
                        f"explicit datetime format={fmt!r}",
                    )
                    self.__complete_step()
                    return True

        self.__complete_step()
        return False

    # State Handlers
    def _handle_state_text(self, st: ColumnState, nn: pd.Series) -> None:
        self.__begin_step(step="Handling text")
        # DATETIME attempt
        if not st.disqualify_datetime and self._looks_dateish(nn):
            if self._try_parse_datetime_then_cache(st, nn):
                self.__complete_step()
                return

        # NUMERIC attempt
        if not st.disqualify_numeric:
            int_equiv = nn.str.fullmatch(_INT_EQ_RE, na=False)
            float_like = nn.str.fullmatch(_FLOAT_RE, na=False)

            if int_equiv.all():
                self._transition(st, DataType.INTEGER, "all integer-equivalent")
                self.__complete_step()
                return

            if (int_equiv | float_like).all():
                self._debug_float_promotion(st, nn, int_equiv, float_like)
                self._transition(st, DataType.FLOAT, "mixed numeric (int/float)")
                self.__complete_step()
                return

            # Otherwise: non-numeric → TEXT (sticky)
            self._debug_offenders_numeric(st, nn, int_equiv, float_like)
            st.disqualify_numeric = True
            st.lock_text_permanent = True
            self._transition(st, DataType.TEXT, "non-numeric tokens present")
            self.__complete_step()
            return

        # If both numeric and datetime are disqualified, permanently TEXT
        if st.disqualify_numeric and st.disqualify_datetime:
            st.lock_text_permanent = True
            self._transition(
                st, DataType.TEXT, "both numeric and datetime disqualified"
            )
        self.__complete_step()

    def _handle_state_date(self, st: ColumnState, nn: pd.Series) -> None:
        self.__begin_step(step="Handling dates")
        if not self._looks_dateish(nn):
            st.disqualify_datetime = True
            st.lock_text_permanent = True
            self._transition(st, DataType.TEXT, "lost date-ish pattern")
            self.__complete_step()
            return

        ok, has_time = self._datetime_parse_ok(nn)
        if not ok.all():
            self._debug_offenders_datetime(st, nn, ok)
            st.disqualify_datetime = True
            st.lock_text_permanent = True
            self._transition(st, DataType.TEXT, "datetime parse failures")
        elif has_time.any():
            self._transition(st, DataType.DATETIME, "time component detected")

        self.__complete_step()

    def _handle_state_datetime(self, st: ColumnState, nn: pd.Series) -> None:
        self.__begin_step(step="Handling datetimes")
        if not self._looks_dateish(nn):
            st.disqualify_datetime = True
            st.lock_text_permanent = True
            self._transition(st, DataType.TEXT, "lost date-ish pattern")
            self.__complete_step()
            return

        ok, _ = self._datetime_parse_ok(nn)
        if not ok.all():
            self._debug_offenders_datetime(st, nn, ok)
            st.disqualify_datetime = True
            st.lock_text_permanent = True
            self._transition(st, DataType.TEXT, "datetime parse failures")

        self.__complete_step()

    def _handle_state_integer(self, st: ColumnState, nn: pd.Series) -> None:
        self.__begin_step(step="Handling integers")
        int_equiv = nn.str.fullmatch(_INT_EQ_RE, na=False)
        float_like = nn.str.fullmatch(_FLOAT_RE, na=False)

        if not (int_equiv | float_like).all():
            self._debug_offenders_numeric(st, nn, int_equiv, float_like)
            st.disqualify_numeric = True
            st.lock_text_permanent = True
            self._transition(st, DataType.TEXT, "non-numeric tokens introduced")
        elif float_like.any() and not int_equiv.all():
            self._debug_float_promotion(st, nn, int_equiv, float_like)
            self._transition(st, DataType.FLOAT, "decimals/scientific detected")

        self.__complete_step()
        # else remain INTEGER

    def _handle_state_float(self, st: ColumnState, nn: pd.Series) -> None:
        self.__begin_step(step="Handling floats")
        int_like = nn.str.fullmatch(_INT_RE, na=False)
        fl_like = nn.str.fullmatch(_FLOAT_RE, na=False)
        if not (int_like | fl_like).all():
            self._debug_offenders_numeric(st, nn, int_like, fl_like)
            st.disqualify_numeric = True
            st.lock_text_permanent = True
            self._transition(st, DataType.TEXT, "non-numeric tokens introduced")
        self.__complete_step()

    # Datetime Parsing
    def _try_parse_datetime_then_cache(self, st: ColumnState, nn: pd.Series) -> bool:
        # 1) If we’ve already cached a format, try it fast
        if st.cached_datetime_format is not None:
            ok, has_time = self._parse_with_cached_format(nn, st.cached_datetime_format)
            if ok.all():
                self._transition(
                    st,
                    DataType.DATETIME if has_time.any() else DataType.DATE,
                    f"cached datetime format={st.cached_datetime_format!r}",
                )
                return True
            # cache failed on this chunk; clear and fall through to re-infer once
            st.cached_datetime_format = None
            st.prefer_date_first = False

        # 2) Infer with the new helper (efficient: unique, batched, intersects across slices)
        # Work on uniques only for speed and stability.
        uniq = (
            nn.astype("string", copy=False)
            .str.strip()
            .replace("", pd.NA)
            .dropna()
            .unique()
        )
        if len(uniq) == 0:
            return False

        try:
            fmt_or_false = infer_datetime_format(pd.Series(uniq, dtype="string"))
        except ValueError as e:
            # ambiguous after scanning – treat as “can’t determine” and disqualify
            self.__say(f"[{st.name}] datetime ambiguous: {e}")
            st.disqualify_datetime = True
            return False

        if fmt_or_false is False:
            # helper couldn’t find any valid explicit format
            st.disqualify_datetime = True
            self._transition(
                st, DataType.TEXT, "datetime helper found no matching format"
            )
            return False

        # 3) Cache and confirm on current (non-unique) values
        st.cached_datetime_format = fmt_or_false
        st.prefer_date_first = True
        ok, has_time = self._parse_with_cached_format(nn, st.cached_datetime_format)
        if ok.all():
            self._transition(
                st,
                DataType.DATETIME if has_time.any() else DataType.DATE,
                f"explicit datetime format={st.cached_datetime_format!r}",
            )
            return True

        self.__say(
            f"[{st.name}] cached format failed on live slice; disqualifying datetime."
        )
        st.cached_datetime_format = None
        st.disqualify_datetime = True
        return False

    def _parse_with_cached_format(
        self, s: pd.Series, fmt: str
    ) -> tuple[pd.Series, pd.Series]:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", UserWarning)
            parsed = pd.to_datetime(s, format=fmt, errors="coerce", utc=False)

        ok = parsed.notna()
        has_time = ok & (
            (parsed.dt.hour != 0)
            | (parsed.dt.minute != 0)
            | (parsed.dt.second != 0)
            | (parsed.dt.microsecond != 0)
        )
        return ok, has_time

    def _datetime_parse_ok(self, s: pd.Series) -> tuple[pd.Series, pd.Series]:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", UserWarning)
            parsed = pd.to_datetime(
                s, errors="coerce", dayfirst=self.dayfirst, utc=False
            )

        ok = parsed.notna()
        has_time = ok & (
            (parsed.dt.hour != 0)
            | (parsed.dt.minute != 0)
            | (parsed.dt.second != 0)
            | (parsed.dt.microsecond != 0)
        )
        return ok, has_time

    # Debug/Log
    def __say(self, *values: object, sep: str = " ", end: str = "\n") -> None:
        if self.debug:
            print("TypeInferer:", *values, sep=sep, end=end)

    def _transition(self, st: ColumnState, to_type: DataType, reason: str) -> None:
        """Set st.data_type and emit a standardised debug line if changed."""
        from_type = st.data_type
        st.data_type = to_type
        if self.debug:
            if from_type != to_type:
                self.__say(f"[{st.name}] {from_type.name} → {to_type.name} ({reason})")
            else:
                self.__say(f"[{st.name}] stays {to_type.name} ({reason})")

    def _fmt_examples(
        self,
        vc: pd.Series,
        *,
        max_examples: int = 5,
        max_value_len: int = 80,
    ) -> str:
        shown = vc.head(max_examples)
        parts: list[str] = []
        for val in shown.index:
            s = repr(val)
            if len(s) > max_value_len:
                s = s[: max_value_len - 1] + "…"
            parts.append(s)
        extra = vc.shape[0] - shown.shape[0]
        suffix = f"; …+{extra}" if extra > 0 else ""
        return "[" + "; ".join(parts) + suffix + "]"

    def _debug_offenders_numeric(
        self,
        st: ColumnState,
        nn: pd.Series,
        int_like: pd.Series,
        float_like: pd.Series,
        *,
        max_examples: int = 5,
        note: str = "non-numeric present",
    ) -> None:
        if not self.debug:
            return
        bad = ~(int_like | float_like)
        if not bool(bad.any()):
            return
        vc = nn[bad].value_counts(dropna=False)
        examples = self._fmt_examples(vc, max_examples=max_examples)
        self.__say(f"[{st.name}] numeric disqualified: {note}. Examples {examples}")

    def _debug_offenders_datetime(
        self,
        st: ColumnState,
        nn: pd.Series,
        ok_mask: pd.Series,
        *,
        max_examples: int = 5,
    ) -> None:
        if not self.debug:
            return
        bad = ~ok_mask
        if not bool(bad.any()):
            return
        vc = nn[bad].value_counts(dropna=False)
        examples = self._fmt_examples(vc, max_examples=max_examples)
        self.__say(f"[{st.name}] datetime disqualified. Examples {examples}")

    def _debug_leading_zero_examples(
        self,
        st: ColumnState,
        nn: pd.Series,
        *,
        max_examples: int = 5,
    ) -> None:
        if not self.debug:
            return
        m = nn.str.match(_LEAD0_RE, na=False)
        if not bool(m.any()):
            return
        vc = nn[m].value_counts(dropna=False)
        examples = self._fmt_examples(vc, max_examples=max_examples)
        self.__say(f"[{st.name}] leading-zero lock. Examples {examples}")

    def _debug_float_promotion(
        self,
        st: ColumnState,
        nn: pd.Series,
        int_equiv: pd.Series,
        float_like: pd.Series,
        *,
        max_examples: int = 5,
    ) -> None:
        if not self.debug:
            return
        non_integer_numeric = float_like & ~int_equiv
        if not bool(non_integer_numeric.any()):
            self.__say(f"[{st.name}] promoted to FLOAT.")
            return
        sample = nn[non_integer_numeric]
        reasons = []
        if bool(sample.str.contains(r"\.", na=False).any()):
            reasons.append("decimals present")
        if bool(sample.str.contains(r"[eE][+-]?\d+", na=False).any()):
            reasons.append("scientific notation present")
        reason_msg = (": " + ", ".join(reasons)) if reasons else ""
        vc = sample.value_counts(dropna=False)
        examples = self._fmt_examples(vc, max_examples=max_examples)
        self.__say(f"[{st.name}] promoted to FLOAT{reason_msg}. Examples {examples}")

    def __begin_step(self, step: str):
        self.progress.begin_step(
            step=step, alt_postfix=f"{self.__current_column}: {step}"
        )

    def __complete_step(self, n: int = 1, save_as: str = None):
        self.progress.complete_step(n=n, save_as=save_as)
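
For orientation, a minimal sketch of how this class appears intended to be driven, based only on the API visible above. The CSV path and the stub progress object are hypothetical: TypeInferer calls progress.begin_step/progress.complete_step unconditionally, so even though the parameter defaults to None, some object with those two methods (normally a valediction.progress.Progress) has to be supplied.

from __future__ import annotations

import pandas as pd

from valediction.data_types.type_inference import TypeInferer


class StubProgress:
    # Hypothetical stand-in for valediction.progress.Progress; TypeInferer
    # only ever calls these two methods, always with keyword arguments.
    def begin_step(self, step: str, alt_postfix: str | None = None) -> None:
        pass

    def complete_step(self, n: int = 1, save_as: str | None = None) -> None:
        pass


inferer = TypeInferer(dayfirst=False, debug=True, progress=StubProgress())

# Feed the file chunk by chunk; per-column ColumnState accumulates across chunks.
for chunk in pd.read_csv("data.csv", dtype="string", chunksize=50_000):  # illustrative path
    inferer.update_with_chunk(chunk)

# Resolve each column's final decision.
for name, state in inferer.states.items():
    data_type, max_length = state.final_data_type_and_length()
    print(name, data_type.name, max_length)

Note the sticky design: once a column hits a contradiction (leading-zero tokens, non-numeric values, or datetime parse failures), lock_text_permanent or lock_text_due_to_leading_zero pins it to TEXT for all later chunks, so chunk order cannot flip a decision back.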