ltc-code 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ltc_code-0.1.0/PKG-INFO
ADDED
ltc_code-0.1.0/README.md
ADDED
|
File without changes
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
[project]
name = "ltc-code"
version = "0.1.0"
description = "Utilities for parsing mixed-format string dates with Polars"
readme = "README.md"
requires-python = ">=3.11"
# The date-parsing module shipped in this distribution imports polars, so it
# has to be declared as a runtime dependency.
# NOTE(review): no lower bound is pinned here - confirm the minimum polars
# version that provides pl.String and pl.len, both of which the module uses.
dependencies = ["polars"]
|
|
8
|
+
|
|
9
|
+
[project.scripts]
|
|
10
|
+
ltc-code = "ltc_code:main"
|
|
11
|
+
|
|
12
|
+
[build-system]
|
|
13
|
+
requires = ["uv_build>=0.11.16,<0.12.0"]
|
|
14
|
+
build-backend = "uv_build"
|
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
"""Utilities for parsing mixed-format string dates with Polars.
|
|
2
|
+
|
|
3
|
+
`parse_date_column(...)` returns expressions that can be used directly inside
|
|
4
|
+
`with_columns(*...)`.
|
|
5
|
+
|
|
6
|
+
`apply_parse_date_column(...)` is a small eager wrapper around those
|
|
7
|
+
expressions that prints diagnostics and conditionally keeps or drops the
|
|
8
|
+
parse-failure flag column.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from typing import Sequence
|
|
14
|
+
|
|
15
|
+
import polars as pl
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# Formats tried, in this order, when the caller does not supply a list. The
# first format that parses a value wins. There is no day-first numeric
# pattern, so slash- or dash-separated dates that start with a two-digit field
# are read month-first.
DEFAULT_DATE_FORMATS: tuple[str, ...] = (
    "%Y-%m-%d",  # e.g. 2024-01-31
    "%Y/%m/%d",  # e.g. 2024/01/31
    "%m/%d/%Y",  # month first, e.g. 01/31/2024
    "%m-%d-%Y",  # month first, e.g. 01-31-2024
    "%b %d, %Y",  # abbreviated month name, e.g. Jan 31, 2024
    "%d-%b-%Y",  # day first with abbreviated month name, e.g. 31-Jan-2024
)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _coalesced_date_expr(column: str, formats: Sequence[str]) -> pl.Expr:
    """Build one Date expression that tries each format in order.

    Args:
        column: Name of the column to parse; it is cast to String first.
        formats: strptime-style format strings, tried in the given order. A
            bare string is treated as a single format instead of being
            iterated character by character.

    Returns:
        An expression that yields, per row, the first non-null parse result,
        or null when none of the formats match.

    Raises:
        ValueError: If ``formats`` is empty.
    """

    # A lone string is itself a sequence of characters; wrap it so that
    # ``formats="%Y-%m-%d"`` means one format, not nine one-character formats.
    if isinstance(formats, str):
        formats = (formats,)
    formats = tuple(formats)
    if not formats:
        raise ValueError("formats must contain at least one date format")

    source = pl.col(column).cast(pl.String)
    # strict=False turns values that do not match a format into nulls, which
    # lets coalesce fall through to the next candidate format.
    return pl.coalesce(
        [source.str.strptime(pl.Date, format=fmt, strict=False) for fmt in formats]
    )
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _collect_if_lazy(frame: pl.DataFrame | pl.LazyFrame) -> pl.DataFrame:
    """Return an eager DataFrame.

    A LazyFrame is collected; an eager DataFrame is handed back untouched.
    """

    return frame.collect() if isinstance(frame, pl.LazyFrame) else frame
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def parse_date_column(
    column: str,
    *,
    output_column: str | None = None,
    formats: Sequence[str] = DEFAULT_DATE_FORMATS,
    flag_column: str | None = None,
) -> list[pl.Expr]:
    """Build the expressions that parse a mixed-format date column.

    The result is meant to be splatted into `with_columns(*...)` and always
    holds two expressions, in this order:

    1. the parsed `Date` column, named `output_column` (falls back to
       `column`)
    2. an Int8 flag, named `flag_column` (falls back to
       `<output_column>_parse_failed`), that is `1` wherever the parsed date
       is null and `0` otherwise

    Printing diagnostics and dropping the flag when only null inputs failed
    is handled by `apply_parse_date_column(...)`, because that needs the
    whole frame rather than just expressions.
    """

    target_name = output_column if output_column else column
    failure_name = flag_column if flag_column else f"{target_name}_parse_failed"

    date_expr = _coalesced_date_expr(column, formats)
    failure_expr = date_expr.is_null().cast(pl.Int8)

    return [date_expr.alias(target_name), failure_expr.alias(failure_name)]
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def apply_parse_date_column(
    df: pl.DataFrame | pl.LazyFrame,
    column: str,
    *,
    output_column: str | None = None,
    formats: Sequence[str] = DEFAULT_DATE_FORMATS,
    flag_column: str | None = None,
    sample_size: int = 5,
) -> pl.DataFrame | pl.LazyFrame:
    """Parse one date column on an eager or lazy frame and print diagnostics.

    Args:
        df: Frame holding ``column``.
        column: Name of the column to parse.
        output_column: Name for the parsed Date column; defaults to
            ``column``, which replaces the source column.
        formats: Date formats to try, in order.
        flag_column: Name for the failure flag; defaults to
            ``<output_column>_parse_failed``.
        sample_size: How many distinct invalid strings to print as examples.

    Returns:
        A frame of the same kind as ``df`` (eager in, eager out; lazy in,
        lazy out) with the parsed column added. The Int8 failure flag is kept
        only when at least one non-null input failed to parse; when every
        failure came from a null input the flag is dropped.

    Diagnostics are written to stdout in either case. For lazy inputs the
    function collects a one-row summary and, only when invalid strings exist,
    the distinct invalid values; the full frame is never collected here.
    """

    output_column = output_column or column
    flag_column = flag_column or f"{output_column}_parse_failed"

    temp_raw_column = f"__{output_column}_raw_input"
    temp_flag_column = f"__{output_column}_parse_failed_tmp"
    temp_null_column = f"__{output_column}_input_was_null_tmp"
    temp_columns = [temp_raw_column, temp_flag_column, temp_null_column]

    # Every expression in this with_columns call sees the ORIGINAL frame, so
    # the raw-string and null snapshots stay correct even when the parsed
    # column overwrites ``column``.
    working = df.with_columns(
        pl.col(column).cast(pl.String).alias(temp_raw_column),
        *parse_date_column(
            column,
            output_column=output_column,
            formats=formats,
            flag_column=temp_flag_column,
        ),
        pl.col(column).is_null().alias(temp_null_column),
    )

    summary = _collect_if_lazy(
        working.select(
            pl.len().alias("n_rows"),
            pl.col(output_column).is_not_null().sum().alias("n_parsed_successfully"),
            pl.col(temp_null_column).sum().alias("n_failed_because_null"),
            (
                (pl.col(temp_flag_column) == 1) & (~pl.col(temp_null_column))
            ).sum().alias("n_failed_because_invalid_non_null"),
        )
    ).row(0, named=True)

    n_rows = int(summary["n_rows"])
    n_parsed_successfully = int(summary["n_parsed_successfully"])
    n_failed_because_null = int(summary["n_failed_because_null"])
    n_failed_because_invalid_non_null = int(
        summary["n_failed_because_invalid_non_null"]
    )

    print(
        f"[parse_date_column] '{column}' -> '{output_column}' "
        f"across {n_rows} rows"
    )
    print(f"  parsed successfully: {n_parsed_successfully}")
    print(f"  failed because original value was null: {n_failed_because_null}")
    print(
        "  failed because original value was a non-null invalid date string: "
        f"{n_failed_because_invalid_non_null}"
    )

    if n_failed_because_invalid_non_null > 0:
        # Deduplicate inside the query so that a lazy input materialises only
        # the distinct invalid strings, not every failing row.
        unique_invalid = _collect_if_lazy(
            working.filter(
                (pl.col(temp_flag_column) == 1) & pl.col(temp_raw_column).is_not_null()
            )
            .select(pl.col(temp_raw_column).alias("invalid_value"))
            .unique()
        )

        # Nulls were filtered out above, so the number of rows left after
        # deduplication is the number of distinct invalid values.
        n_unique_invalid = int(unique_invalid.height)
        invalid_examples = (
            unique_invalid.sort("invalid_value")
            .head(sample_size)
            .get_column("invalid_value")
            .to_list()
        )

        print(f"  unique invalid values: {n_unique_invalid}")
        print(f"  example invalid values: {invalid_examples}")

        # Promote the temporary flag to its public name before removing the
        # scratch columns.
        return working.with_columns(
            pl.col(temp_flag_column).cast(pl.Int8).alias(flag_column)
        ).drop(temp_columns)

    print("  no non-null invalid strings detected, so the failure flag was dropped")
    return working.drop(temp_columns)
|