ltc-code 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ltc_code/__init__.py ADDED
@@ -0,0 +1,2 @@
1
def main() -> None:
    """Console-script entry point: print a fixed greeting to stdout."""
    greeting = "Hello from ltc-code!"
    print(greeting)
@@ -0,0 +1,167 @@
1
+ """Utilities for parsing mixed-format string dates with Polars.
2
+
3
+ `parse_date_column(...)` returns expressions that can be used directly inside
4
+ `with_columns(*...)`.
5
+
6
+ `apply_parse_date_column(...)` is a small wrapper around those expressions
7
+ that accepts an eager or lazy frame, prints diagnostics, and conditionally
8
+ keeps or drops the parse-failure flag column.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from typing import Sequence
14
+
15
+ import polars as pl
16
+
17
+
18
# Formats attempted by default. Order matters: the parser keeps the result of
# the first format that matches, so earlier entries win on ambiguous input.
DEFAULT_DATE_FORMATS: tuple[str, ...] = (
    "%Y-%m-%d",  # 2024-01-31
    "%Y/%m/%d",  # 2024/01/31
    "%m/%d/%Y",  # 01/31/2024 (month first)
    "%m-%d-%Y",  # 01-31-2024 (month first)
    "%b %d, %Y",  # Jan 31, 2024
    "%d-%b-%Y",  # 31-Jan-2024
)
26
+
27
+
28
def _coalesced_date_expr(column: str, formats: Sequence[str]) -> pl.Expr:
    """Build one Date expression that tries each format in order.

    Args:
        column: Name of the column to parse. It is cast to ``pl.String``
            before parsing.
        formats: strptime-style format strings, tried in the order given.
            A single bare string is accepted and treated as one format.

    Returns:
        An expression that coalesces one non-strict ``strptime`` attempt per
        format, i.e. the first non-null parse wins.

    Raises:
        ValueError: If ``formats`` is empty.
    """

    # A bare ``str`` is itself a ``Sequence[str]``; without this guard it
    # would be iterated character by character ("%", "Y", "-", ...).
    if isinstance(formats, str):
        candidates: tuple[str, ...] = (formats,)
    else:
        candidates = tuple(formats)

    if not candidates:
        raise ValueError("formats must contain at least one date format")

    source = pl.col(column).cast(pl.String)
    # strict=False makes a failed parse yield null instead of raising, which
    # is what lets coalesce fall through to the next format.
    attempts = [
        source.str.strptime(pl.Date, format=fmt, strict=False)
        for fmt in candidates
    ]
    return pl.coalesce(attempts)
38
+
39
+
40
def _collect_if_lazy(frame: pl.DataFrame | pl.LazyFrame) -> pl.DataFrame:
    """Return an eager DataFrame for either kind of input.

    A ``pl.LazyFrame`` is materialized with ``collect()``; anything else is
    handed back as-is.
    """

    return frame.collect() if isinstance(frame, pl.LazyFrame) else frame
46
+
47
+
48
def parse_date_column(
    column: str,
    *,
    output_column: str | None = None,
    formats: Sequence[str] = DEFAULT_DATE_FORMATS,
    flag_column: str | None = None,
) -> list[pl.Expr]:
    """Build the expressions that parse a mixed-format date column.

    Intended to be splatted into ``with_columns(*...)``.

    Args:
        column: Name of the source column.
        output_column: Name for the parsed column; falls back to ``column``
            when omitted or empty.
        formats: Date formats to try, in order.
        flag_column: Name for the failure flag; falls back to
            ``"<output_column>_parse_failed"`` when omitted or empty.

    Returns:
        A two-element list:
        1. the parsed ``Date`` expression, aliased to the output name;
        2. an ``Int8`` flag expression that is ``1`` wherever the parsed date
           is null, and ``0`` otherwise.

    The flag does not distinguish null inputs from unparseable strings.
    Diagnostics and conditional dropping of the flag need the whole frame and
    are handled by ``apply_parse_date_column(...)``.
    """

    target_name = output_column if output_column else column
    failure_name = flag_column if flag_column else f"{target_name}_parse_failed"

    date_expr = _coalesced_date_expr(column, formats)
    failure_expr = date_expr.is_null().cast(pl.Int8)

    return [date_expr.alias(target_name), failure_expr.alias(failure_name)]
76
+
77
+
78
def apply_parse_date_column(
    df: pl.DataFrame | pl.LazyFrame,
    column: str,
    *,
    output_column: str | None = None,
    formats: Sequence[str] = DEFAULT_DATE_FORMATS,
    flag_column: str | None = None,
    sample_size: int = 5,
) -> pl.DataFrame | pl.LazyFrame:
    """Parse one date column on an eager or lazy frame and print diagnostics.

    Args:
        df: Frame containing ``column``.
        column: Name of the column to parse.
        output_column: Name for the parsed column; falls back to ``column``
            (overwriting it) when omitted or empty.
        formats: Date formats to try, in order.
        flag_column: Name for the failure flag column; falls back to
            ``"<output_column>_parse_failed"`` when omitted or empty.
        sample_size: How many distinct invalid values to print as examples.

    Returns:
        A frame of the same kind as ``df`` with the parsed column added. The
        ``Int8`` failure flag column is kept only when at least one non-null
        input failed to parse; otherwise it is dropped.

    Side effects:
        Prints a summary to stdout. For lazy inputs this triggers at most two
        collects: one single-row summary and, only when there are invalid
        non-null inputs, one query returning the distinct invalid values.
    """

    output_column = output_column or column
    flag_column = flag_column or f"{output_column}_parse_failed"

    # Scratch columns; every one of them is dropped before returning.
    temp_raw_column = f"__{output_column}_raw_input"
    temp_flag_column = f"__{output_column}_parse_failed_tmp"
    temp_null_column = f"__{output_column}_input_was_null_tmp"
    temp_columns = [temp_raw_column, temp_flag_column, temp_null_column]

    # All expressions in one with_columns see the *input* frame, so the raw
    # copy and the null mask capture the original values even when
    # output_column overwrites column.
    working = df.with_columns(
        pl.col(column).cast(pl.String).alias(temp_raw_column),
        *parse_date_column(
            column,
            output_column=output_column,
            formats=formats,
            flag_column=temp_flag_column,
        ),
        pl.col(column).is_null().alias(temp_null_column),
    )

    # One predicate shared by the count and the example query so the two can
    # never disagree about which rows are "invalid non-null".
    failed_non_null = (pl.col(temp_flag_column) == 1) & (~pl.col(temp_null_column))

    summary = _collect_if_lazy(
        working.select(
            pl.len().alias("n_rows"),
            pl.col(output_column).is_not_null().sum().alias("n_parsed_successfully"),
            pl.col(temp_null_column).sum().alias("n_failed_because_null"),
            failed_non_null.sum().alias("n_failed_because_invalid_non_null"),
        )
    ).row(0, named=True)

    n_rows = int(summary["n_rows"])
    n_parsed_successfully = int(summary["n_parsed_successfully"])
    n_failed_because_null = int(summary["n_failed_because_null"])
    n_failed_because_invalid_non_null = int(
        summary["n_failed_because_invalid_non_null"]
    )

    print(
        f"[parse_date_column] '{column}' -> '{output_column}' "
        f"across {n_rows} rows"
    )
    print(f"  parsed successfully: {n_parsed_successfully}")
    print(f"  failed because original value was null: {n_failed_because_null}")
    print(
        "  failed because original value was a non-null invalid date string: "
        f"{n_failed_because_invalid_non_null}"
    )

    if n_failed_because_invalid_non_null == 0:
        print("  no non-null invalid strings detected, so the failure flag was dropped")
        return working.drop(temp_columns)

    # De-duplicate inside the query so a lazy input only materializes the
    # distinct invalid values rather than every failing row.
    unique_invalid = _collect_if_lazy(
        working.filter(failed_non_null)
        .select(pl.col(temp_raw_column).alias("invalid_value"))
        .unique()
    )

    n_unique_invalid = unique_invalid.height
    # Sorting makes the printed sample deterministic.
    invalid_examples = (
        unique_invalid.sort("invalid_value")
        .head(sample_size)
        .get_column("invalid_value")
        .to_list()
    )

    print(f"  unique invalid values: {n_unique_invalid}")
    print(f"  example invalid values: {invalid_examples}")

    return working.with_columns(
        pl.col(temp_flag_column).cast(pl.Int8).alias(flag_column)
    ).drop(temp_columns)
@@ -0,0 +1,7 @@
1
+ Metadata-Version: 2.3
2
+ Name: ltc-code
3
+ Version: 0.1.0
4
+ Summary: Add your description here
5
+ Requires-Python: >=3.11
6
+ Description-Content-Type: text/markdown
7
+
@@ -0,0 +1,6 @@
1
+ ltc_code/__init__.py,sha256=oVeqiMp3EZkL8L2aIdNzn2HjrmB8D06qj46J1b5VjRI,54
2
+ ltc_code/polars_dates.py,sha256=5S6R4fRBig0SdtZ6X2x7NvJEAfS6akVH0pA_Ryd7Z4U,5487
3
+ ltc_code-0.1.0.dist-info/WHEEL,sha256=f5fWSvWsg5Knq5GWa6t1nJIug0Tqo69GqAWD_9LbBKw,81
4
+ ltc_code-0.1.0.dist-info/entry_points.txt,sha256=qgoFyxqsnABOyOi-uLHQAfOk6CwMndlwAhTpbe68Rpw,44
5
+ ltc_code-0.1.0.dist-info/METADATA,sha256=Evsq-aF0XNBzznwltDT7cFpKthYu5uIs8qFtksIUJyo,152
6
+ ltc_code-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: uv 0.11.16
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ ltc-code = ltc_code:main
3
+