polars_sgt-0.1.0-cp39-abi3-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- polars_sgt/.mypy.ini +2 -0
- polars_sgt/__init__.py +35 -0
- polars_sgt/_internal.pyd +0 -0
- polars_sgt/_internal.pyi +1 -0
- polars_sgt/functions.py +859 -0
- polars_sgt/namespace.py +23 -0
- polars_sgt/py.typed +0 -0
- polars_sgt/ranges.py +162 -0
- polars_sgt/typing.py +17 -0
- polars_sgt/utils.py +49 -0
- polars_sgt-0.1.0.dist-info/METADATA +205 -0
- polars_sgt-0.1.0.dist-info/RECORD +14 -0
- polars_sgt-0.1.0.dist-info/WHEEL +4 -0
- polars_sgt-0.1.0.dist-info/licenses/LICENSE +21 -0
polars_sgt/functions.py
ADDED
@@ -0,0 +1,859 @@
from __future__ import annotations

import sys
from datetime import date
from pathlib import Path
from typing import TYPE_CHECKING, Literal

import polars as pl
from polars.plugins import register_plugin_function

from polars_sgt.utils import parse_into_expr

if sys.version_info >= (3, 10):
    from typing import TypeAlias
else:
    from typing_extensions import TypeAlias

if TYPE_CHECKING:
    from collections.abc import Sequence

    from polars import Expr

    from polars_sgt.typing import IntoExprColumn

Ambiguous: TypeAlias = Literal["earliest", "latest", "raise", "null"]

RollStrategy: TypeAlias = Literal["raise", "forward", "backward"]


PLUGIN_PATH = Path(__file__).parent

mapping = {"Mon": 1, "Tue": 2, "Wed": 3, "Thu": 4, "Fri": 5, "Sat": 6, "Sun": 7}
reverse_mapping = {value: key for key, value in mapping.items()}

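# Note: the integers above follow ISO weekday numbering (Mon=1 ... Sun=7),
# which is what `Expr.dt.weekday` returns, so `mapping`/`reverse_mapping`
# translate between day names and the values `is_workday` compares against.
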
def get_weekmask(weekend: Sequence[str]) -> list[bool]:
    if weekend == ("Sat", "Sun"):
        weekmask = [True, True, True, True, True, False, False]
    else:
        weekmask = [reverse_mapping[i] not in weekend for i in range(1, 8)]
    if sum(weekmask) == 0:
        msg = f"At least one day of the week must be a business day. Got weekend={weekend}"
        raise ValueError(msg)
    return weekmask

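# Illustrative sketch (not in the packaged source): with the mapping above, a
# Friday/Saturday weekend yields a Mon..Sun mask of
#
#     get_weekmask(("Fri", "Sat"))  # -> [True, True, True, True, False, False, True]
#
# and passing all seven day names as `weekend` raises the ValueError.
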
def is_workday(
    expr: IntoExprColumn,
    *,
    weekend: Sequence[str] = ("Sat", "Sun"),
    holidays: Sequence[date] | None = None,
) -> pl.Expr:
    """
    Determine whether a day is a workday.

    Parameters
    ----------
    expr
        Input expression.
    weekend
        The days of the week that are considered weekends. Defaults to ("Sat", "Sun").
    holidays
        The holidays to exclude from the calculation. Defaults to None. This should
        be a list of ``datetime.date`` objects.

    Returns
    -------
    polars.Expr

    Examples
    --------
    >>> from datetime import date
    >>> import polars as pl
    >>> import polars_sgt as sgt
    >>> df = pl.DataFrame(
    ...     {
    ...         "date": [
    ...             date(2023, 1, 4),
    ...             date(2023, 5, 1),
    ...             date(2023, 9, 9),
    ...         ],
    ...     }
    ... )
    >>> df.with_columns(is_workday=sgt.is_workday("date"))
    shape: (3, 2)
    ┌────────────┬────────────┐
    │ date       ┆ is_workday │
    │ ---        ┆ ---        │
    │ date       ┆ bool       │
    ╞════════════╪════════════╡
    │ 2023-01-04 ┆ true       │
    │ 2023-05-01 ┆ true       │
    │ 2023-09-09 ┆ false      │
    └────────────┴────────────┘

    """
    expr = parse_into_expr(expr)
    weekend_int = [mapping[x] for x in weekend]
    if holidays is not None:
        return ~(
            expr.dt.date().is_in(holidays)
            | expr.dt.weekday().is_in(weekend_int)
        )
    return ~expr.dt.weekday().is_in(weekend_int)

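# Illustrative sketch (not in the packaged source; assumes
# `import polars_sgt as sgt` as in the docstrings): both keyword arguments
# combined, e.g. a Friday/Saturday weekend with one extra holiday:
#
#     df.with_columns(
#         is_workday=sgt.is_workday(
#             "date", weekend=("Fri", "Sat"), holidays=[date(2023, 1, 4)]
#         )
#     )
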
def from_local_datetime(
    expr: IntoExprColumn,
    from_tz: str | Expr,
    to_tz: str,
    ambiguous: Ambiguous = "raise",
) -> pl.Expr:
    """
    Convert from local datetime in a given time zone to a new time zone.

    Parameters
    ----------
    expr
        Expression to convert.
    from_tz
        Current time zone of each datetime.
    to_tz
        Time zone to convert to.
    ambiguous
        Determine how to deal with ambiguous datetimes:

        - `'raise'` (default): raise
        - `'earliest'`: use the earliest datetime
        - `'latest'`: use the latest datetime
        - `'null'`: set to null

    Returns
    -------
    Expr
        Expression of data type :class:`DateTime`.

    Examples
    --------
    You can go from a localized datetime back to expressing the datetimes
    in a single time zone with `from_local_datetime`.

    >>> from datetime import datetime
    >>> import polars as pl
    >>> import polars_sgt as sgt
    >>> df = pl.DataFrame(
    ...     {
    ...         "local_dt": [
    ...             datetime(2020, 10, 10, 1),
    ...             datetime(2020, 10, 10, 2),
    ...             datetime(2020, 10, 9, 20),
    ...         ],
    ...         "timezone": [
    ...             "Europe/London",
    ...             "Africa/Kigali",
    ...             "America/New_York",
    ...         ],
    ...     }
    ... )
    >>> df.with_columns(
    ...     sgt.from_local_datetime(
    ...         "local_dt", pl.col("timezone"), "UTC"
    ...     ).alias("date")
    ... )
    shape: (3, 3)
    ┌─────────────────────┬──────────────────┬─────────────────────────┐
    │ local_dt            ┆ timezone         ┆ date                    │
    │ ---                 ┆ ---              ┆ ---                     │
    │ datetime[μs]        ┆ str              ┆ datetime[μs, UTC]       │
    ╞═════════════════════╪══════════════════╪═════════════════════════╡
    │ 2020-10-10 01:00:00 ┆ Europe/London    ┆ 2020-10-10 00:00:00 UTC │
    │ 2020-10-10 02:00:00 ┆ Africa/Kigali    ┆ 2020-10-10 00:00:00 UTC │
    │ 2020-10-09 20:00:00 ┆ America/New_York ┆ 2020-10-10 00:00:00 UTC │
    └─────────────────────┴──────────────────┴─────────────────────────┘

    """
    expr = parse_into_expr(expr)
    from_tz = parse_into_expr(from_tz, str_as_lit=True)
    return register_plugin_function(
        plugin_path=PLUGIN_PATH,
        function_name="from_local_datetime",
        is_elementwise=True,
        args=[expr, from_tz],
        kwargs={
            "to_tz": to_tz,
            "ambiguous": ambiguous,
        },
    )

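# Illustrative sketch (not in the packaged source): resolving a DST fold
# instead of raising, for local datetimes that occur twice in their zone:
#
#     sgt.from_local_datetime(
#         "local_dt", pl.col("timezone"), "UTC", ambiguous="earliest"
#     )
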
def to_local_datetime(
    expr: IntoExprColumn,
    time_zone: str | Expr,
) -> pl.Expr:
    """
    Convert to local datetime in a given time zone.

    Parameters
    ----------
    expr
        Expression to convert.
    time_zone
        Time zone to convert to.

    Returns
    -------
    Expr
        Expression of data type :class:`DateTime`.

    Examples
    --------
    You can use `to_local_datetime` to figure out how a tz-aware datetime
    will be expressed as a local datetime.

    >>> from datetime import datetime
    >>> import polars as pl
    >>> import polars_sgt as sgt
    >>> df = pl.DataFrame(
    ...     {
    ...         "date_col": [datetime(2020, 10, 10)] * 3,
    ...         "timezone": [
    ...             "Europe/London",
    ...             "Africa/Kigali",
    ...             "America/New_York",
    ...         ],
    ...     }
    ... ).with_columns(pl.col("date_col").dt.replace_time_zone("UTC"))
    >>> df.with_columns(
    ...     sgt.to_local_datetime("date_col", pl.col("timezone")).alias(
    ...         "local_dt"
    ...     )
    ... )
    shape: (3, 3)
    ┌─────────────────────────┬──────────────────┬─────────────────────┐
    │ date_col                ┆ timezone         ┆ local_dt            │
    │ ---                     ┆ ---              ┆ ---                 │
    │ datetime[μs, UTC]       ┆ str              ┆ datetime[μs]        │
    ╞═════════════════════════╪══════════════════╪═════════════════════╡
    │ 2020-10-10 00:00:00 UTC ┆ Europe/London    ┆ 2020-10-10 01:00:00 │
    │ 2020-10-10 00:00:00 UTC ┆ Africa/Kigali    ┆ 2020-10-10 02:00:00 │
    │ 2020-10-10 00:00:00 UTC ┆ America/New_York ┆ 2020-10-09 20:00:00 │
    └─────────────────────────┴──────────────────┴─────────────────────┘

    """
    expr = parse_into_expr(expr)
    time_zone = parse_into_expr(time_zone, str_as_lit=True)
    return register_plugin_function(
        plugin_path=PLUGIN_PATH,
        function_name="to_local_datetime",
        is_elementwise=True,
        args=[expr, time_zone],
    )

def format_localized(
    expr: IntoExprColumn,
    format: str,  # noqa: A002
    locale: str = "uk_UA",
) -> pl.Expr:
    """
    Format a datetime/date according to the given format string and locale.

    Parameters
    ----------
    expr
        Expression to format.
    format
        Format string, see https://docs.rs/chrono/latest/chrono/format/strftime/index.html
        for what's available.
    locale
        Locale to use for formatting. Defaults to "uk_UA", because that's what the OP
        requested in https://github.com/pola-rs/polars/issues/12341.

    Returns
    -------
    Expr
        Expression of data type :class:`Utf8`.

    Examples
    --------
    >>> from datetime import datetime
    >>> import polars as pl
    >>> import polars_sgt as sgt
    >>> df = pl.DataFrame(
    ...     {
    ...         "date_col": [datetime(2024, 8, 24), datetime(2024, 10, 1)],
    ...     }
    ... )
    >>> df.with_columns(
    ...     result=sgt.format_localized(
    ...         "date_col", format="%A, %d %B %Y", locale="uk_UA"
    ...     )
    ... )
    shape: (2, 2)
    ┌─────────────────────┬──────────────────────────┐
    │ date_col            ┆ result                   │
    │ ---                 ┆ ---                      │
    │ datetime[μs]        ┆ str                      │
    ╞═════════════════════╪══════════════════════════╡
    │ 2024-08-24 00:00:00 ┆ субота, 24 серпня 2024   │
    │ 2024-10-01 00:00:00 ┆ вівторок, 01 жовтня 2024 │
    └─────────────────────┴──────────────────────────┘

    """
    expr = parse_into_expr(expr)
    return register_plugin_function(
        plugin_path=PLUGIN_PATH,
        function_name="format_localized",
        is_elementwise=True,
        args=[expr],
        kwargs={"format": format, "locale": locale},
    )

def to_julian_date(expr: str | pl.Expr) -> pl.Expr:
    """
    Return the Julian date corresponding to given datetimes.

    Examples
    --------
    >>> from datetime import datetime
    >>> import polars as pl
    >>> import polars_sgt as sgt
    >>> df = pl.DataFrame(
    ...     {
    ...         "date_col": [
    ...             datetime(2013, 1, 1, 0, 30),
    ...             datetime(2024, 1, 7, 13, 18, 51),
    ...         ],
    ...     }
    ... )
    >>> with pl.Config(float_precision=10):
    ...     df.with_columns(julian_date=sgt.to_julian_date("date_col"))
    ...
    shape: (2, 2)
    ┌─────────────────────┬────────────────────┐
    │ date_col            ┆ julian_date        │
    │ ---                 ┆ ---                │
    │ datetime[μs]        ┆ f64                │
    ╞═════════════════════╪════════════════════╡
    │ 2013-01-01 00:30:00 ┆ 2456293.5208333335 │
    │ 2024-01-07 13:18:51 ┆ 2460317.0547569445 │
    └─────────────────────┴────────────────────┘

    """
    expr = parse_into_expr(expr)
    return register_plugin_function(
        plugin_path=PLUGIN_PATH,
        function_name="to_julian_date",
        is_elementwise=True,
        args=[expr],
    )

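# Background sketch (not in the packaged source): the Julian date counts days
# since noon UTC on 1 January 4713 BC (Julian calendar), so for a UTC
# timestamp it relates to the Unix epoch by
#
#     jd = unix_seconds / 86400 + 2440587.5
#
# e.g. datetime(2013, 1, 1, 0, 30) -> 1357000200 / 86400 + 2440587.5
# = 2456293.5208333..., matching the first row in the example above.
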
def ceil(
    expr: IntoExprColumn,
    every: str | pl.Expr,
) -> pl.Expr:
    """
    Find "ceiling" of datetime.

    Parameters
    ----------
    expr
        Expression to take "ceiling" of.
    every
        Duration string, created with the following string language:

        - 1ns (1 nanosecond)
        - 1us (1 microsecond)
        - 1ms (1 millisecond)
        - 1s (1 second)
        - 1m (1 minute)
        - 1h (1 hour)
        - 1d (1 calendar day)
        - 1w (1 calendar week)
        - 1mo (1 calendar month)
        - 1q (1 calendar quarter)
        - 1y (1 calendar year)

        These strings can be combined:

        - 3d12h4m25s # 3 days, 12 hours, 4 minutes, and 25 seconds

        By "calendar day", we mean the corresponding time on the next day (which may
        not be 24 hours, due to daylight saving time). Similarly for "calendar week",
        "calendar month", "calendar quarter", and "calendar year".

    Returns
    -------
    Expr
        Expression of the same type.

    Examples
    --------
    >>> from datetime import datetime
    >>> import polars as pl
    >>> import polars_sgt as sgt
    >>> df = pl.DataFrame(
    ...     {
    ...         "date_col": [datetime(2024, 8, 24), datetime(2024, 10, 1)],
    ...     }
    ... )
    >>> df.with_columns(result=sgt.ceil("date_col", "1mo"))
    shape: (2, 2)
    ┌─────────────────────┬─────────────────────┐
    │ date_col            ┆ result              │
    │ ---                 ┆ ---                 │
    │ datetime[μs]        ┆ datetime[μs]        │
    ╞═════════════════════╪═════════════════════╡
    │ 2024-08-24 00:00:00 ┆ 2024-09-01 00:00:00 │
    │ 2024-10-01 00:00:00 ┆ 2024-10-01 00:00:00 │
    └─────────────────────┴─────────────────────┘

    """
    expr = parse_into_expr(expr)
    truncated = expr.dt.truncate(every)
    return (
        pl.when(expr == truncated)
        .then(expr)
        .otherwise(truncated.dt.offset_by(every))
    )

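# Reading the implementation above: `dt.truncate` is the "floor"; `ceil` keeps
# values already on an `every` boundary as-is and otherwise bumps the floor up
# by one `every`, which is why 2024-10-01 maps to itself in the example.
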
def day_name(expr: str | pl.Expr, locale: str | None = None) -> pl.Expr:
    """
    Return day name, in the specified locale (if given).

    Returns
    -------
    Expr
        Expression of data type :class:`Utf8`.

    See Also
    --------
    format_localized : format according to locale.

    Examples
    --------
    >>> from datetime import datetime
    >>> import polars as pl
    >>> import polars_sgt as sgt
    >>> df = pl.DataFrame(
    ...     {
    ...         "ts": [datetime(2020, 10, 25), datetime(2020, 10, 26)],
    ...     }
    ... )
    >>> df.with_columns(
    ...     english_day_name=sgt.day_name("ts"),
    ...     french_day_name=sgt.day_name("ts", locale="fr_FR"),
    ...     ukrainian_day_name=sgt.day_name("ts", locale="uk_UA"),
    ... )
    shape: (2, 4)
    ┌─────────────────────┬──────────────────┬─────────────────┬────────────────────┐
    │ ts                  ┆ english_day_name ┆ french_day_name ┆ ukrainian_day_name │
    │ ---                 ┆ ---              ┆ ---             ┆ ---                │
    │ datetime[μs]        ┆ str              ┆ str             ┆ str                │
    ╞═════════════════════╪══════════════════╪═════════════════╪════════════════════╡
    │ 2020-10-25 00:00:00 ┆ Sunday           ┆ dimanche        ┆ неділя             │
    │ 2020-10-26 00:00:00 ┆ Monday           ┆ lundi           ┆ понеділок          │
    └─────────────────────┴──────────────────┴─────────────────┴────────────────────┘

    """
    expr = parse_into_expr(expr)
    if locale is None:
        result = expr.dt.to_string("%A")
    else:
        result = format_localized(expr, "%A", locale=locale)  # type: ignore[attr-defined]
    return result

def month_name(expr: str | pl.Expr, locale: str | None = None) -> pl.Expr:
    """
    Return month name, in the specified locale (if given).

    Returns
    -------
    Expr
        Expression of data type :class:`Utf8`.

    See Also
    --------
    format_localized : format according to locale.

    Examples
    --------
    >>> from datetime import datetime
    >>> import polars as pl
    >>> import polars_sgt as sgt
    >>> df = pl.DataFrame(
    ...     {
    ...         "ts": [datetime(2020, 10, 25), datetime(2020, 11, 26)],
    ...     }
    ... )
    >>> df.with_columns(
    ...     english_month_name=sgt.month_name("ts"),
    ...     french_month_name=sgt.month_name("ts", locale="fr_FR"),
    ...     ukrainian_month_name=sgt.month_name("ts", locale="uk_UA"),
    ... )
    shape: (2, 4)
    ┌─────────────────────┬────────────────────┬───────────────────┬──────────────────────┐
    │ ts                  ┆ english_month_name ┆ french_month_name ┆ ukrainian_month_name │
    │ ---                 ┆ ---                ┆ ---               ┆ ---                  │
    │ datetime[μs]        ┆ str                ┆ str               ┆ str                  │
    ╞═════════════════════╪════════════════════╪═══════════════════╪══════════════════════╡
    │ 2020-10-25 00:00:00 ┆ October            ┆ octobre           ┆ жовтня               │
    │ 2020-11-26 00:00:00 ┆ November           ┆ novembre          ┆ листопада            │
    └─────────────────────┴────────────────────┴───────────────────┴──────────────────────┘

    """
    expr = parse_into_expr(expr)
    if locale is None:
        result = expr.dt.to_string("%B")
    else:
        result = format_localized(expr, "%B", locale=locale)
    return result

def month_delta(
    start_dates: IntoExprColumn,
    end_dates: IntoExprColumn | date,
) -> pl.Expr:
    """
    Calculate the number of months between two Series.

    Parameters
    ----------
    start_dates
        A Series object containing the start dates.
    end_dates
        A Series object containing the end dates.

    Returns
    -------
    polars.Expr

    Examples
    --------
    >>> from datetime import date
    >>> import polars as pl
    >>> import polars_sgt as sgt
    >>> df = pl.DataFrame(
    ...     {
    ...         "start_date": [
    ...             date(2024, 3, 1),
    ...             date(2024, 3, 31),
    ...             date(2022, 2, 28),
    ...             date(2023, 1, 31),
    ...             date(2019, 12, 31),
    ...         ],
    ...         "end_date": [
    ...             date(2023, 2, 28),
    ...             date(2023, 2, 28),
    ...             date(2023, 2, 28),
    ...             date(2023, 1, 31),
    ...             date(2023, 1, 1),
    ...         ],
    ...     },
    ... )
    >>> df.with_columns(
    ...     sgt.month_delta("start_date", "end_date").alias("month_delta")
    ... )
    shape: (5, 3)
    ┌────────────┬────────────┬─────────────┐
    │ start_date ┆ end_date   ┆ month_delta │
    │ ---        ┆ ---        ┆ ---         │
    │ date       ┆ date       ┆ i32         │
    ╞════════════╪════════════╪═════════════╡
    │ 2024-03-01 ┆ 2023-02-28 ┆ -12         │
    │ 2024-03-31 ┆ 2023-02-28 ┆ -13         │
    │ 2022-02-28 ┆ 2023-02-28 ┆ 12          │
    │ 2023-01-31 ┆ 2023-01-31 ┆ 0           │
    │ 2019-12-31 ┆ 2023-01-01 ┆ 36          │
    └────────────┴────────────┴─────────────┘

    """
    start_dates = parse_into_expr(start_dates)
    if not isinstance(end_dates, date):
        end_dates = parse_into_expr(end_dates)

    return register_plugin_function(
        plugin_path=PLUGIN_PATH,
        function_name="month_delta",
        is_elementwise=True,
        args=[start_dates, end_dates],
    )

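# Reading the example above: the sign follows the direction (negative when
# `end_date` precedes `start_date`), and only whole elapsed months count,
# so 2019-12-31 -> 2023-01-01 is 36 (not 37), while the month-end pair
# 2024-03-31 -> 2023-02-28 counts as a full -13.
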
def arg_previous_greater(expr: IntoExprColumn) -> pl.Expr:
    """
    Find the row index of the previous value greater than the current one.

    Parameters
    ----------
    expr
        Expression.

    Returns
    -------
    Expr
        UInt64 or UInt32 type, depending on the platform.

    Examples
    --------
    >>> import polars as pl
    >>> import polars_sgt as sgt
    >>> df = pl.DataFrame({"value": [1, 9, 6, 7, 3]})
    >>> df.with_columns(result=sgt.arg_previous_greater("value"))
    shape: (5, 2)
    ┌───────┬────────┐
    │ value ┆ result │
    │ ---   ┆ ---    │
    │ i64   ┆ u32    │
    ╞═══════╪════════╡
    │ 1     ┆ null   │
    │ 9     ┆ 1      │
    │ 6     ┆ 1      │
    │ 7     ┆ 1      │
    │ 3     ┆ 3      │
    └───────┴────────┘

    This can be useful when working with time series. For example,
    if you have a dataset like this:

    >>> df = pl.DataFrame(
    ...     {
    ...         "date": [
    ...             "2024-02-01",
    ...             "2024-02-02",
    ...             "2024-02-03",
    ...             "2024-02-04",
    ...             "2024-02-05",
    ...             "2024-02-06",
    ...             "2024-02-07",
    ...             "2024-02-08",
    ...             "2024-02-09",
    ...             "2024-02-10",
    ...         ],
    ...         "group": ["A", "A", "A", "A", "A", "B", "B", "B", "B", "B"],
    ...         "value": [1, 9, None, 7, 3, 2, 4, 5, 1, 9],
    ...     }
    ... )
    >>> df = df.with_columns(pl.col("date").str.to_date())

    and want to find out, for each day and each item, how many days it's
    been since `'value'` was higher than it currently is, you could do

    >>> df.with_columns(
    ...     result=(
    ...         (
    ...             pl.col("date")
    ...             - pl.col("date")
    ...             .gather(sgt.arg_previous_greater("value"))
    ...             .over("group")
    ...         ).dt.total_days()
    ...     ),
    ... )
    shape: (10, 4)
    ┌────────────┬───────┬───────┬────────┐
    │ date       ┆ group ┆ value ┆ result │
    │ ---        ┆ ---   ┆ ---   ┆ ---    │
    │ date       ┆ str   ┆ i64   ┆ i64    │
    ╞════════════╪═══════╪═══════╪════════╡
    │ 2024-02-01 ┆ A     ┆ 1     ┆ null   │
    │ 2024-02-02 ┆ A     ┆ 9     ┆ 0      │
    │ 2024-02-03 ┆ A     ┆ null  ┆ null   │
    │ 2024-02-04 ┆ A     ┆ 7     ┆ 2      │
    │ 2024-02-05 ┆ A     ┆ 3     ┆ 1      │
    │ 2024-02-06 ┆ B     ┆ 2     ┆ null   │
    │ 2024-02-07 ┆ B     ┆ 4     ┆ 0      │
    │ 2024-02-08 ┆ B     ┆ 5     ┆ 0      │
    │ 2024-02-09 ┆ B     ┆ 1     ┆ 1      │
    │ 2024-02-10 ┆ B     ┆ 9     ┆ 0      │
    └────────────┴───────┴───────┴────────┘

    """
    expr = parse_into_expr(expr)
    return register_plugin_function(
        plugin_path=PLUGIN_PATH,
        function_name="arg_previous_greater",
        is_elementwise=False,
        args=[expr],
    )

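# Reading the examples above: when the current value is a running maximum, the
# function returns the current row's own index (so the day-difference in the
# second example is 0); the first row of each window and null values map to
# null, and the returned indices are what make the `gather` pattern work.
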
def sgt_transform(
    sequence_id_col: IntoExprColumn,
    state_col: IntoExprColumn,
    time_col: IntoExprColumn | None = None,
    *,
    kappa: int = 1,
    length_sensitive: bool = False,
    mode: Literal["l1", "l2", "none"] = "l1",
    time_penalty: Literal["inverse", "exponential", "linear", "power", "none"] = "inverse",
    alpha: float = 1.0,
    beta: float = 2.0,
    deltatime: Literal["s", "m", "h", "d", "w", "month", "q", "y"] | None = None,
) -> pl.Expr:
    """
    Compute Sequence Graph Transform (SGT) features from sequential data.

    SGT transforms sequences into weighted n-gram representations, capturing
    both the order and timing of events in your data.

    Parameters
    ----------
    sequence_id_col
        Column name containing sequence identifiers (groups).
    state_col
        Column name containing state/event values.
    time_col
        Column name containing timestamps or numeric time values.
    kappa
        Maximum n-gram size (1=unigrams, 2=bigrams, etc.).
        Higher values capture longer-range dependencies.
    length_sensitive
        If True, apply length normalization to prevent long sequences
        from dominating the feature space.
    mode
        Normalization mode for n-gram weights:

        - 'l1': Sum of weights equals 1
        - 'l2': L2 norm of weights equals 1
        - 'none': No normalization
    time_penalty
        Time decay function applied to n-gram weights (a plain-Python sketch
        of these formulas follows the function):

        - 'inverse': weight = alpha / time_diff
        - 'exponential': weight = exp(-alpha * time_diff)
        - 'linear': weight = max(0, 1 - alpha * time_diff)
        - 'power': weight = 1 / time_diff^beta
        - 'none': No time penalty (weight = 1)
    alpha
        Time penalty scale parameter (used in all penalties except 'power').
    beta
        Power parameter (only used when time_penalty='power').
    deltatime
        Time unit for date/datetime columns:

        - 's': seconds
        - 'm': minutes
        - 'h': hours
        - 'd': days
        - 'w': weeks
        - 'month': months (30.44 days)
        - 'q': quarters (91.31 days)
        - 'y': years (365.25 days)

    Returns
    -------
    pl.Expr
        Struct expression containing:

        - sequence_id: Original sequence identifier
        - ngram_keys: List of n-gram strings
        - ngram_values: List of corresponding weights

    Examples
    --------
    Basic usage with unigrams:

    >>> import polars as pl
    >>> import polars_sgt as sgt
    >>> df = pl.DataFrame({
    ...     "user_id": [1, 1, 1, 2, 2],
    ...     "action": ["login", "view", "purchase", "login", "view"],
    ... })
    >>> result = df.select(
    ...     sgt.sgt_transform("user_id", "action", kappa=1)
    ... )

    Bigrams with time decay:

    >>> df = pl.DataFrame({
    ...     "user_id": [1, 1, 1, 2, 2],
    ...     "action": ["login", "view", "purchase", "login", "view"],
    ...     "timestamp": [0, 10, 20, 0, 5],
    ... })
    >>> result = df.select(
    ...     sgt.sgt_transform(
    ...         "user_id",
    ...         "action",
    ...         time_col="timestamp",
    ...         kappa=2,
    ...         time_penalty="exponential",
    ...         alpha=0.1,
    ...     )
    ... )

    With datetime columns:

    >>> df = pl.DataFrame({
    ...     "session_id": ["A", "A", "A"],
    ...     "event": ["start", "click", "end"],
    ...     "time": pl.datetime_range(
    ...         start=pl.datetime(2024, 1, 1),
    ...         end=pl.datetime(2024, 1, 1, 0, 10),
    ...         interval="5m",
    ...         eager=True,
    ...     ),
    ... })
    >>> result = df.select(
    ...     sgt.sgt_transform(
    ...         "session_id",
    ...         "event",
    ...         time_col="time",
    ...         deltatime="m",
    ...         kappa=2,
    ...     )
    ... )

    Integration with a Polars pipeline (lazy):

    >>> result = (
    ...     pl.scan_csv("sequences.csv")
    ...     .with_columns(pl.col("timestamp").str.to_datetime())
    ...     .group_by("user_id")
    ...     .agg([
    ...         pl.col("action"),
    ...         pl.col("timestamp"),
    ...     ])
    ...     .select(
    ...         sgt.sgt_transform(
    ...             "user_id",
    ...             "action",
    ...             time_col="timestamp",
    ...             kappa=3,
    ...             deltatime="h",
    ...         ).alias("sgt_result")
    ...     )
    ...     .collect(streaming=True)
    ... )

    Extracting n-gram features from the struct result:

    >>> df_features = result.select([
    ...     pl.col("sgt_result").struct.field("sequence_id"),
    ...     pl.col("sgt_result").struct.field("ngram_keys").alias("ngrams"),
    ...     pl.col("sgt_result").struct.field("ngram_values").alias("weights"),
    ... ]).explode(["ngrams", "weights"])

    Notes
    -----
    - The function is highly optimized with parallel processing
    - Supports both eager and lazy evaluation
    - Compatible with streaming execution
    - Time columns can be numeric or temporal (date/datetime/duration)
    - Missing values in time columns are treated as 0

    """
    sequence_id_col = parse_into_expr(sequence_id_col)
    state_col = parse_into_expr(state_col)

    if time_col is not None:
        time_col = parse_into_expr(time_col)
        args = [sequence_id_col, state_col, time_col]
    else:
        args = [sequence_id_col, state_col]

    return register_plugin_function(
        plugin_path=PLUGIN_PATH,
        function_name="sgt_transform",
        is_elementwise=False,
        args=args,
        kwargs={
            "kappa": kappa,
            "length_sensitive": length_sensitive,
            "mode": mode,
            "time_penalty": time_penalty,
            "alpha": alpha,
            "beta": beta,
            "deltatime": deltatime,
        },
    )
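
A minimal sketch of the documented `time_penalty` options in plain Python (an
illustration of the formulas in the docstring above, not code from the
package; the actual decay is applied inside the compiled `_internal` plugin):

import math

def time_weight(time_diff: float, penalty: str, alpha: float = 1.0, beta: float = 2.0) -> float:
    # Mirrors the docstring: each option maps a time gap to an n-gram weight.
    if penalty == "inverse":
        return alpha / time_diff
    if penalty == "exponential":
        return math.exp(-alpha * time_diff)
    if penalty == "linear":
        return max(0.0, 1.0 - alpha * time_diff)
    if penalty == "power":
        return 1.0 / time_diff**beta
    return 1.0  # "none"

# e.g. a 10-unit gap with alpha=0.1: exponential -> exp(-1.0) ~= 0.368,
# inverse -> 0.01, linear -> 0.0.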