polars-sgt 0.1.0__cp39-abi3-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,859 @@
1
+ from __future__ import annotations
2
+
3
+ import sys
4
+ from datetime import date
5
+ from pathlib import Path
6
+ from typing import TYPE_CHECKING, Literal
7
+
8
+ import polars as pl
9
+ from polars.plugins import register_plugin_function
10
+
11
+ from polars_sgt.utils import parse_into_expr
12
+
13
+ if sys.version_info >= (3, 10):
14
+ from typing import TypeAlias
15
+ else:
16
+ from typing_extensions import TypeAlias
17
+
18
+ if TYPE_CHECKING:
19
+ from collections.abc import Sequence
20
+
21
+ from polars import Expr
22
+
23
+ from polars_sgt.typing import IntoExprColumn
24
+
25
# Strategies for resolving ambiguous local datetimes (e.g. during a DST
# fold); consumed by `from_local_datetime`'s `ambiguous` parameter.
Ambiguous: TypeAlias = Literal["earliest", "latest", "raise", "null"]

# Strategies for rolling a date that lands on a non-business day.
# NOTE(review): not referenced in this module — presumably used elsewhere
# in the package; confirm before removing.
RollStrategy: TypeAlias = Literal["raise", "forward", "backward"]


# Directory holding the compiled plugin library; handed to
# `register_plugin_function` so Polars can locate the native functions.
PLUGIN_PATH = Path(__file__).parent
31
+
32
# ISO weekday number keyed by abbreviated day name (Mon=1 ... Sun=7).
mapping = {"Mon": 1, "Tue": 2, "Wed": 3, "Thu": 4, "Fri": 5, "Sat": 6, "Sun": 7}
# Abbreviated day name keyed by ISO weekday number.
reverse_mapping = {value: key for key, value in mapping.items()}


def get_weekmask(weekend: Sequence[str]) -> list[bool]:
    """
    Build a Monday-first business-day mask from the given weekend days.

    Each of the seven entries is ``True`` when that weekday is a business
    day and ``False`` when it belongs to ``weekend``.

    Raises
    ------
    ValueError
        If every day of the week would be a weekend day.
    """
    if weekend == ("Sat", "Sun"):
        # Fast path for the default weekend: skip the per-day lookups.
        return [True, True, True, True, True, False, False]
    weekmask = [
        reverse_mapping[iso_day] not in weekend for iso_day in range(1, 8)
    ]
    if not any(weekmask):
        msg = f"At least one day of the week must be a business day. Got weekend={weekend}"
        raise ValueError(msg)
    return weekmask
45
+
46
+
47
def is_workday(
    expr: IntoExprColumn,
    *,
    weekend: Sequence[str] = ("Sat", "Sun"),
    holidays: Sequence[date] | None = None,
) -> pl.Expr:
    """
    Determine whether a day is a workday.

    Parameters
    ----------
    expr
        Input expression.
    weekend
        The days of the week that are considered weekends. Defaults to ("Sat", "Sun").
    holidays
        The holidays to exclude from the calculation. Defaults to None. This should
        be a list of ``datetime.date`` s.

    Returns
    -------
    polars.Expr

    Examples
    --------
    >>> from datetime import date
    >>> import polars as pl
    >>> import polars_sgt as sgt
    >>> df = pl.DataFrame(
    ...     {
    ...         "date": [
    ...             date(2023, 1, 4),
    ...             date(2023, 5, 1),
    ...             date(2023, 9, 9),
    ...         ],
    ...     }
    ... )
    >>> df.with_columns(is_workday=sgt.is_workday("date"))
    shape: (3, 2)
    ┌────────────┬────────────┐
    │ date       ┆ is_workday │
    │ ---        ┆ ---        │
    │ date       ┆ bool       │
    ╞════════════╪════════════╡
    │ 2023-01-04 ┆ true       │
    │ 2023-05-01 ┆ true       │
    │ 2023-09-09 ┆ false      │
    └────────────┴────────────┘

    """
    expr = parse_into_expr(expr)
    # Polars' dt.weekday() uses ISO numbering (Mon=1 ... Sun=7), which is
    # exactly the numbering in the module-level `mapping` dict.
    weekend_int = [mapping[x] for x in weekend]
    if holidays is not None:
        # A day is a workday unless it is a holiday OR a weekend day.
        return ~(
            expr.dt.date().is_in(holidays)
            | expr.dt.weekday().is_in(weekend_int)
        )
    return ~expr.dt.weekday().is_in(weekend_int)
105
+
106
+
107
def from_local_datetime(
    expr: IntoExprColumn,
    from_tz: str | Expr,
    to_tz: str,
    ambiguous: Ambiguous = "raise",
) -> pl.Expr:
    """
    Convert from local datetime in given time zone to new timezone.

    Parameters
    ----------
    expr
        Expression to convert.
    from_tz
        Current timezone of each datetime. May be a column of per-row
        timezone names or a single string.
    to_tz
        Timezone to convert to.
    ambiguous
        Determine how to deal with ambiguous datetimes:

        - `'raise'` (default): raise
        - `'earliest'`: use the earliest datetime
        - `'latest'`: use the latest datetime

    Returns
    -------
    Expr
        Expression of data type :class:`DateTime`.

    Examples
    --------
    You can go from a localized datetime back to expressing the datetimes
    in a single timezone with `from_local_datetime`.

    >>> from datetime import datetime
    >>> import polars as pl
    >>> import polars_sgt as sgt
    >>> df = pl.DataFrame(
    ...     {
    ...         "local_dt": [
    ...             datetime(2020, 10, 10, 1),
    ...             datetime(2020, 10, 10, 2),
    ...             datetime(2020, 10, 9, 20),
    ...         ],
    ...         "timezone": [
    ...             "Europe/London",
    ...             "Africa/Kigali",
    ...             "America/New_York",
    ...         ],
    ...     }
    ... )
    >>> df.with_columns(
    ...     sgt.from_local_datetime(
    ...         "local_dt", pl.col("timezone"), "UTC"
    ...     ).alias("date")
    ... )
    shape: (3, 3)
    ┌─────────────────────┬──────────────────┬─────────────────────────┐
    │ local_dt            ┆ timezone         ┆ date                    │
    │ ---                 ┆ ---              ┆ ---                     │
    │ datetime[μs]        ┆ str              ┆ datetime[μs, UTC]       │
    ╞═════════════════════╪══════════════════╪═════════════════════════╡
    │ 2020-10-10 01:00:00 ┆ Europe/London    ┆ 2020-10-10 00:00:00 UTC │
    │ 2020-10-10 02:00:00 ┆ Africa/Kigali    ┆ 2020-10-10 00:00:00 UTC │
    │ 2020-10-09 20:00:00 ┆ America/New_York ┆ 2020-10-10 00:00:00 UTC │
    └─────────────────────┴──────────────────┴─────────────────────────┘

    """
    expr = parse_into_expr(expr)
    # str_as_lit=True: a plain string is a timezone name, not a column name.
    from_tz = parse_into_expr(from_tz, str_as_lit=True)
    return register_plugin_function(
        plugin_path=PLUGIN_PATH,
        function_name="from_local_datetime",
        is_elementwise=True,
        args=[expr, from_tz],
        kwargs={
            "to_tz": to_tz,
            "ambiguous": ambiguous,
        },
    )
186
+
187
+
188
def to_local_datetime(
    expr: IntoExprColumn,
    time_zone: str | Expr,
) -> pl.Expr:
    """
    Convert to local datetime in given time zone.

    Parameters
    ----------
    expr
        Expression to convert.
    time_zone
        Time zone to convert to. May be a column of per-row timezone names
        or a single string.

    Returns
    -------
    Expr
        Expression of data type :class:`DateTime`.

    Examples
    --------
    You can use `to_local_datetime` to figure out how a tz-aware datetime
    will be expressed as a local datetime.

    >>> from datetime import datetime
    >>> import polars as pl
    >>> import polars_sgt as sgt
    >>> df = pl.DataFrame(
    ...     {
    ...         "date_col": [datetime(2020, 10, 10)] * 3,
    ...         "timezone": [
    ...             "Europe/London",
    ...             "Africa/Kigali",
    ...             "America/New_York",
    ...         ],
    ...     }
    ... ).with_columns(pl.col("date_col").dt.replace_time_zone("UTC"))
    >>> df.with_columns(
    ...     sgt.to_local_datetime("date_col", pl.col("timezone")).alias(
    ...         "local_dt"
    ...     )
    ... )
    shape: (3, 3)
    ┌─────────────────────────┬──────────────────┬─────────────────────┐
    │ date_col                ┆ timezone         ┆ local_dt            │
    │ ---                     ┆ ---              ┆ ---                 │
    │ datetime[μs, UTC]       ┆ str              ┆ datetime[μs]        │
    ╞═════════════════════════╪══════════════════╪═════════════════════╡
    │ 2020-10-10 00:00:00 UTC ┆ Europe/London    ┆ 2020-10-10 01:00:00 │
    │ 2020-10-10 00:00:00 UTC ┆ Africa/Kigali    ┆ 2020-10-10 02:00:00 │
    │ 2020-10-10 00:00:00 UTC ┆ America/New_York ┆ 2020-10-09 20:00:00 │
    └─────────────────────────┴──────────────────┴─────────────────────┘

    """
    expr = parse_into_expr(expr)
    # str_as_lit=True: a plain string is a timezone name, not a column name.
    time_zone = parse_into_expr(time_zone, str_as_lit=True)
    return register_plugin_function(
        plugin_path=PLUGIN_PATH,
        function_name="to_local_datetime",
        is_elementwise=True,
        args=[expr, time_zone],
    )
249
+
250
+
251
def format_localized(
    expr: IntoExprColumn,
    format: str,  # noqa: A002
    locale: str = "uk_UA",
) -> pl.Expr:
    """
    Format a datetime expression as a string, respecting the given locale.

    Parameters
    ----------
    expr
        Expression to format.
    format
        Format string, see https://docs.rs/chrono/latest/chrono/format/strftime/index.html
        for what's available.
    locale
        Locale to use for formatting. Defaults to "uk_UA", because that's what the OP
        requested https://github.com/pola-rs/polars/issues/12341.

    Returns
    -------
    Expr
        Expression of data type :class:`Utf8`.

    Examples
    --------
    >>> from datetime import datetime
    >>> import polars as pl
    >>> import polars_sgt as sgt
    >>> df = pl.DataFrame(
    ...     {
    ...         "date_col": [datetime(2024, 8, 24), datetime(2024, 10, 1)],
    ...     }
    ... )
    >>> df.with_columns(
    ...     result=sgt.format_localized(
    ...         "date_col", format="%A, %d %B %Y", locale="uk_UA"
    ...     )
    ... )
    shape: (2, 2)
    ┌─────────────────────┬──────────────────────────┐
    │ date_col            ┆ result                   │
    │ ---                 ┆ ---                      │
    │ datetime[μs]        ┆ str                      │
    ╞═════════════════════╪══════════════════════════╡
    │ 2024-08-24 00:00:00 ┆ субота, 24 серпня 2024   │
    │ 2024-10-01 00:00:00 ┆ вівторок, 01 жовтня 2024 │
    └─────────────────────┴──────────────────────────┘

    """
    expr = parse_into_expr(expr)
    return register_plugin_function(
        plugin_path=PLUGIN_PATH,
        function_name="format_localized",
        is_elementwise=True,
        args=[expr],
        kwargs={"format": format, "locale": locale},
    )
308
+
309
+
310
def to_julian_date(expr: str | pl.Expr) -> pl.Expr:
    """
    Return the Julian date corresponding to given datetimes.

    Parameters
    ----------
    expr
        Expression (or column name) of datetimes to convert.

    Returns
    -------
    Expr
        Expression of data type :class:`Float64`.

    Examples
    --------
    >>> from datetime import datetime
    >>> import polars as pl
    >>> import polars_sgt as sgt
    >>> df = pl.DataFrame(
    ...     {
    ...         "date_col": [
    ...             datetime(2013, 1, 1, 0, 30),
    ...             datetime(2024, 1, 7, 13, 18, 51),
    ...         ],
    ...     }
    ... )
    >>> with pl.Config(float_precision=10) as cfg:
    ...     df.with_columns(julian_date=sgt.to_julian_date("date_col"))
    shape: (2, 2)
    ┌─────────────────────┬────────────────────┐
    │ date_col            ┆ julian_date        │
    │ ---                 ┆ ---                │
    │ datetime[μs]        ┆ f64                │
    ╞═════════════════════╪════════════════════╡
    │ 2013-01-01 00:30:00 ┆ 2456293.5208333335 │
    │ 2024-01-07 13:18:51 ┆ 2460317.0547569445 │
    └─────────────────────┴────────────────────┘

    """
    expr = parse_into_expr(expr)
    return register_plugin_function(
        plugin_path=PLUGIN_PATH,
        function_name="to_julian_date",
        is_elementwise=True,
        args=[expr],
    )
346
+
347
+
348
def ceil(
    expr: IntoExprColumn,
    every: str | pl.Expr,
) -> pl.Expr:
    """
    Find "ceiling" of datetime.

    Parameters
    ----------
    expr
        Expression to take "ceiling" of.
    every
        Duration string, created with the
        the following string language:

        - 1ns (1 nanosecond)
        - 1us (1 microsecond)
        - 1ms (1 millisecond)
        - 1s  (1 second)
        - 1m  (1 minute)
        - 1h  (1 hour)
        - 1d  (1 calendar day)
        - 1w  (1 calendar week)
        - 1mo (1 calendar month)
        - 1q  (1 calendar quarter)
        - 1y  (1 calendar year)

        These strings can be combined:

        - 3d12h4m25s  # 3 days, 12 hours, 4 minutes, and 25 seconds

        By "calendar day", we mean the corresponding time on the next day (which may
        not be 24 hours, due to daylight savings). Similarly for "calendar week",
        "calendar month", "calendar quarter", and "calendar year".

    Returns
    -------
    Expr
        Expression of the same type.

    Examples
    --------
    >>> from datetime import datetime
    >>> import polars as pl
    >>> import polars_sgt as sgt
    >>> df = pl.DataFrame(
    ...     {
    ...         "date_col": [datetime(2024, 8, 24), datetime(2024, 10, 1)],
    ...     }
    ... )
    >>> df.with_columns(result=sgt.ceil("date_col", "1mo"))
    shape: (2, 2)
    ┌─────────────────────┬─────────────────────┐
    │ date_col            ┆ result              │
    │ ---                 ┆ ---                 │
    │ datetime[μs]        ┆ datetime[μs]        │
    ╞═════════════════════╪═════════════════════╡
    │ 2024-08-24 00:00:00 ┆ 2024-09-01 00:00:00 │
    │ 2024-10-01 00:00:00 ┆ 2024-10-01 00:00:00 │
    └─────────────────────┴─────────────────────┘

    """
    expr = parse_into_expr(expr)
    truncated = expr.dt.truncate(every)
    # Values already on a boundary truncate to themselves; they must NOT be
    # bumped to the next interval, hence the equality guard.
    return (
        pl.when(expr == truncated)
        .then(expr)
        .otherwise(truncated.dt.offset_by(every))
    )
416
+
417
+
418
def day_name(expr: str | pl.Expr, locale: str | None = None) -> pl.Expr:
    """
    Return day name, in specified locale (if specified).

    Parameters
    ----------
    expr
        Expression (or column name) of datetimes.
    locale
        Locale to use (e.g. ``"fr_FR"``). If None, English names are
        produced via a plain strftime call.

    Returns
    -------
    Expr
        Expression of data type :class:`Utf8`.

    See Also
    --------
    format_localized : format according to locale.

    Examples
    --------
    >>> from datetime import datetime
    >>> import polars as pl
    >>> import polars_sgt as sgt
    >>> df = pl.DataFrame(
    ...     {
    ...         "ts": [datetime(2020, 10, 25), datetime(2020, 10, 26)],
    ...     }
    ... )
    >>> df.with_columns(
    ...     english_day_name=sgt.day_name("ts"),
    ...     french_day_name=sgt.day_name("ts", locale="fr_FR"),
    ...     ukrainian_day_name=sgt.day_name("ts", locale="uk_UA"),
    ... )
    shape: (2, 4)
    ┌─────────────────────┬──────────────────┬─────────────────┬────────────────────┐
    │ ts                  ┆ english_day_name ┆ french_day_name ┆ ukrainian_day_name │
    │ ---                 ┆ ---              ┆ ---             ┆ ---                │
    │ datetime[μs]        ┆ str              ┆ str             ┆ str                │
    ╞═════════════════════╪══════════════════╪═════════════════╪════════════════════╡
    │ 2020-10-25 00:00:00 ┆ Sunday           ┆ dimanche        ┆ неділя             │
    │ 2020-10-26 00:00:00 ┆ Monday           ┆ lundi           ┆ понеділок          │
    └─────────────────────┴──────────────────┴─────────────────┴────────────────────┘

    """
    expr = parse_into_expr(expr)
    if locale is None:
        # No locale requested: the native strftime "%A" is sufficient.
        result = expr.dt.to_string("%A")
    else:
        # Delegate locale-aware rendering to the plugin.
        result = format_localized(expr, "%A", locale=locale)  # type: ignore[attr-defined]
    return result
462
+
463
+
464
def month_name(expr: str | pl.Expr, locale: str | None = None) -> pl.Expr:
    """
    Return month name, in specified locale (if specified).

    Parameters
    ----------
    expr
        Expression (or column name) of datetimes.
    locale
        Locale to use (e.g. ``"fr_FR"``). If None, English names are
        produced via a plain strftime call.

    Returns
    -------
    Expr
        Expression of data type :class:`Utf8`.

    See Also
    --------
    format_localized : format according to locale.

    Examples
    --------
    >>> from datetime import datetime
    >>> import polars as pl
    >>> import polars_sgt as sgt
    >>> df = pl.DataFrame(
    ...     {
    ...         "ts": [datetime(2020, 10, 25), datetime(2020, 11, 26)],
    ...     }
    ... )
    >>> df.with_columns(
    ...     english_month_name=sgt.month_name("ts"),
    ...     french_month_name=sgt.month_name("ts", locale="fr_FR"),
    ...     ukrainian_month_name=sgt.month_name("ts", locale="uk_UA"),
    ... )
    shape: (2, 4)
    ┌─────────────────────┬────────────────────┬───────────────────┬──────────────────────┐
    │ ts                  ┆ english_month_name ┆ french_month_name ┆ ukrainian_month_name │
    │ ---                 ┆ ---                ┆ ---               ┆ ---                  │
    │ datetime[μs]        ┆ str                ┆ str               ┆ str                  │
    ╞═════════════════════╪════════════════════╪═══════════════════╪══════════════════════╡
    │ 2020-10-25 00:00:00 ┆ October            ┆ octobre           ┆ жовтня               │
    │ 2020-11-26 00:00:00 ┆ November           ┆ novembre          ┆ листопада            │
    └─────────────────────┴────────────────────┴───────────────────┴──────────────────────┘

    """
    expr = parse_into_expr(expr)
    if locale is None:
        # No locale requested: the native strftime "%B" is sufficient.
        result = expr.dt.to_string("%B")
    else:
        # Delegate locale-aware rendering to the plugin.
        result = format_localized(expr, "%B", locale=locale)
    return result
508
+
509
+
510
def month_delta(
    start_dates: IntoExprColumn,
    end_dates: IntoExprColumn | date,
) -> pl.Expr:
    """
    Calculate the number of months between two columns of dates.

    Parameters
    ----------
    start_dates
        Expression (or column name) containing the start dates.
    end_dates
        Expression (or column name) containing the end dates, or a single
        ``datetime.date`` to compare every start date against.

    Returns
    -------
    polars.Expr

    Examples
    --------
    >>> from datetime import date
    >>> import polars as pl
    >>> import polars_sgt as sgt
    >>> df = pl.DataFrame(
    ...     {
    ...         "start_date": [
    ...             date(2024, 3, 1),
    ...             date(2024, 3, 31),
    ...             date(2022, 2, 28),
    ...             date(2023, 1, 31),
    ...             date(2019, 12, 31),
    ...         ],
    ...         "end_date": [
    ...             date(2023, 2, 28),
    ...             date(2023, 2, 28),
    ...             date(2023, 2, 28),
    ...             date(2023, 1, 31),
    ...             date(2023, 1, 1),
    ...         ],
    ...     },
    ... )
    >>> df.with_columns(
    ...     sgt.month_delta("start_date", "end_date").alias("month_delta")
    ... )
    shape: (5, 3)
    ┌────────────┬────────────┬─────────────┐
    │ start_date ┆ end_date   ┆ month_delta │
    │ ---        ┆ ---        ┆ ---         │
    │ date       ┆ date       ┆ i32         │
    ╞════════════╪════════════╪═════════════╡
    │ 2024-03-01 ┆ 2023-02-28 ┆ -12         │
    │ 2024-03-31 ┆ 2023-02-28 ┆ -13         │
    │ 2022-02-28 ┆ 2023-02-28 ┆ 12          │
    │ 2023-01-31 ┆ 2023-01-31 ┆ 0           │
    │ 2019-12-31 ┆ 2023-01-01 ┆ 36          │
    └────────────┴────────────┴─────────────┘

    """
    start_dates = parse_into_expr(start_dates)
    # A bare date is passed straight through to the plugin as a scalar.
    # NOTE(review): datetime is a subclass of date, so datetime instances
    # also skip parsing here — confirm the plugin accepts that.
    if not isinstance(end_dates, date):
        end_dates = parse_into_expr(end_dates)

    return register_plugin_function(
        plugin_path=PLUGIN_PATH,
        function_name="month_delta",
        is_elementwise=True,
        args=[start_dates, end_dates],
    )
578
+
579
+
580
def arg_previous_greater(expr: IntoExprColumn) -> pl.Expr:
    """
    Find the row count of the previous value greater than the current one.

    Parameters
    ----------
    expr
        Expression.

    Returns
    -------
    Expr
        UInt64 or UInt32 type, depending on the platform.

    Examples
    --------
    >>> import polars as pl
    >>> import polars_sgt as sgt
    >>> df = pl.DataFrame({"value": [1, 9, 6, 7, 3]})
    >>> df.with_columns(result=sgt.arg_previous_greater("value"))
    shape: (5, 2)
    ┌───────┬────────┐
    │ value ┆ result │
    │ ---   ┆ ---    │
    │ i64   ┆ u32    │
    ╞═══════╪════════╡
    │ 1     ┆ null   │
    │ 9     ┆ 1      │
    │ 6     ┆ 1      │
    │ 7     ┆ 1      │
    │ 3     ┆ 3      │
    └───────┴────────┘

    This can be useful when working with time series. For example,
    if you have a dataset like this:

    >>> df = pl.DataFrame(
    ...     {
    ...         "date": [
    ...             "2024-02-01",
    ...             "2024-02-02",
    ...             "2024-02-03",
    ...             "2024-02-04",
    ...             "2024-02-05",
    ...             "2024-02-06",
    ...             "2024-02-07",
    ...             "2024-02-08",
    ...             "2024-02-09",
    ...             "2024-02-10",
    ...         ],
    ...         "group": ["A", "A", "A", "A", "A", "B", "B", "B", "B", "B"],
    ...         "value": [1, 9, None, 7, 3, 2, 4, 5, 1, 9],
    ...     }
    ... )
    >>> df = df.with_columns(pl.col("date").str.to_date())

    and want to find out, for each day and each item, how many days it's
    been since `'value'` was higher than it currently is, you could do

    >>> df.with_columns(
    ...     result=(
    ...         (
    ...             pl.col("date")
    ...             - pl.col("date")
    ...             .gather(sgt.arg_previous_greater("value"))
    ...             .over("group")
    ...         ).dt.total_days()
    ...     ),
    ... )
    shape: (10, 4)
    ┌────────────┬───────┬───────┬────────┐
    │ date       ┆ group ┆ value ┆ result │
    │ ---        ┆ ---   ┆ ---   ┆ ---    │
    │ date       ┆ str   ┆ i64   ┆ i64    │
    ╞════════════╪═══════╪═══════╪════════╡
    │ 2024-02-01 ┆ A     ┆ 1     ┆ null   │
    │ 2024-02-02 ┆ A     ┆ 9     ┆ 0      │
    │ 2024-02-03 ┆ A     ┆ null  ┆ null   │
    │ 2024-02-04 ┆ A     ┆ 7     ┆ 2      │
    │ 2024-02-05 ┆ A     ┆ 3     ┆ 1      │
    │ 2024-02-06 ┆ B     ┆ 2     ┆ null   │
    │ 2024-02-07 ┆ B     ┆ 4     ┆ 0      │
    │ 2024-02-08 ┆ B     ┆ 5     ┆ 0      │
    │ 2024-02-09 ┆ B     ┆ 1     ┆ 1      │
    │ 2024-02-10 ┆ B     ┆ 9     ┆ 0      │
    └────────────┴───────┴───────┴────────┘

    """
    expr = parse_into_expr(expr)
    return register_plugin_function(
        plugin_path=PLUGIN_PATH,
        function_name="arg_previous_greater",
        # Depends on preceding rows, so it is not elementwise.
        is_elementwise=False,
        args=[expr],
    )
675
+
676
+
677
def sgt_transform(
    sequence_id_col: IntoExprColumn,
    state_col: IntoExprColumn,
    time_col: IntoExprColumn | None = None,
    *,
    kappa: int = 1,
    length_sensitive: bool = False,
    mode: Literal["l1", "l2", "none"] = "l1",
    time_penalty: Literal["inverse", "exponential", "linear", "power", "none"] = "inverse",
    alpha: float = 1.0,
    beta: float = 2.0,
    deltatime: Literal["s", "m", "h", "d", "w", "month", "q", "y"] | None = None,
) -> pl.Expr:
    """
    Compute Sequence Graph Transform (SGT) features from sequential data.

    SGT transforms sequences into weighted n-gram representations, capturing
    both the order and timing of events in your data.

    Parameters
    ----------
    sequence_id_col
        Column name containing sequence identifiers (groups)
    state_col
        Column name containing state/event values
    time_col
        Column name containing timestamps or numeric time values
    kappa
        Maximum n-gram size (1=unigrams, 2=bigrams, etc.)
        Higher values capture longer-range dependencies
    length_sensitive
        If True, apply length normalization to prevent long sequences
        from dominating the feature space
    mode
        Normalization mode for n-gram weights:
        - 'l1': Sum of weights equals 1
        - 'l2': L2 norm of weights equals 1
        - 'none': No normalization
    time_penalty
        Time decay function applied to n-gram weights:
        - 'inverse': weight = alpha / time_diff
        - 'exponential': weight = exp(-alpha * time_diff)
        - 'linear': weight = max(0, 1 - alpha * time_diff)
        - 'power': weight = 1 / time_diff^beta
        - 'none': No time penalty (weight = 1)
    alpha
        Time penalty scale parameter (used in all penalties except 'power')
    beta
        Power parameter (only used when time_penalty='power')
    deltatime
        Time unit for date/datetime columns:
        - 's': seconds
        - 'm': minutes
        - 'h': hours
        - 'd': days
        - 'w': weeks
        - 'month': months (30.44 days)
        - 'q': quarters (91.31 days)
        - 'y': years (365.25 days)

    Returns
    -------
    pl.Expr
        Struct expression containing:
        - sequence_id: Original sequence identifier
        - ngram_keys: List of n-gram strings
        - ngram_values: List of corresponding weights

    Examples
    --------
    Basic usage with unigrams:

    >>> import polars as pl
    >>> import polars_sgt as sgt
    >>> df = pl.DataFrame({
    ...     "user_id": [1, 1, 1, 2, 2],
    ...     "action": ["login", "view", "purchase", "login", "view"],
    ... })
    >>> result = df.select(
    ...     sgt.sgt_transform("user_id", "action", kappa=1)
    ... )

    Bigrams with time decay:

    >>> df = pl.DataFrame({
    ...     "user_id": [1, 1, 1, 2, 2],
    ...     "action": ["login", "view", "purchase", "login", "view"],
    ...     "timestamp": [0, 10, 20, 0, 5],
    ... })
    >>> result = df.select(
    ...     sgt.sgt_transform(
    ...         "user_id",
    ...         "action",
    ...         time_col="timestamp",
    ...         kappa=2,
    ...         time_penalty="exponential",
    ...         alpha=0.1,
    ...     )
    ... )

    With datetime columns:

    >>> df = pl.DataFrame({
    ...     "session_id": ["A", "A", "A"],
    ...     "event": ["start", "click", "end"],
    ...     "time": pl.datetime_range(
    ...         start=pl.datetime(2024, 1, 1),
    ...         end=pl.datetime(2024, 1, 1, 0, 10),
    ...         interval="5m",
    ...         eager=True,
    ...     ),
    ... })
    >>> result = df.select(
    ...     sgt.sgt_transform(
    ...         "session_id",
    ...         "event",
    ...         time_col="time",
    ...         deltatime="m",
    ...         kappa=2,
    ...     )
    ... )

    Integration with Polars pipeline (Lazy):

    >>> result = (
    ...     pl.scan_csv("sequences.csv")
    ...     .with_columns(pl.col("timestamp").str.to_datetime())
    ...     .group_by("user_id")
    ...     .agg([
    ...         pl.col("action"),
    ...         pl.col("timestamp"),
    ...     ])
    ...     .select(
    ...         sgt.sgt_transform(
    ...             "user_id",
    ...             "action",
    ...             time_col="timestamp",
    ...             kappa=3,
    ...             deltatime="h",
    ...         )
    ...     )
    ...     .collect(streaming=True)
    ... )

    Extracting n-gram features:

    >>> df_features = result.select([
    ...     pl.col("sgt_result").struct.field("sequence_id"),
    ...     pl.col("sgt_result").struct.field("ngram_keys").alias("ngrams"),
    ...     pl.col("sgt_result").struct.field("ngram_values").alias("weights"),
    ... ]).explode(["ngrams", "weights"])

    Notes
    -----
    - The function is highly optimized with parallel processing
    - Supports both eager and lazy evaluation
    - Compatible with streaming execution
    - Time columns can be numeric or temporal (date/datetime/duration)
    - Missing values in time columns are treated as 0

    """
    sequence_id_col = parse_into_expr(sequence_id_col)
    state_col = parse_into_expr(state_col)

    # The plugin distinguishes timed and untimed sequences by arity:
    # a third positional arg is only passed when a time column exists.
    if time_col is not None:
        time_col = parse_into_expr(time_col)
        args = [sequence_id_col, state_col, time_col]
    else:
        args = [sequence_id_col, state_col]

    return register_plugin_function(
        plugin_path=PLUGIN_PATH,
        function_name="sgt_transform",
        # Aggregates over whole sequences, so it is not elementwise.
        is_elementwise=False,
        args=args,
        kwargs={
            "kappa": kappa,
            "length_sensitive": length_sensitive,
            "mode": mode,
            "time_penalty": time_penalty,
            "alpha": alpha,
            "beta": beta,
            "deltatime": deltatime,
        },
    )