polars-sgt 0.1.0__cp39-abi3-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,23 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, Callable
4
+
5
+ import polars as pl
6
+
7
+ from polars_sgt import functions
8
+
9
+
10
@pl.api.register_expr_namespace("xdt")
class ExprXDTNamespace:
    """
    Expression namespace registered on ``pl.Expr`` under the name ``xdt``.

    Attribute access on the namespace is forwarded to the callable of the
    same name in the ``functions`` module, with the wrapped expression
    supplied as the first positional argument.
    """

    def __init__(self, expr: pl.Expr) -> None:
        # The expression this namespace instance was accessed from.
        self._expr = expr

    def __getattr__(self, function_name: str) -> Callable[..., pl.Expr]:
        """
        Return a callable that applies ``functions.<function_name>`` to the expression.

        Parameters
        ----------
        function_name
            Name of the attribute to look up in the ``functions`` module.

        Returns
        -------
        Callable
            A wrapper that calls the resolved function with the wrapped
            expression followed by the caller's arguments.

        Raises
        ------
        AttributeError
            If ``function_name`` is a dunder name or is not defined in the
            ``functions`` module.

        """
        # Dunder probes (e.g. ``__deepcopy__``) must not be answered with
        # module-level attributes such as ``functions.__name__``.
        if function_name.startswith("__") and function_name.endswith("__"):
            msg = (
                f"{type(self).__name__!r} object has no attribute "
                f"{function_name!r}"
            )
            raise AttributeError(msg)

        # Resolve eagerly so that unknown names fail at attribute access
        # (keeping ``hasattr`` meaningful) instead of when the result is called.
        target = getattr(functions, function_name)

        def func(*args: Any, **kwargs: Any) -> pl.Expr:
            return target(self._expr, *args, **kwargs)

        return func
polars_sgt/py.typed ADDED
File without changes
polars_sgt/ranges.py ADDED
@@ -0,0 +1,162 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from typing import TYPE_CHECKING, Literal, Union, overload
5
+ from typing import TYPE_CHECKING, Literal, overload
6
+
7
+ import polars as pl
8
+
9
+ mapping = {"Mon": 1, "Tue": 2, "Wed": 3, "Thu": 4, "Fri": 5, "Sat": 6, "Sun": 7}
10
+
11
+ if TYPE_CHECKING:
12
+ import sys
13
+ from collections.abc import Sequence
14
+ from datetime import date, datetime, timedelta
15
+
16
+ if sys.version_info >= (3, 10):
17
+ from typing import TypeAlias
18
+ else:
19
+ from typing_extensions import TypeAlias
20
+
21
+ ClosedInterval: TypeAlias = Literal[
22
+ "left", "right", "both", "none"
23
+ ] # ClosedWindow
24
+ IntoExprColumn: TypeAlias = "pl.Expr" | "pl.Series" | str
25
+
26
+ from polars_sgt.typing import IntoExprColumn
27
+ from polars_sgt.utils import parse_into_expr
28
+
29
+
30
@overload
def date_range(
    start: date | datetime | IntoExprColumn,
    end: date | datetime | IntoExprColumn,
    interval: str | timedelta = "1bd",
    *,
    closed: ClosedInterval = ...,
    eager: Literal[False] = ...,
    weekend: Sequence[str] = ...,
    holidays: Sequence[date] | None = ...,
) -> pl.Expr: ...


@overload
def date_range(
    start: date | IntoExprColumn,
    end: date | IntoExprColumn,
    interval: str | timedelta = "1bd",
    *,
    closed: ClosedInterval = ...,
    eager: Literal[True],
    weekend: Sequence[str] = ...,
    holidays: Sequence[date] | None = ...,
) -> pl.Series: ...


@overload
def date_range(
    start: date | IntoExprColumn,
    end: date | IntoExprColumn,
    interval: str | timedelta = "1bd",
    *,
    closed: ClosedInterval = ...,
    eager: bool = ...,
    weekend: Sequence[str] = ...,
    holidays: Sequence[date] | None = ...,
) -> pl.Series | pl.Expr: ...


def date_range(  # noqa: PLR0913
    start: date | IntoExprColumn,
    end: date | IntoExprColumn,
    interval: str | timedelta = "1bd",
    *,
    closed: ClosedInterval = "both",
    eager: bool = False,
    weekend: Sequence[str] = ("Sat", "Sun"),
    holidays: Sequence[date] | None = None,
) -> pl.Series | pl.Expr:
    """
    Create a range of dates with a given interval and filter out weekends and holidays.

    The range is first generated with ``pl.date_range`` using the interval
    expressed in calendar days; dates that fall on a weekend day or in
    ``holidays`` are then removed from the result.

    Parameters
    ----------
    start
        Lower bound of the date range.
    end
        Upper bound of the date range.
    interval
        Interval of the range periods, as a string of the form ``'nbd'``
        where ``n`` is an integer (for example ``"1bd"``). Any other value,
        including a ``timedelta``, raises ``ValueError``.

        The step is applied in calendar days *before* weekends and holidays
        are filtered out, so ``"2bd"`` steps two calendar days at a time.
    closed : {'both', 'left', 'right', 'none'}
        Define which sides of the range are closed (inclusive).
    eager
        Evaluate immediately and return a ``Series``.
        If set to ``False`` (default), return an expression instead.
    weekend
        The days of the week that are considered weekends, given as
        three-letter names (``"Mon"`` ... ``"Sun"``). Defaults to ("Sat", "Sun").
    holidays
        The holidays to exclude from the calculation. Defaults to None. This should
        be a list of ``datetime.date`` s.

    Returns
    -------
    Expr or Series
        Column of data type :class:`Date`.

    Raises
    ------
    ValueError
        If ``interval`` is not a string of the form ``'nbd'``.
    KeyError
        If ``weekend`` contains a name that is not a known weekday abbreviation.

    Examples
    --------
    >>> from datetime import date
    >>> import polars as pl
    >>> import polars_sgt
    >>> pl.DataFrame(
    ...     {
    ...         "date": polars_sgt.date_range(
    ...             date(2023, 1, 1), date(2023, 1, 10), "1bd", eager=True
    ...         ),
    ...     }
    ... )
    shape: (7, 1)
    ┌────────────┐
    │ date       │
    │ ---        │
    │ date       │
    ╞════════════╡
    │ 2023-01-02 │
    │ 2023-01-03 │
    │ 2023-01-04 │
    │ 2023-01-05 │
    │ 2023-01-06 │
    │ 2023-01-09 │
    │ 2023-01-10 │
    └────────────┘

    """
    # Translate weekday names to ISO weekday numbers (Mon=1 ... Sun=7).
    if weekend == ("Sat", "Sun"):
        weekend_int = [6, 7]
    else:
        weekend_int = sorted({mapping[name] for name in weekend})
    if holidays is None:
        holidays = []

    # ``fullmatch`` (unlike ``match`` with ``$``) also rejects a trailing newline.
    if not (isinstance(interval, str) and re.fullmatch(r"-?\d+bd", interval)):
        msg = "Only intervals of the form 'nbd' (where n is an integer) are supported."
        raise ValueError(msg)
    # Polars has no business-day unit here: step in calendar days, filter afterwards.
    interval = interval.replace("bd", "d")

    expr = pl.date_range(
        start,
        end,
        interval,
        closed=closed,
        eager=False,
    )
    expr = expr.filter(~expr.dt.date().is_in(holidays))
    expr = expr.filter(~expr.dt.weekday().is_in(weekend_int))
    if eager:
        # Materialise the expression and hand back its single column.
        df = pl.select(expr)
        return df[df.columns[0]]
    return expr
polars_sgt/typing.py ADDED
@@ -0,0 +1,17 @@
1
+ from __future__ import annotations
2
+
3
+ import sys
4
+ from typing import TYPE_CHECKING, Union
5
+
6
+ if sys.version_info >= (3, 10):
7
+ from typing import TypeAlias
8
+ else:
9
+ from typing_extensions import TypeAlias
10
+
11
+ if TYPE_CHECKING:
12
+ import polars as pl
13
+ from polars.type_aliases import PolarsDataType
14
+
15
+ IntoExprColumn: TypeAlias = Union["pl.Expr", "pl.Series", str]
16
+
17
+ __all__ = ["IntoExprColumn", "PolarsDataType"]
polars_sgt/utils.py ADDED
@@ -0,0 +1,49 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING
4
+
5
+ import polars as pl
6
+
7
+ if TYPE_CHECKING:
8
+ from polars_sgt.typing import IntoExprColumn, PolarsDataType
9
+
10
+
11
def parse_into_expr(
    expr: IntoExprColumn,
    *,
    str_as_lit: bool = False,
    list_as_lit: bool = True,
    dtype: PolarsDataType | None = None,
) -> pl.Expr:
    """
    Convert a single input value into a Polars expression.

    Parameters
    ----------
    expr
        The value to convert. An existing expression is returned unchanged.
    str_as_lit
        When `False` (default), a string is treated as a column name and
        wrapped with `pl.col`. When `True`, it is passed to `pl.lit`.
    list_as_lit
        When `True` (default), a list is passed to `pl.lit` as-is. When
        `False`, the list is first wrapped in a `pl.Series`.
    dtype
        Forwarded to `pl.lit` whenever the input is turned into a literal.

    Returns
    -------
    polars.Expr

    """
    # Already an expression: nothing to do.
    if isinstance(expr, pl.Expr):
        return expr

    # Strings name a column unless the caller asked for a literal.
    if isinstance(expr, str) and not str_as_lit:
        return pl.col(expr)

    # Lists become a Series literal only when explicitly requested.
    if isinstance(expr, list) and not list_as_lit:
        return pl.lit(pl.Series(expr), dtype=dtype)

    # Everything else is handed to ``pl.lit`` directly.
    return pl.lit(expr, dtype=dtype)
@@ -0,0 +1,205 @@
1
+ Metadata-Version: 2.4
2
+ Name: polars-sgt
3
+ Version: 0.1.0
4
+ Classifier: Programming Language :: Rust
5
+ Classifier: Programming Language :: Python :: Implementation :: CPython
6
+ Classifier: Programming Language :: Python :: Implementation :: PyPy
7
+ Requires-Dist: maturin>=1.11.5
8
+ Requires-Dist: polars>=1.36.1
9
+ Requires-Dist: pytest>=8.4.2
10
+ License-File: LICENSE
11
+ Summary: Sequence Graph Transform (SGT) for Polars - Transform sequential data into weighted n-gram representations
12
+ Author-email: Zedd <lytran14789@gmail.com>, Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com>
13
+ Requires-Python: >=3.9
14
+ Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
15
+ Project-URL: Change Log, https://github.com/4ursmile/polars-sgt/releases
16
+ Project-URL: Documentation, https://github.com/4ursmile/polars-sgt
17
+ Project-URL: Issue Tracker, https://github.com/4ursmile/polars-sgt/issues
18
+ Project-URL: Repository, https://github.com/4ursmile/polars-sgt
19
+
20
+ # polars-sgt
21
+
22
+ ## Sequence Graph Transform for Polars
23
+
24
+
25
+
26
+ [![PyPI version](https://badge.fury.io/py/polars-sgt.svg)](https://badge.fury.io/py/polars-sgt)
27
+
28
+ Transform sequential data into powerful n-gram representations with [Polars](https://www.pola.rs/).
29
+
30
+ **polars-sgt** brings Sequence Graph Transform (SGT) to Polars, enabling you to:
31
+ - ✅ Transform sequences into weighted n-gram features
32
+ - ✅ Capture temporal patterns with time-based weighting
33
+ - ✅ Apply flexible normalization strategies (L1, L2, or none)
34
+ - ✅ Handle datetime, date, duration, and numeric time columns
35
+ - ✅ Blazingly fast, written in Rust
36
+ - ✅ Compatible with Polars lazy evaluation and streaming
37
+
38
+ ## What is SGT?
39
+
40
+ Sequence Graph Transform converts sequential data (like user clickstreams, sensor readings, or transaction histories) into weighted n-gram representations. It captures:
41
+
42
+ - **Sequential patterns**: Unigrams, bigrams, trigrams, and higher-order n-grams
43
+ - **Temporal dynamics**: Time-based weighting with multiple decay functions
44
+ - **Normalized features**: L1/L2 normalization for comparable feature spaces
45
+
46
+ Perfect for:
47
+ - User behavior analysis
48
+ - Time series feature engineering
49
+ - Sequential pattern mining
50
+ - Anomaly detection in sequences
51
+
52
+ ## Installation
53
+
54
+
55
+
56
+ Install `polars-sgt` from PyPI:
57
+
58
+ ```console
59
+ pip install polars-sgt
60
+ ```
61
+
62
+ ## Quick Start
63
+
64
+ ### Basic Example
65
+
66
+ ```python
67
+ import polars as pl
68
+ import polars_sgt as sgt
69
+
70
+ # User clickstream data
71
+ df = pl.DataFrame({
72
+ "user_id": [1, 1, 1, 2, 2, 2],
73
+ "action": ["login", "view_product", "purchase", "login", "view_product", "logout"],
74
+ "timestamp": [0, 10, 20, 0, 5, 15],
75
+ })
76
+
77
+ # Generate bigrams with exponential time decay
78
+ result = df.select(
79
+ sgt.sgt_transform(
80
+ "user_id",
81
+ "action",
82
+ time_col="timestamp",
83
+ kappa=2, # bigrams
84
+ time_penalty="exponential",
85
+ alpha=0.1,
86
+ mode="l1" # L1 normalization
87
+ ).alias("sgt_features")
88
+ )
89
+
90
+ # Extract features
91
+ features = result.select([
92
+ pl.col("sgt_features").struct.field("sequence_id"),
93
+ pl.col("sgt_features").struct.field("ngram_keys").alias("ngrams"),
94
+ pl.col("sgt_features").struct.field("ngram_values").alias("weights"),
95
+ ]).explode(["ngrams", "weights"])
96
+
97
+ print(features)
98
+ ```
99
+
100
+ ### With DateTime Columns
101
+
102
+ ```python
103
+ from datetime import datetime
104
+
105
+ df = pl.DataFrame({
106
+ "session_id": ["A", "A", "A", "A"],
107
+ "event": ["start", "click", "scroll", "exit"],
108
+ "time": [
109
+ datetime(2024, 1, 1, 10, 0),
110
+ datetime(2024, 1, 1, 10, 5),
111
+ datetime(2024, 1, 1, 10, 7),
112
+ datetime(2024, 1, 1, 10, 15),
113
+ ],
114
+ })
115
+
116
+ result = df.select(
117
+ sgt.sgt_transform(
118
+ "session_id",
119
+ "event",
120
+ time_col="time",
121
+ deltatime="m", # minutes
122
+ kappa=3, # trigrams
123
+ time_penalty="inverse",
124
+ )
125
+ )
126
+ ```
127
+
128
+ ### Lazy Evaluation & Streaming
129
+
130
+ ```python
131
+ result = (
132
+ pl.scan_csv("large_sequences.csv")
133
+ .with_columns(pl.col("timestamp").str.to_datetime())
134
+ .select(
135
+ sgt.sgt_transform(
136
+ "user_id",
137
+ "action",
138
+ time_col="timestamp",
139
+ kappa=2,
140
+ deltatime="h",
141
+ )
142
+ )
143
+ .collect(streaming=True)
144
+ )
145
+ ```
146
+
147
+ ## Parameters
148
+
149
+ ### Required
150
+ - `sequence_id_col`: Column with sequence identifiers (groups)
151
+ - `state_col`: Column with state/event values
152
+
153
+ ### Optional
154
+ - `time_col`: Timestamp column (datetime, date, duration, or numeric)
155
+ - `kappa`: Maximum n-gram size (default: 1)
156
+ - 1 = unigrams only
157
+ - 2 = unigrams + bigrams
158
+ - 3 = unigrams + bigrams + trigrams, etc.
159
+
160
+ - `time_penalty`: Time decay function (default: "inverse")
161
+ - `"inverse"`: weight = alpha / time_diff
162
+ - `"exponential"`: weight = exp(-alpha × time_diff)
163
+ - `"linear"`: weight = max(0, 1 - alpha × time_diff)
164
+ - `"power"`: weight = 1 / time_diff^beta
165
+ - `"none"`: No time penalty
166
+
167
+ - `mode`: Normalization mode (default: "l1")
168
+ - `"l1"`: Sum of weights = 1
169
+ - `"l2"`: L2 norm = 1
170
+ - `"none"`: No normalization
171
+
172
+ - `length_sensitive`: Apply length normalization (default: False)
173
+ - `alpha`: Time penalty scale parameter (default: 1.0)
174
+ - `beta`: Power parameter for "power" penalty (default: 2.0)
175
+ - `deltatime`: Time unit for datetime columns
176
+ - `"s"`, `"m"`, `"h"`, `"d"`, `"w"`, `"month"`, `"q"`, `"y"`
177
+
178
+ ## Output
179
+
180
+ Returns a Struct with three fields:
181
+ - `sequence_id`: Original sequence identifier
182
+ - `ngram_keys`: List of n-gram strings (e.g., "login -> view -> purchase")
183
+ - `ngram_values`: List of corresponding weights
184
+
185
+ ## Additional DateTime Utilities
186
+
187
+ While SGT is the primary focus, polars-sgt also includes helpful datetime utilities from the original polars-xdt:
188
+ - Timezone conversions
189
+ - Localized date formatting
190
+ - Julian date conversion
191
+ - Month delta calculations
192
+
193
+ See the [full API documentation](https://github.com/4ursmile/polars-sgt) for details.
194
+
195
+ ## Author & Acknowledgments
196
+
197
+ **Author:** Zedd (lytran14789@gmail.com)
198
+
199
+ **Special Thanks:** This project is built upon [polars-xdt](https://github.com/MarcoGorelli/polars-xdt)
200
+ created by [Marco Gorelli](https://github.com/MarcoGorelli). We are grateful for his excellent foundation.
201
+
202
+ ## License
203
+
204
+ MIT
205
+
@@ -0,0 +1,14 @@
1
+ polars_sgt\.mypy.ini,sha256=5mJftXUSCrV-3kIF5PBb8zdS955xTJp0K373cqIs-KI,21
2
+ polars_sgt\__init__.py,sha256=8Xs67DT-X1ehNxiawFMDvr3wx8oXkdSpavlmFf3j814,717
3
+ polars_sgt\_internal.pyd,sha256=Vt1ztpOBU8Wggl7BDW1hYw5XigBfi0iq0DY9JHF3wgk,23472640
4
+ polars_sgt\_internal.pyi,sha256=xq3DTJlvvVHWjbZ1EvMjKJBpN-UQO3z1Rulqaz2NN6w,18
5
+ polars_sgt\functions.py,sha256=u5eoNwafTxkbFzU-o3RzyMca8oE2DPpXNaY2sfUd7Wg,31161
6
+ polars_sgt\namespace.py,sha256=pSsy2oeHx6wo_DNbL9wmoi_5AhyuJAt-MBEZAVY04QY,588
7
+ polars_sgt\py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
+ polars_sgt\ranges.py,sha256=Gd00LnGrer5vqNI3z9ocQkaoIUoZpDScGXm1xlGXEt4,4747
9
+ polars_sgt\typing.py,sha256=ERDcKyZpgtrN4tzm-sXgLnufk-Kllo8WKloruFY12MM,426
10
+ polars_sgt\utils.py,sha256=kQujAa3Ddpt3aEyvwIXqcNgD4tQKVJnalmh9vgvunqE,1285
11
+ polars_sgt-0.1.0.dist-info\METADATA,sha256=slJnjv8Q3g0p7UGr632asDhZNVwr7pI57ym07AL1p_Q,6224
12
+ polars_sgt-0.1.0.dist-info\WHEEL,sha256=OD0Is1kLHE07aD7XukSVDFU8ymUMI4Fdg0tKYWce3N0,95
13
+ polars_sgt-0.1.0.dist-info\licenses\LICENSE,sha256=N2uLjQs8DWUxWQieXP8Hpq6Q1IXIUVJqZj8KKMe4jTA,1098
14
+ polars_sgt-0.1.0.dist-info\RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: maturin (1.11.5)
3
+ Root-Is-Purelib: false
4
+ Tag: cp39-abi3-win_amd64
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Marco Edward Gorelli
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.