polars-sgt 0.2.0__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- polars_sgt-0.3.0/CHANGELOG.md +49 -0
- {polars_sgt-0.2.0 → polars_sgt-0.3.0}/Cargo.lock +1 -1
- {polars_sgt-0.2.0 → polars_sgt-0.3.0}/Cargo.toml +1 -1
- polars_sgt-0.3.0/PKG-INFO +216 -0
- polars_sgt-0.3.0/README.md +195 -0
- {polars_sgt-0.2.0 → polars_sgt-0.3.0}/polars_sgt/__init__.py +2 -0
- {polars_sgt-0.2.0 → polars_sgt-0.3.0}/polars_sgt/functions.py +236 -4
- {polars_sgt-0.2.0 → polars_sgt-0.3.0}/pyproject.toml +1 -0
- {polars_sgt-0.2.0 → polars_sgt-0.3.0}/src/sgt_transform.rs +172 -59
- {polars_sgt-0.2.0 → polars_sgt-0.3.0}/tests/test_benchmark.py +1 -1
- {polars_sgt-0.2.0 → polars_sgt-0.3.0}/tests/test_sgt_transform.py +44 -0
- polars_sgt-0.3.0/tests/verify_sgt.py +103 -0
- {polars_sgt-0.2.0 → polars_sgt-0.3.0}/uv.lock +14 -0
- polars_sgt-0.2.0/CHANGELOG.md +0 -19
- polars_sgt-0.2.0/PKG-INFO +0 -225
- polars_sgt-0.2.0/README.md +0 -205
- {polars_sgt-0.2.0 → polars_sgt-0.3.0}/.github/workflows/CI.yml +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.3.0}/.gitignore +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.3.0}/.python-version +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.3.0}/.readthedocs.yaml +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.3.0}/CODE_OF_CONDUCT.md +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.3.0}/LICENSE +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.3.0}/Makefile +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.3.0}/assets/.DS_Store +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.3.0}/assets/polars-business.png +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.3.0}/bump_version.py +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.3.0}/docs/API.rst +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.3.0}/docs/Makefile +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.3.0}/docs/conf.py +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.3.0}/docs/index.rst +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.3.0}/docs/installation.rst +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.3.0}/docs/requirements-docs.txt +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.3.0}/dprint.json +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.3.0}/licenses/NUMPY_LICENSE.txt +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.3.0}/licenses/PANDAS_LICENSE.txt +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.3.0}/polars_sgt/.mypy.ini +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.3.0}/polars_sgt/_internal.pyi +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.3.0}/polars_sgt/namespace.py +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.3.0}/polars_sgt/py.typed +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.3.0}/polars_sgt/ranges.py +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.3.0}/polars_sgt/typing.py +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.3.0}/polars_sgt/utils.py +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.3.0}/requirements.txt +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.3.0}/rust-toolchain.toml +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.3.0}/src/arg_previous_greater.rs +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.3.0}/src/expressions.rs +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.3.0}/src/format_localized.rs +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.3.0}/src/lib.rs +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.3.0}/src/month_delta.rs +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.3.0}/src/timezone.rs +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.3.0}/src/to_julian.rs +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.3.0}/tests/__init__.py +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.3.0}/tests/ceil_test.py +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.3.0}/tests/julian_date_test.py +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.3.0}/tests/test_date_range.py +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.3.0}/tests/test_format_localized.py +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.3.0}/tests/test_is_busday.py +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.3.0}/tests/test_month_delta.py +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.3.0}/tests/test_timezone.py +0 -0
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [0.3.0] - 2026-02-04
|
|
9
|
+
|
|
10
|
+
### Changed
|
|
11
|
+
- **Major Performance Optimization**: Optimized SGT transform for billion-row scale with O(1) time weight lookups via cumulative product prefix arrays.
|
|
12
|
+
- **Speed Improvements**: Optimized Rust implementation with fast `exp`/`pow` approximations, pre-allocated buffers, and elimination of post-sort overhead.
|
|
13
|
+
- **Enhanced `sgt_transform_df`**:
|
|
14
|
+
- Returns a single merged wide-format DataFrame by default.
|
|
15
|
+
- Automatically prefixes feature names with group values (e.g., `sgt_buy_login`).
|
|
16
|
+
- Uses efficient reduce-join for merging multi-group analysis.
|
|
17
|
+
- Full support for Polars' LazyFrame and streaming engine.
|
|
18
|
+
|
|
19
|
+
### Fixed
|
|
20
|
+
- **Time Weight Correctness**: Fixed weight calculation for `kappa > 2` to correctly accumulate time penalties across *all* individual transitions in an n-gram.
|
|
21
|
+
- **Numerical Stability**: Implemented periodic renormalization and zero-trap protection for weighted products to prevent underflow in very long sequences.
|
|
22
|
+
|
|
23
|
+
### Added
|
|
24
|
+
- Comprehensive README documentation with spotlights on high-level APIs, scalability, and grouped analysis usage.
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
## [0.2.5] - 2026-02-04
|
|
28
|
+
|
|
29
|
+
### Added
|
|
30
|
+
- `use_tqdm` parameter to `sgt_transform_df` to control progress bar visibility.
|
|
31
|
+
- `keep_original_name` parameter to `sgt_transform_df` to optionally restore original sequence ID names.
|
|
32
|
+
- Support for multiple columns in `sequence_id_col` in `sgt_transform_df` (the columns are automatically concatenated for processing and split back afterwards).
|
|
33
|
+
|
|
34
|
+
### Fixed
|
|
35
|
+
- `sgt_transform_df` now correctly handles `group_cols=None` by processing the entire DataFrame.
|
|
36
|
+
- `sgt_transform_df` now correctly filters subsets dynamically based on unique values of `group_cols` instead of hardcoded columns.
|
|
37
|
+
|
|
38
|
+
## [0.2.0] - 2026-02-02
|
|
39
|
+
|
|
40
|
+
### Added
|
|
41
|
+
- Parallel processing support with `rayon` for SGT transform.
|
|
42
|
+
- Support for custom output struct field names via `sequence_id_name` and `state_name` parameters.
|
|
43
|
+
|
|
44
|
+
### Changed
|
|
45
|
+
- **Major Performance Optimization**: Rewrote SGT transform to use O(n) group-based indexing instead of O(n*m) scanning. Throughput increased to over 1.4M records/second.
|
|
46
|
+
- **Struct Field Rename (BREAKING)**: Renamed `ngram_values` field in the output struct to `value` for consistency with current Polars version and parameter names.
|
|
47
|
+
|
|
48
|
+
### Fixed
|
|
49
|
+
- Performance bottleneck on large datasets (10M+ records).
|
|
@@ -0,0 +1,216 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: polars-sgt
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Classifier: Programming Language :: Rust
|
|
5
|
+
Classifier: Programming Language :: Python :: Implementation :: CPython
|
|
6
|
+
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
|
7
|
+
Requires-Dist: maturin>=1.11.5
|
|
8
|
+
Requires-Dist: polars>=1.36.1
|
|
9
|
+
Requires-Dist: pytest>=8.4.2
|
|
10
|
+
Requires-Dist: tqdm>=4.66.0
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Summary: Sequence Graph Transform (SGT) for Polars - Transform sequential data into weighted n-gram representations
|
|
13
|
+
Author-email: Zedd <lytran14789@gmail.com>, Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com>
|
|
14
|
+
Requires-Python: >=3.9
|
|
15
|
+
Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
|
|
16
|
+
Project-URL: Change Log, https://github.com/4ursmile/polars-sgt/releases
|
|
17
|
+
Project-URL: Documentation, https://github.com/4ursmile/polars-sgt
|
|
18
|
+
Project-URL: Issue Tracker, https://github.com/4ursmile/polars-sgt/issues
|
|
19
|
+
Project-URL: Repository, https://github.com/4ursmile/polars-sgt
|
|
20
|
+
|
|
21
|
+
# polars-sgt
|
|
22
|
+
|
|
23
|
+
## Sequence Graph Transform for Polars
|
|
24
|
+
|
|
25
|
+
[![PyPI version](https://badge.fury.io/py/polars-sgt.svg)](https://badge.fury.io/py/polars-sgt)
|
|
26
|
+
|
|
27
|
+
Transform sequential data into powerful n-gram representations with [Polars](https://www.pola.rs/).
|
|
28
|
+
|
|
29
|
+
**polars-sgt** brings Sequence Graph Transform (SGT) to Polars, enabling you to:
|
|
30
|
+
- ✅ **Transform** sequences into weighted n-gram features
|
|
31
|
+
- ✅ **Grouped Analysis**: Apply SGT across subsets (e.g., by direction, metric) and merge into a single wide DataFrame
|
|
32
|
+
- ✅ **Billion-Row Scale**: Optimized Rust implementation with O(1) time weight lookups
|
|
33
|
+
- ✅ **Temporal Dynamics**: Capture patterns with multiple decay functions across all n-gram transitions
|
|
34
|
+
- ✅ **Flexible**: Support for datetime, date, duration, and numeric time columns
|
|
35
|
+
- ✅ **Lazy & Parallel**: Fully compatible with Polars lazy evaluation and Rayon-backed parallel processing
|
|
36
|
+
|
|
37
|
+
## What is SGT?
|
|
38
|
+
|
|
39
|
+
Sequence Graph Transform converts sequential data (like user clickstreams, sensor readings, or transaction histories) into weighted n-gram representations. Unlike traditional n-grams, SGT captures:
|
|
40
|
+
|
|
41
|
+
- **Sequential patterns**: Multi-transition dependencies (unigrams, bigrams, trigrams, ...)
|
|
42
|
+
- **Temporal dynamics**: Weights decay based on time gaps between events.
|
|
43
|
+
- **Normalized features**: L1/L2 normalization for machine-learning-ready feature spaces.
|
|
44
|
+
|
|
45
|
+
---
|
|
46
|
+
|
|
47
|
+
## Performance at Scale
|
|
48
|
+
|
|
49
|
+
Optimized for processing billions of rows:
|
|
50
|
+
- **O(1) Weight Calculation**: Uses cumulative product prefix arrays to calculate multi-transition time weights in constant time.
|
|
51
|
+
- **Zero-Cost Abstraction**: Written in Rust with Rayon for automatic multi-core utilization.
|
|
52
|
+
- **Memory Efficient**: Leverages Polars' Arrow-backed memory management.
|
|
53
|
+
|
|
54
|
+
---
|
|
55
|
+
|
|
56
|
+
## Installation
|
|
57
|
+
|
|
58
|
+
```console
|
|
59
|
+
pip install polars-sgt
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
## Quick Start
|
|
63
|
+
|
|
64
|
+
### 1. High-Level API: `sgt_transform_df`
|
|
65
|
+
|
|
66
|
+
The `sgt_transform_df` function is the easiest way to generate SGT features. It handles unnesting, exploding, and pivoting into a wide format automatically.
|
|
67
|
+
|
|
68
|
+
#### Single Group (Default)
|
|
69
|
+
```python
|
|
70
|
+
import polars as pl
|
|
71
|
+
import polars_sgt as sgt
|
|
72
|
+
|
|
73
|
+
df = pl.DataFrame({
|
|
74
|
+
"user_id": ["A", "A", "A", "B", "B"],
|
|
75
|
+
"action": ["login", "view", "purchase", "login", "view"],
|
|
76
|
+
"time": [1, 2, 10, 1, 5],
|
|
77
|
+
})
|
|
78
|
+
|
|
79
|
+
# Generate wide-format features merged into one DataFrame
|
|
80
|
+
features = sgt.sgt_transform_df(
|
|
81
|
+
df,
|
|
82
|
+
sequence_id_col="user_id",
|
|
83
|
+
state_col="action",
|
|
84
|
+
time_col="time",
|
|
85
|
+
kappa=2
|
|
86
|
+
)
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
#### Grouped Sequence Analysis
|
|
90
|
+
Calculate separate SGT features for different groups (e.g., event types or directions) and merge them into one wide DataFrame.
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
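# Calculate SGT features for each 'direction' and 'metric'.
# NOTE: `df` must also contain 'direction' and 'metric' columns
# (the sample frame above does not define them).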
|
|
94
|
+
result = sgt.sgt_transform_df(
|
|
95
|
+
df,
|
|
96
|
+
sequence_id_col="user_id",
|
|
97
|
+
state_col="action",
|
|
98
|
+
time_col="time",
|
|
99
|
+
group_cols=["direction", "metric"],
|
|
100
|
+
kappa=3,
|
|
101
|
+
time_penalty="exponential",
|
|
102
|
+
alpha=0.7,
|
|
103
|
+
group_name="analysis"
|
|
104
|
+
)
|
|
105
|
+
# Columns: ['user_id', 'analysis-buy-p_login', 'analysis-sell-p_login', ...]
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
### 2. Expression API: `sgt_transform`
|
|
109
|
+
|
|
110
|
+
For more control or integration into complex pipelines, use the expression-based API.
|
|
111
|
+
|
|
112
|
+
```python
|
|
113
|
+
# Basic expression usage (returns a struct)
|
|
114
|
+
result = df.select(
|
|
115
|
+
sgt.sgt_transform(
|
|
116
|
+
"user_id",
|
|
117
|
+
"action",
|
|
118
|
+
time_col="time",
|
|
119
|
+
kappa=2,
|
|
120
|
+
time_penalty="exponential",
|
|
121
|
+
alpha=0.1,
|
|
122
|
+
mode="l1"
|
|
123
|
+
).alias("sgt_features")
|
|
124
|
+
)
|
|
125
|
+
|
|
126
|
+
# Extract and explode
|
|
127
|
+
features = result.select([
|
|
128
|
+
pl.col("sgt_features").struct.field("sequence_id"),
|
|
129
|
+
pl.col("sgt_features").struct.field("ngram_keys").alias("ngrams"),
|
|
130
|
+
pl.col("sgt_features").struct.field("value").alias("weights"),
|
|
131
|
+
]).explode(["ngrams", "weights"])
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
### With DateTime Columns
|
|
135
|
+
|
|
136
|
+
```python
|
|
137
|
+
from datetime import datetime
|
|
138
|
+
|
|
139
|
+
df = pl.DataFrame({
|
|
140
|
+
"session_id": ["A", "A", "A", "A"],
|
|
141
|
+
"event": ["start", "click", "scroll", "exit"],
|
|
142
|
+
"time": [
|
|
143
|
+
datetime(2024, 1, 1, 10, 0),
|
|
144
|
+
datetime(2024, 1, 1, 10, 5),
|
|
145
|
+
datetime(2024, 1, 1, 10, 7),
|
|
146
|
+
datetime(2024, 1, 1, 10, 15),
|
|
147
|
+
],
|
|
148
|
+
})
|
|
149
|
+
|
|
150
|
+
result = df.select(
|
|
151
|
+
sgt.sgt_transform(
|
|
152
|
+
"session_id",
|
|
153
|
+
"event",
|
|
154
|
+
time_col="time",
|
|
155
|
+
deltatime="m", # unit: minutes
|
|
156
|
+
kappa=3,
|
|
157
|
+
)
|
|
158
|
+
)
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
### Lazy Evaluation & Streaming
|
|
162
|
+
|
|
163
|
+
```python
|
|
164
|
+
result = (
|
|
165
|
+
pl.scan_csv("large_sequences.csv")
|
|
166
|
+
.with_columns(pl.col("timestamp").str.to_datetime())
|
|
167
|
+
.select(
|
|
168
|
+
sgt.sgt_transform(
|
|
169
|
+
"user_id",
|
|
170
|
+
"action",
|
|
171
|
+
time_col="timestamp",
|
|
172
|
+
kappa=2,
|
|
173
|
+
deltatime="h",
|
|
174
|
+
)
|
|
175
|
+
)
|
|
176
|
+
.collect(engine="streaming")
|
|
177
|
+
)
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
---
|
|
181
|
+
|
|
182
|
+
## API Reference
|
|
183
|
+
|
|
184
|
+
### `sgt.sgt_transform_df`
|
|
185
|
+
The recommended high-level entry point. Returns a wide-format DataFrame.
|
|
186
|
+
|
|
187
|
+
- `df`: Input DataFrame or LazyFrame.
|
|
188
|
+
- `sequence_id_col`: Column(s) identifying sequences.
|
|
189
|
+
- `state_col`: Column containing states/events.
|
|
190
|
+
- `time_col`: Optional timestamp column.
|
|
191
|
+
- `group_cols`: Optional column(s) to group by before SGT.
|
|
192
|
+
- `kappa`: Maximum n-gram size.
|
|
193
|
+
- `mode`: Normalization (`"l1"`, `"l2"`, `"none"`).
|
|
194
|
+
- `time_penalty`: Decay function (`"inverse"`, `"exponential"`, `"linear"`, `"power"`, `"none"`).
|
|
195
|
+
|
|
196
|
+
### `sgt.sgt_transform` (Expression)
|
|
197
|
+
Returns a struct with `sequence_id`, `ngram_keys`, and `value`.
|
|
198
|
+
|
|
199
|
+
```python
|
|
200
|
+
df.select(
|
|
201
|
+
sgt.sgt_transform("user", "action", kappa=2).alias("sgt")
|
|
202
|
+
).unnest("sgt")
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
---
|
|
206
|
+
|
|
207
|
+
## Author & Acknowledgments
|
|
208
|
+
|
|
209
|
+
**Author:** Zedd (lytran14789@gmail.com)
|
|
210
|
+
|
|
211
|
+
**Special Thanks:** Built upon [polars-xdt](https://github.com/MarcoGorelli/polars-xdt) by [Marco Gorelli](https://github.com/MarcoGorelli).
|
|
212
|
+
|
|
213
|
+
## License
|
|
214
|
+
|
|
215
|
+
MIT
|
|
216
|
+
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
# polars-sgt
|
|
2
|
+
|
|
3
|
+
## Sequence Graph Transform for Polars
|
|
4
|
+
|
|
5
|
+
[![PyPI version](https://badge.fury.io/py/polars-sgt.svg)](https://badge.fury.io/py/polars-sgt)
|
|
6
|
+
|
|
7
|
+
Transform sequential data into powerful n-gram representations with [Polars](https://www.pola.rs/).
|
|
8
|
+
|
|
9
|
+
**polars-sgt** brings Sequence Graph Transform (SGT) to Polars, enabling you to:
|
|
10
|
+
- ✅ **Transform** sequences into weighted n-gram features
|
|
11
|
+
- ✅ **Grouped Analysis**: Apply SGT across subsets (e.g., by direction, metric) and merge into a single wide DataFrame
|
|
12
|
+
- ✅ **Billion-Row Scale**: Optimized Rust implementation with O(1) time weight lookups
|
|
13
|
+
- ✅ **Temporal Dynamics**: Capture patterns with multiple decay functions across all n-gram transitions
|
|
14
|
+
- ✅ **Flexible**: Support for datetime, date, duration, and numeric time columns
|
|
15
|
+
- ✅ **Lazy & Parallel**: Fully compatible with Polars lazy evaluation and Rayon-backed parallel processing
|
|
16
|
+
|
|
17
|
+
## What is SGT?
|
|
18
|
+
|
|
19
|
+
Sequence Graph Transform converts sequential data (like user clickstreams, sensor readings, or transaction histories) into weighted n-gram representations. Unlike traditional n-grams, SGT captures:
|
|
20
|
+
|
|
21
|
+
- **Sequential patterns**: Multi-transition dependencies (unigrams, bigrams, trigrams, ...)
|
|
22
|
+
- **Temporal dynamics**: Weights decay based on time gaps between events.
|
|
23
|
+
- **Normalized features**: L1/L2 normalization for machine-learning-ready feature spaces.
|
|
24
|
+
|
|
25
|
+
---
|
|
26
|
+
|
|
27
|
+
## Performance at Scale
|
|
28
|
+
|
|
29
|
+
Optimized for processing billions of rows:
|
|
30
|
+
- **O(1) Weight Calculation**: Uses cumulative product prefix arrays to calculate multi-transition time weights in constant time.
|
|
31
|
+
- **Zero-Cost Abstraction**: Written in Rust with Rayon for automatic multi-core utilization.
|
|
32
|
+
- **Memory Efficient**: Leverages Polars' Arrow-backed memory management.
|
|
33
|
+
|
|
34
|
+
---
|
|
35
|
+
|
|
36
|
+
## Installation
|
|
37
|
+
|
|
38
|
+
```console
|
|
39
|
+
pip install polars-sgt
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## Quick Start
|
|
43
|
+
|
|
44
|
+
### 1. High-Level API: `sgt_transform_df`
|
|
45
|
+
|
|
46
|
+
The `sgt_transform_df` function is the easiest way to generate SGT features. It handles unnesting, exploding, and pivoting into a wide format automatically.
|
|
47
|
+
|
|
48
|
+
#### Single Group (Default)
|
|
49
|
+
```python
|
|
50
|
+
import polars as pl
|
|
51
|
+
import polars_sgt as sgt
|
|
52
|
+
|
|
53
|
+
df = pl.DataFrame({
|
|
54
|
+
"user_id": ["A", "A", "A", "B", "B"],
|
|
55
|
+
"action": ["login", "view", "purchase", "login", "view"],
|
|
56
|
+
"time": [1, 2, 10, 1, 5],
|
|
57
|
+
})
|
|
58
|
+
|
|
59
|
+
# Generate wide-format features merged into one DataFrame
|
|
60
|
+
features = sgt.sgt_transform_df(
|
|
61
|
+
df,
|
|
62
|
+
sequence_id_col="user_id",
|
|
63
|
+
state_col="action",
|
|
64
|
+
time_col="time",
|
|
65
|
+
kappa=2
|
|
66
|
+
)
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
#### Grouped Sequence Analysis
|
|
70
|
+
Calculate separate SGT features for different groups (e.g., event types or directions) and merge them into one wide DataFrame.
|
|
71
|
+
|
|
72
|
+
```python
|
|
73
|
+
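# Calculate SGT features for each 'direction' and 'metric'.
# NOTE: `df` must also contain 'direction' and 'metric' columns
# (the sample frame above does not define them).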
|
|
74
|
+
result = sgt.sgt_transform_df(
|
|
75
|
+
df,
|
|
76
|
+
sequence_id_col="user_id",
|
|
77
|
+
state_col="action",
|
|
78
|
+
time_col="time",
|
|
79
|
+
group_cols=["direction", "metric"],
|
|
80
|
+
kappa=3,
|
|
81
|
+
time_penalty="exponential",
|
|
82
|
+
alpha=0.7,
|
|
83
|
+
group_name="analysis"
|
|
84
|
+
)
|
|
85
|
+
# Columns: ['user_id', 'analysis-buy-p_login', 'analysis-sell-p_login', ...]
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
### 2. Expression API: `sgt_transform`
|
|
89
|
+
|
|
90
|
+
For more control or integration into complex pipelines, use the expression-based API.
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
# Basic expression usage (returns a struct)
|
|
94
|
+
result = df.select(
|
|
95
|
+
sgt.sgt_transform(
|
|
96
|
+
"user_id",
|
|
97
|
+
"action",
|
|
98
|
+
time_col="time",
|
|
99
|
+
kappa=2,
|
|
100
|
+
time_penalty="exponential",
|
|
101
|
+
alpha=0.1,
|
|
102
|
+
mode="l1"
|
|
103
|
+
).alias("sgt_features")
|
|
104
|
+
)
|
|
105
|
+
|
|
106
|
+
# Extract and explode
|
|
107
|
+
features = result.select([
|
|
108
|
+
pl.col("sgt_features").struct.field("sequence_id"),
|
|
109
|
+
pl.col("sgt_features").struct.field("ngram_keys").alias("ngrams"),
|
|
110
|
+
pl.col("sgt_features").struct.field("value").alias("weights"),
|
|
111
|
+
]).explode(["ngrams", "weights"])
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
### With DateTime Columns
|
|
115
|
+
|
|
116
|
+
```python
|
|
117
|
+
from datetime import datetime
|
|
118
|
+
|
|
119
|
+
df = pl.DataFrame({
|
|
120
|
+
"session_id": ["A", "A", "A", "A"],
|
|
121
|
+
"event": ["start", "click", "scroll", "exit"],
|
|
122
|
+
"time": [
|
|
123
|
+
datetime(2024, 1, 1, 10, 0),
|
|
124
|
+
datetime(2024, 1, 1, 10, 5),
|
|
125
|
+
datetime(2024, 1, 1, 10, 7),
|
|
126
|
+
datetime(2024, 1, 1, 10, 15),
|
|
127
|
+
],
|
|
128
|
+
})
|
|
129
|
+
|
|
130
|
+
result = df.select(
|
|
131
|
+
sgt.sgt_transform(
|
|
132
|
+
"session_id",
|
|
133
|
+
"event",
|
|
134
|
+
time_col="time",
|
|
135
|
+
deltatime="m", # unit: minutes
|
|
136
|
+
kappa=3,
|
|
137
|
+
)
|
|
138
|
+
)
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
### Lazy Evaluation & Streaming
|
|
142
|
+
|
|
143
|
+
```python
|
|
144
|
+
result = (
|
|
145
|
+
pl.scan_csv("large_sequences.csv")
|
|
146
|
+
.with_columns(pl.col("timestamp").str.to_datetime())
|
|
147
|
+
.select(
|
|
148
|
+
sgt.sgt_transform(
|
|
149
|
+
"user_id",
|
|
150
|
+
"action",
|
|
151
|
+
time_col="timestamp",
|
|
152
|
+
kappa=2,
|
|
153
|
+
deltatime="h",
|
|
154
|
+
)
|
|
155
|
+
)
|
|
156
|
+
.collect(engine="streaming")
|
|
157
|
+
)
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
---
|
|
161
|
+
|
|
162
|
+
## API Reference
|
|
163
|
+
|
|
164
|
+
### `sgt.sgt_transform_df`
|
|
165
|
+
The recommended high-level entry point. Returns a wide-format DataFrame.
|
|
166
|
+
|
|
167
|
+
- `df`: Input DataFrame or LazyFrame.
|
|
168
|
+
- `sequence_id_col`: Column(s) identifying sequences.
|
|
169
|
+
- `state_col`: Column containing states/events.
|
|
170
|
+
- `time_col`: Optional timestamp column.
|
|
171
|
+
- `group_cols`: Optional column(s) to group by before SGT.
|
|
172
|
+
- `kappa`: Maximum n-gram size.
|
|
173
|
+
- `mode`: Normalization (`"l1"`, `"l2"`, `"none"`).
|
|
174
|
+
- `time_penalty`: Decay function (`"inverse"`, `"exponential"`, `"linear"`, `"power"`, `"none"`).
|
|
175
|
+
|
|
176
|
+
### `sgt.sgt_transform` (Expression)
|
|
177
|
+
Returns a struct with `sequence_id`, `ngram_keys`, and `value`.
|
|
178
|
+
|
|
179
|
+
```python
|
|
180
|
+
df.select(
|
|
181
|
+
sgt.sgt_transform("user", "action", kappa=2).alias("sgt")
|
|
182
|
+
).unnest("sgt")
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
---
|
|
186
|
+
|
|
187
|
+
## Author & Acknowledgments
|
|
188
|
+
|
|
189
|
+
**Author:** Zedd (lytran14789@gmail.com)
|
|
190
|
+
|
|
191
|
+
**Special Thanks:** Built upon [polars-xdt](https://github.com/MarcoGorelli/polars-xdt) by [Marco Gorelli](https://github.com/MarcoGorelli).
|
|
192
|
+
|
|
193
|
+
## License
|
|
194
|
+
|
|
195
|
+
MIT
|
|
@@ -11,6 +11,7 @@ from polars_sgt.functions import (
|
|
|
11
11
|
month_delta,
|
|
12
12
|
month_name,
|
|
13
13
|
sgt_transform,
|
|
14
|
+
sgt_transform_df,
|
|
14
15
|
to_julian_date,
|
|
15
16
|
to_local_datetime,
|
|
16
17
|
)
|
|
@@ -30,6 +31,7 @@ __all__ = [
|
|
|
30
31
|
"month_delta",
|
|
31
32
|
"month_name",
|
|
32
33
|
"sgt_transform",
|
|
34
|
+
"sgt_transform_df",
|
|
33
35
|
"to_julian_date",
|
|
34
36
|
"to_local_datetime",
|
|
35
37
|
]
|