PyPI - polars-sgt - Versions diffs - 0.1.0__tar.gz → 0.2.5__tar.gz - Mend

polars-sgt 0.1.0__tar.gz → 0.2.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (57) hide show

polars_sgt-0.2.5/CHANGELOG.md ADDED Viewed

@@ -0,0 +1,30 @@
+# Changelog
+All notable changes to this project will be documented in this file.
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+## [0.2.5] - 2026-02-04
+### Added
+- `use_tqdm` parameter to `sgt_transform_df` to control progress bar visibility.
+- `keep_original_name` parameter to `sgt_transform_df` to optionally restore original sequence ID names.
+- Support for multiple columns in `sequence_id_col` in `sgt_transform_df` (automatically concatenates and splits).
+### Fixed
+- `sgt_transform_df` now correctly handles `group_cols=None` by processing the entire DataFrame.
+- `sgt_transform_df` now correctly filters subsets dynamically based on unique values of `group_cols` instead of hardcoded columns.
+## [0.2.0] - 2026-02-02
+### Added
+- Parallel processing support with `rayon` for SGT transform.
+- Support for custom output struct field names via `sequence_id_name` and `state_name` parameters.
+### Changed
+- **Major Performance Optimization**: Rewrote SGT transform to use O(n) group-based indexing instead of O(n*m) scanning. Throughput increased to ~1.4M+ records/second.
+- **Struct Field Rename (BREAKING)**: Renamed `ngram_values` field in the output struct to `value` for consistency with current Polars version and parameter names.
+### Fixed
+- Performance bottleneck on large datasets (10M+ records).

{polars_sgt-0.1.0 → polars_sgt-0.2.5}/Cargo.lock RENAMED Viewed

@@ -2010,7 +2010,7 @@ dependencies = [
 [[package]]
 name = "polars_sgt"
-version = "0.1.0"
+version = "0.2.5"
 dependencies = [
  "chrono",
  "chrono-tz",
@@ -2019,6 +2019,7 @@ dependencies = [
  "polars-ops",
  "pyo3",
  "pyo3-polars",
+ "rayon",
  "serde",
 ]

{polars_sgt-0.1.0 → polars_sgt-0.2.5}/Cargo.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [package]
 name = "polars_sgt"
-version = "0.1.0"
+version = "0.2.5"
 edition = "2021"
 authors = ["Zedd <lytran14789@gmail.com>", "Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com>"]
 readme = "README.md"
@@ -19,4 +19,5 @@ chrono-tz = "0.10.4"
 polars = { version = "0.52.0", features = ["strings", "timezones"]}
 polars-ops = { version = "0.52.0", default-features = false }
 polars-arrow = { version = "0.52.0", default-features = false }
+rayon = "1.10"

{polars_sgt-0.1.0 → polars_sgt-0.2.5}/PKG-INFO RENAMED Viewed

@@ -1,12 +1,13 @@
 Metadata-Version: 2.4
 Name: polars-sgt
-Version: 0.1.0
+Version: 0.2.5
 Classifier: Programming Language :: Rust
 Classifier: Programming Language :: Python :: Implementation :: CPython
 Classifier: Programming Language :: Python :: Implementation :: PyPy
 Requires-Dist: maturin>=1.11.5
 Requires-Dist: polars>=1.36.1
 Requires-Dist: pytest>=8.4.2
+Requires-Dist: tqdm>=4.66.0
 License-File: LICENSE
 Summary: Sequence Graph Transform (SGT) for Polars - Transform sequential data into weighted n-gram representations
 Author-email: Zedd <lytran14789@gmail.com>, Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com>
@@ -91,10 +92,30 @@ result = df.select(
 features = result.select([
     pl.col("sgt_features").struct.field("sequence_id"),
     pl.col("sgt_features").struct.field("ngram_keys").alias("ngrams"),
-    pl.col("sgt_features").struct.field("ngram_values").alias("weights"),
+    pl.col("sgt_features").struct.field("value").alias("weights"),
 ]).explode(["ngrams", "weights"])
 print(features)
+#OR
+result = df.select(
+    sgt.sgt_transform(
+        "session_id",
+        "event",
+        time_col="time",
+        deltatime="m",  # minutes
+        kappa=3,  # trigrams
+        time_penalty="inverse",
+        mode="l2",
+        alpha=0.5
+    ).alias("struct_type")
+)
+out = (
+    result
+    .unnest("struct_type")
+    .explode(["ngram_keys", "value"])
+    .filter(pl.col("ngram_keys").str.split("->").list.len() > 0)
+)
 ```
 ### With DateTime Columns
@@ -180,7 +201,7 @@ result = (
 Returns a Struct with three fields:
 - `sequence_id`: Original sequence identifier
 - `ngram_keys`: List of n-gram strings (e.g., "login -> view -> purchase")
-- `ngram_values`: List of corresponding weights
+- `value`: List of corresponding weights
 ## Additional DateTime Utilities

{polars_sgt-0.1.0 → polars_sgt-0.2.5}/README.md RENAMED Viewed

@@ -72,10 +72,30 @@ result = df.select(
 features = result.select([
     pl.col("sgt_features").struct.field("sequence_id"),
     pl.col("sgt_features").struct.field("ngram_keys").alias("ngrams"),
-    pl.col("sgt_features").struct.field("ngram_values").alias("weights"),
+    pl.col("sgt_features").struct.field("value").alias("weights"),
 ]).explode(["ngrams", "weights"])
 print(features)
+#OR
+result = df.select(
+    sgt.sgt_transform(
+        "session_id",
+        "event",
+        time_col="time",
+        deltatime="m",  # minutes
+        kappa=3,  # trigrams
+        time_penalty="inverse",
+        mode="l2",
+        alpha=0.5
+    ).alias("struct_type")
+)
+out = (
+    result
+    .unnest("struct_type")
+    .explode(["ngram_keys", "value"])
+    .filter(pl.col("ngram_keys").str.split("->").list.len() > 0)
+)
 ```
 ### With DateTime Columns
@@ -161,7 +181,7 @@ result = (
 Returns a Struct with three fields:
 - `sequence_id`: Original sequence identifier
 - `ngram_keys`: List of n-gram strings (e.g., "login -> view -> purchase")
-- `ngram_values`: List of corresponding weights
+- `value`: List of corresponding weights
 ## Additional DateTime Utilities

{polars_sgt-0.1.0 → polars_sgt-0.2.5}/polars_sgt/__init__.py RENAMED Viewed

@@ -11,6 +11,7 @@ from polars_sgt.functions import (
     month_delta,
     month_name,
     sgt_transform,
+    sgt_transform_df,
     to_julian_date,
     to_local_datetime,
 )
@@ -30,6 +31,7 @@ __all__ = [
     "month_delta",
     "month_name",
     "sgt_transform",
+    "sgt_transform_df",
     "to_julian_date",
     "to_local_datetime",
 ]

{polars_sgt-0.1.0 → polars_sgt-0.2.5}/polars_sgt/functions.py RENAMED Viewed

@@ -3,7 +3,7 @@ from __future__ import annotations
 import sys
 from datetime import date
 from pathlib import Path
-from typing import TYPE_CHECKING, Literal
+from typing import TYPE_CHECKING, Literal, Union, Any, Iterable
 import polars as pl
 from polars.plugins import register_plugin_function
@@ -675,7 +675,7 @@ def arg_previous_greater(expr: IntoExprColumn) -> pl.Expr:
 def sgt_transform(
-    sequence_id_col: IntoExprColumn,
+    sequence_id_col: Union[IntoExprColumn, Iterable[IntoExprColumn]],
     state_col: IntoExprColumn,
     time_col: IntoExprColumn | None = None,
     *,
@@ -696,7 +696,7 @@ def sgt_transform(
     Parameters
     ----------
     sequence_id_col
-        Column name containing sequence identifiers (groups)
+        Column or list of name/pl.col/pl.series containing sequence identifiers (groups)
     state_col
         Column name containing state/event values
     time_col
@@ -740,7 +740,7 @@ def sgt_transform(
         Struct expression containing:
         - sequence_id: Original sequence identifier
         - ngram_keys: List of n-gram strings
-        - ngram_values: List of corresponding weights
+        - value: List of corresponding weights
     Examples
     --------
@@ -821,7 +821,7 @@ def sgt_transform(
     >>> df_features = result.select([
     ...     pl.col("sgt_result").struct.field("sequence_id"),
     ...     pl.col("sgt_result").struct.field("ngram_keys").alias("ngrams"),
-    ...     pl.col("sgt_result").struct.field("ngram_values").alias("weights"),
+    ...     pl.col("sgt_result").struct.field("value").alias("weights"),
     ... ]).explode(["ngrams", "weights"])
     Notes
@@ -833,7 +833,12 @@ def sgt_transform(
     - Missing values in time columns are treated as 0
     """
-    sequence_id_col = parse_into_expr(sequence_id_col)
+    # check if col is iterable
+    if isinstance(sequence_id_col, Iterable) and not isinstance(sequence_id_col, str):
+        sequence_id_cols = [parse_into_expr(col) for col in sequence_id_col]
+        sequence_id_col = pl.concat_str(sequence_id_cols, separator="--")
+    else:
+        sequence_id_col = parse_into_expr(sequence_id_col)
     state_col = parse_into_expr(state_col)
     if time_col is not None:
@@ -855,5 +860,206 @@ def sgt_transform(
             "alpha": alpha,
             "beta": beta,
             "deltatime": deltatime,
+            "sequence_id_name": None,
+            "state_name": None,
         },
     )
+from tqdm import tqdm
+def clean_name(n:str):
+    new_n = n.strip("{}").replace('"', '').replace(',', '--').replace(" ", "").lower()
+    return new_n
+def clean_column_name(df: pl.DataFrame)->pl.DataFrame:
+    cols = [clean_name(c) for c in df.columns]
+    return df.rename(
+        {
+            oc: c for oc, c in zip(df.columns, cols)
+        }
+    )
+def sgt_transform_df(
+    df: Union[pl.DataFrame, pl.LazyFrame],
+    sequence_id_col: Union[IntoExprColumn, Iterable[IntoExprColumn]],
+    state_col: IntoExprColumn,
+    time_col: IntoExprColumn | None = None,
+    group_cols: Union[IntoExprColumn, Iterable[IntoExprColumn]] | None = None,
+    *,
+    kappa: int = 1,
+    length_sensitive: bool = False,
+    mode: Literal["l1", "l2", "none"] = "l1",
+    time_penalty: Literal["inverse", "exponential", "linear", "power", "none"] = "inverse",
+    alpha: float = 1.0,
+    beta: float = 2.0,
+    deltatime: Literal["s", "m", "h", "d", "w", "month", "q", "y"] | None = None,
+    group_name: str = "sgt_",
+    use_tqdm: bool = True,
+    keep_original_name: bool = True,
+) -> Union[pl.DataFrame, dict[Any, pl.DataFrame]]:
+    """
+    Apply SGT transform to a DataFrame, optionally grouped by columns.
+    Parameters
+    ----------
+    df
+        Input DataFrame or LazyFrame
+    sequence_id_col
+        Column(s) identifying sequences. If multiple columns are provided,
+        they will be concatenated for processing and optionally restored.
+    state_col
+        Column containing states/events
+    time_col
+        Optional column containing timestamps
+    group_cols
+        Column(s) to group by before applying SGT.
+        If None, applies SGT to the whole DataFrame (or by existing sequence_id).
+        If provided, the DataFrame is split into subsets based on unique values of these columns.
+    kappa
+        SGT kappa parameter
+    length_sensitive
+        SGT length_sensitive parameter
+    mode
+        SGT mode parameter
+    time_penalty
+        SGT time_penalty parameter
+    alpha
+        SGT alpha parameter
+    beta
+        SGT beta parameter
+    deltatime
+        SGT deltatime parameter
+    group_name
+        Prefix for keys in the returned dictionary when group_cols is used.
+    use_tqdm
+        Whether to show a progress bar when iterating over groups.
+    keep_original_name
+        If True, and sequence_id_col was multiple columns, split the concatenated ID
+        back into original columns in the result.
+    Returns
+    -------
+    Union[pl.DataFrame, dict[Any, pl.DataFrame]]
+        If group_cols is None, returns a single DataFrame with SGT features.
+        If group_cols is provided, returns a dictionary where keys map to group values
+        and values are DataFrames with SGT features.
+    """
+    # Handle multiple sequence ID columns
+    is_multi_seq = False
+    original_seq_cols = []
+    if isinstance(sequence_id_col, (list, tuple)) and not isinstance(sequence_id_col, str):
+         is_multi_seq = True
+         original_seq_cols = [str(c) for c in sequence_id_col]
+    # If no grouping is requested, just run sgt_transform on the whole DF
+    if group_cols is None:
+        result = df.select(
+            sgt_transform(
+                sequence_id_col,
+                state_col,
+                time_col=time_col,
+                deltatime=deltatime,
+                kappa=kappa,
+                length_sensitive=length_sensitive,
+                mode=mode,
+                time_penalty=time_penalty,
+                alpha=alpha,
+                beta=beta,
+            ).alias("struct_type")
+        )
+        out = (
+            result
+            .unnest("struct_type")
+            .explode(["ngram_keys", "value"])
+        )
+        # Pivot to get features as columns
+        # Note: sequence_id in output is named "sequence_id" from the struct
+        df_sub = out.pivot(on="ngram_keys", index="sequence_id", values="value")
+        df_sub = clean_column_name(df_sub)
+        if keep_original_name:
+             # Identify the sequence id column in the result
+             # It comes out as "sequence_id" from the struct
+             if is_multi_seq:
+                 # Split the "sequence_id" column back into original cols
+                 # Assuming "--" separator as used in sgt_transform
+                 split_exprs = [
+                     pl.col("sequence_id").str.split_exact("--", len(original_seq_cols) - 1)
+                     .struct.field(f"field_{i}")
+                     .alias(col_name)
+                     for i, col_name in enumerate(original_seq_cols)
+                 ]
+                 df_sub = df_sub.with_columns(split_exprs).drop("sequence_id")
+             elif isinstance(sequence_id_col, str) and sequence_id_col != "sequence_id":
+                 # Rename back to original name if it's a single string col
+                 df_sub = df_sub.rename({"sequence_id": sequence_id_col})
+        return df_sub
+    # If grouping is requested
+    if isinstance(group_cols, str):
+        group_cols = [group_cols]
+    # Get unique combinations of group columns
+    subset_filters = df.select(group_cols).unique().to_dicts()
+    dfs = {}
+    iterator = tqdm(subset_filters, desc=f"Calculate SGT for each sub df in {group_name}") if use_tqdm else subset_filters
+    for i in iterator:
+        # Create filter expression
+        filter_expr = pl.lit(True)
+        key_parts = []
+        for col_name, val in i.items():
+            filter_expr &= (pl.col(col_name) == val)
+            key_parts.append(str(val))
+        key = f"{group_name}{'-'.join(key_parts)}"
+        dfsub = df.filter(filter_expr)
+        result = dfsub.select(
+            sgt_transform(
+                sequence_id_col,
+                state_col,
+                time_col=time_col,
+                deltatime=deltatime,
+                kappa=kappa,
+                length_sensitive=length_sensitive,
+                mode=mode,
+                time_penalty=time_penalty,
+                alpha=alpha,
+                beta=beta,
+            ).alias("struct_type")
+        )
+        out = (
+            result
+            .unnest("struct_type")
+            .explode(["ngram_keys", "value"])
+            .with_columns(pl.lit(key).alias("kind"))
+        )
+        # Pivot
+        df_sub = out.pivot(on="ngram_keys", index=["kind", "sequence_id"], values="value")
+        df_sub = clean_column_name(df_sub)
+        if keep_original_name:
+             if is_multi_seq:
+                 # Split the "sequence_id" column back into original cols
+                 split_exprs = [
+                     pl.col("sequence_id").str.split_exact("--", len(original_seq_cols) - 1)
+                     .struct.field(f"field_{i}")
+                     .alias(col_name)
+                     for i, col_name in enumerate(original_seq_cols)
+                 ]
+                 df_sub = df_sub.with_columns(split_exprs).drop("sequence_id")
+             elif isinstance(sequence_id_col, str) and sequence_id_col != "sequence_id":
+                 df_sub = df_sub.rename({"sequence_id": sequence_id_col})
+        dfs[key] = df_sub
+    return dfs

{polars_sgt-0.1.0 → polars_sgt-0.2.5}/pyproject.toml RENAMED Viewed

@@ -22,6 +22,7 @@ dependencies = [
     "maturin>=1.11.5",
     "polars>=1.36.1",
     "pytest>=8.4.2",
+    "tqdm>=4.66.0",
 ]
 [project.urls]

{polars_sgt-0.1.0 → polars_sgt-0.2.5}/src/expressions.rs RENAMED Viewed

@@ -30,6 +30,8 @@ pub struct SgtTransformKwargs {
     alpha: f64,
     beta: f64,
     deltatime: Option<String>,
+    sequence_id_name: Option<String>,
+    state_name: Option<String>,
 }
 pub fn to_local_datetime_output(input_fields: &[Field]) -> PolarsResult<Field> {
@@ -122,7 +124,7 @@ fn sgt_transform_output(_input_fields: &[Field]) -> PolarsResult<Field> {
     let fields = vec![
         Field::new(PlSmallStr::from_str("sequence_id"), DataType::String),
         Field::new(PlSmallStr::from_str("ngram_keys"), DataType::List(Box::new(DataType::String))),
-        Field::new(PlSmallStr::from_str("ngram_values"), DataType::List(Box::new(DataType::Float64))),
+        Field::new(PlSmallStr::from_str("value"), DataType::List(Box::new(DataType::Float64))),
     ];
     Ok(Field::new(
         PlSmallStr::from_str("sgt_result"),
@@ -141,5 +143,7 @@ fn sgt_transform(inputs: &[Series], kwargs: SgtTransformKwargs) -> PolarsResult<
         kwargs.alpha,
         kwargs.beta,
         kwargs.deltatime.as_deref(),
+        kwargs.sequence_id_name.as_deref(),
+        kwargs.state_name.as_deref(),
     )
 }

polars-sgt 0.1.0__tar.gz → 0.2.5__tar.gz

polars-sgt 0.1.0__tar.gz → 0.2.5__tar.gz