PyPI - polars-sgt - Versions diffs - 0.2.0__tar.gz → 0.2.5__tar.gz - Mend

polars-sgt 0.2.0tar.gz → 0.2.5tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (56) hide show

{polars_sgt-0.2.0 → polars_sgt-0.2.5}/CHANGELOG.md RENAMED Viewed

@@ -5,6 +5,17 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+## [0.2.5] - 2026-02-04
+### Added
+- `use_tqdm` parameter to `sgt_transform_df` to control progress bar visibility.
+- `keep_original_name` parameter to `sgt_transform_df` to optionally restore original sequence ID names.
+- Support for multiple columns in `sequence_id_col` in `sgt_transform_df` (automatically concatenates and splits).
+### Fixed
+- `sgt_transform_df` now correctly handles `group_cols=None` by processing the entire DataFrame.
+- `sgt_transform_df` now correctly filters subsets dynamically based on unique values of `group_cols` instead of hardcoded columns.
 ## [0.2.0] - 2026-02-02
 ### Added

{polars_sgt-0.2.0 → polars_sgt-0.2.5}/Cargo.lock RENAMED Viewed

@@ -2010,7 +2010,7 @@ dependencies = [
 [[package]]
 name = "polars_sgt"
-version = "0.2.0"
+version = "0.2.5"
 dependencies = [
  "chrono",
  "chrono-tz",

{polars_sgt-0.2.0 → polars_sgt-0.2.5}/Cargo.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [package]
 name = "polars_sgt"
-version = "0.2.0"
+version = "0.2.5"
 edition = "2021"
 authors = ["Zedd <lytran14789@gmail.com>", "Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com>"]
 readme = "README.md"

{polars_sgt-0.2.0 → polars_sgt-0.2.5}/PKG-INFO RENAMED Viewed

@@ -1,12 +1,13 @@
 Metadata-Version: 2.4
 Name: polars-sgt
-Version: 0.2.0
+Version: 0.2.5
 Classifier: Programming Language :: Rust
 Classifier: Programming Language :: Python :: Implementation :: CPython
 Classifier: Programming Language :: Python :: Implementation :: PyPy
 Requires-Dist: maturin>=1.11.5
 Requires-Dist: polars>=1.36.1
 Requires-Dist: pytest>=8.4.2
+Requires-Dist: tqdm>=4.66.0
 License-File: LICENSE
 Summary: Sequence Graph Transform (SGT) for Polars - Transform sequential data into weighted n-gram representations
 Author-email: Zedd <lytran14789@gmail.com>, Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com>

{polars_sgt-0.2.0 → polars_sgt-0.2.5}/polars_sgt/__init__.py RENAMED Viewed

@@ -11,6 +11,7 @@ from polars_sgt.functions import (
     month_delta,
     month_name,
     sgt_transform,
+    sgt_transform_df,
     to_julian_date,
     to_local_datetime,
 )
@@ -30,6 +31,7 @@ __all__ = [
     "month_delta",
     "month_name",
     "sgt_transform",
+    "sgt_transform_df",
     "to_julian_date",
     "to_local_datetime",
 ]

{polars_sgt-0.2.0 → polars_sgt-0.2.5}/polars_sgt/functions.py RENAMED Viewed

@@ -3,7 +3,7 @@ from __future__ import annotations
 import sys
 from datetime import date
 from pathlib import Path
-from typing import TYPE_CHECKING, Literal
+from typing import TYPE_CHECKING, Literal, Union, Any, Iterable
 import polars as pl
 from polars.plugins import register_plugin_function
@@ -675,7 +675,7 @@ def arg_previous_greater(expr: IntoExprColumn) -> pl.Expr:
 def sgt_transform(
-    sequence_id_col: IntoExprColumn,
+    sequence_id_col: Union[IntoExprColumn, Iterable[IntoExprColumn]],
     state_col: IntoExprColumn,
     time_col: IntoExprColumn | None = None,
     *,
@@ -696,7 +696,7 @@ def sgt_transform(
     Parameters
     ----------
     sequence_id_col
-        Column name containing sequence identifiers (groups)
+        Column, or list of columns (names, ``pl.col`` expressions or ``pl.Series``), containing sequence identifiers (groups)
     state_col
         Column name containing state/event values
     time_col
@@ -833,7 +833,12 @@ def sgt_transform(
     - Missing values in time columns are treated as 0
     """
-    sequence_id_col = parse_into_expr(sequence_id_col)
+    # check if col is iterable
+    if isinstance(sequence_id_col, Iterable) and not isinstance(sequence_id_col, str):
+        sequence_id_cols = [parse_into_expr(col) for col in sequence_id_col]
+        sequence_id_col = pl.concat_str(sequence_id_cols, separator="--")
+    else:
+        sequence_id_col = parse_into_expr(sequence_id_col)
     state_col = parse_into_expr(state_col)
     if time_col is not None:
@@ -859,3 +864,202 @@ def sgt_transform(
             "state_name": None,
         },
     )
+from tqdm import tqdm
+def clean_name(n:str):
+    """
+    Turn a raw n-gram label into a compact, lower-case column name.
+    Curly braces are stripped from both ends, double quotes and spaces are
+    removed, and every comma becomes the ``--`` separator.
+    """
+    # One translate pass applies the three independent character substitutions.
+    substitutions = str.maketrans({'"': None, ",": "--", " ": None})
+    return n.strip("{}").translate(substitutions).lower()
+def clean_column_name(df: pl.DataFrame)->pl.DataFrame:
+    """
+    Return ``df`` with every column renamed through ``clean_name``.
+    """
+    # Build the old-name -> cleaned-name mapping explicitly, then rename once.
+    renames = {}
+    for original in df.columns:
+        renames[original] = clean_name(original)
+    return df.rename(renames)
+def _sgt_seq_col_name(col: Any) -> str:
+    """Return the column name behind a sequence-id given as str, expression or Series."""
+    if isinstance(col, str):
+        return col
+    if isinstance(col, pl.Series):
+        return col.name
+    if isinstance(col, pl.Expr):
+        try:
+            return col.meta.output_name()
+        except pl.exceptions.ComputeError:
+            # No single output name can be derived; fall back to the repr below.
+            pass
+    return str(col)
+def _sgt_feature_frame(
+    frame: Union[pl.DataFrame, pl.LazyFrame],
+    sequence_id_col: Any,
+    state_col: IntoExprColumn,
+    time_col: IntoExprColumn | None,
+    sgt_kwargs: dict[str, Any],
+    *,
+    kind: str | None,
+    multi_seq_names: list[str] | None,
+    keep_original_name: bool,
+) -> pl.DataFrame:
+    """Run ``sgt_transform`` on ``frame`` and pivot each n-gram into its own column."""
+    result = frame.select(
+        sgt_transform(sequence_id_col, state_col, time_col=time_col, **sgt_kwargs).alias("struct_type")
+    )
+    # The pivot below is used in its eager (DataFrame) form, so lazy input is
+    # materialised here, after the projection has reduced it to the SGT struct.
+    if isinstance(result, pl.LazyFrame):
+        result = result.collect()
+    out = result.unnest("struct_type").explode(["ngram_keys", "value"])
+    index = ["sequence_id"]
+    if kind is not None:
+        # Tag every row with the group key so it survives the pivot as an index column.
+        out = out.with_columns(pl.lit(kind).alias("kind"))
+        index = ["kind", "sequence_id"]
+    df_sub = clean_column_name(out.pivot(on="ngram_keys", index=index, values="value"))
+    if not keep_original_name:
+        return df_sub
+    if multi_seq_names is not None:
+        # sgt_transform joins multiple id columns with "--"; split them back apart.
+        parts = pl.col("sequence_id").str.split_exact("--", len(multi_seq_names) - 1)
+        df_sub = df_sub.with_columns(
+            [parts.struct.field(f"field_{i}").alias(name) for i, name in enumerate(multi_seq_names)]
+        )
+        # Keep the column when one of the restored names is itself "sequence_id".
+        if "sequence_id" not in multi_seq_names:
+            df_sub = df_sub.drop("sequence_id")
+    elif isinstance(sequence_id_col, str) and sequence_id_col != "sequence_id":
+        # Rename back to the original name when a single string column was given.
+        df_sub = df_sub.rename({"sequence_id": sequence_id_col})
+    return df_sub
+def sgt_transform_df(
+    df: Union[pl.DataFrame, pl.LazyFrame],
+    sequence_id_col: Union[IntoExprColumn, Iterable[IntoExprColumn]],
+    state_col: IntoExprColumn,
+    time_col: IntoExprColumn | None = None,
+    group_cols: Union[IntoExprColumn, Iterable[IntoExprColumn]] | None = None,
+    *,
+    kappa: int = 1,
+    length_sensitive: bool = False,
+    mode: Literal["l1", "l2", "none"] = "l1",
+    time_penalty: Literal["inverse", "exponential", "linear", "power", "none"] = "inverse",
+    alpha: float = 1.0,
+    beta: float = 2.0,
+    deltatime: Literal["s", "m", "h", "d", "w", "month", "q", "y"] | None = None,
+    group_name: str = "sgt_",
+    use_tqdm: bool = True,
+    keep_original_name: bool = True,
+) -> Union[pl.DataFrame, dict[Any, pl.DataFrame]]:
+    """
+    Apply SGT transform to a DataFrame, optionally grouped by columns.
+    Parameters
+    ----------
+    df
+        Input DataFrame or LazyFrame. Lazy input is collected internally
+        because the result is pivoted eagerly; the return value is always
+        built from eager DataFrames.
+    sequence_id_col
+        Column(s) identifying sequences. If multiple columns are provided,
+        they will be concatenated (``--`` separator) for processing and
+        optionally restored.
+    state_col
+        Column containing states/events
+    time_col
+        Optional column containing timestamps
+    group_cols
+        Column(s) to group by before applying SGT.
+        If None, applies SGT to the whole DataFrame.
+        If provided, the DataFrame is split into subsets based on unique
+        values of these columns; null group values form their own subset.
+    kappa
+        SGT kappa parameter
+    length_sensitive
+        SGT length_sensitive parameter
+    mode
+        SGT mode parameter
+    time_penalty
+        SGT time_penalty parameter
+    alpha
+        SGT alpha parameter
+    beta
+        SGT beta parameter
+    deltatime
+        SGT deltatime parameter
+    group_name
+        Prefix for keys in the returned dictionary when group_cols is used.
+    use_tqdm
+        Whether to show a progress bar when iterating over groups.
+    keep_original_name
+        If True, and sequence_id_col was multiple columns, split the concatenated ID
+        back into original columns in the result (the restored columns come
+        from a string split). A single string column is renamed back from
+        ``sequence_id`` to its original name.
+    Returns
+    -------
+    Union[pl.DataFrame, dict[Any, pl.DataFrame]]
+        If group_cols is None, returns a single DataFrame with SGT features.
+        If group_cols is provided, returns a dictionary whose keys are
+        ``group_name`` followed by the group values joined with ``-`` and
+        whose values are DataFrames with SGT features. Distinct groups whose
+        joined values coincide share a key; the later one wins.
+    Raises
+    ------
+    ValueError
+        If ``sequence_id_col`` is an empty collection.
+    """
+    # Materialise generic iterables once: the value is reused for every group,
+    # which would exhaust a generator after the first subset.
+    if isinstance(sequence_id_col, Iterable) and not isinstance(sequence_id_col, (str, pl.Expr, pl.Series)):
+        sequence_id_col = list(sequence_id_col)
+    multi_seq_names = None
+    if isinstance(sequence_id_col, list):
+        if not sequence_id_col:
+            raise ValueError("sequence_id_col must contain at least one column")
+        multi_seq_names = [_sgt_seq_col_name(c) for c in sequence_id_col]
+    sgt_kwargs = {
+        "deltatime": deltatime,
+        "kappa": kappa,
+        "length_sensitive": length_sensitive,
+        "mode": mode,
+        "time_penalty": time_penalty,
+        "alpha": alpha,
+        "beta": beta,
+    }
+    # If no grouping is requested, just run sgt_transform on the whole frame
+    if group_cols is None:
+        return _sgt_feature_frame(
+            df,
+            sequence_id_col,
+            state_col,
+            time_col,
+            sgt_kwargs,
+            kind=None,
+            multi_seq_names=multi_seq_names,
+            keep_original_name=keep_original_name,
+        )
+    if isinstance(group_cols, str):
+        group_cols = [group_cols]
+    # Unique combinations of the group columns, in first-seen order so the
+    # returned dictionary is ordered deterministically.
+    groups = df.select(group_cols).unique(maintain_order=True)
+    if isinstance(groups, pl.LazyFrame):
+        groups = groups.collect()
+    subset_filters = groups.to_dicts()
+    iterator = tqdm(subset_filters, desc=f"Calculate SGT for each sub df in {group_name}") if use_tqdm else subset_filters
+    dfs = {}
+    for group_values in iterator:
+        # eq_missing treats null == null as a match, so null groups are not silently emptied.
+        predicates = [pl.col(name).eq_missing(pl.lit(value)) for name, value in group_values.items()]
+        key = f"{group_name}{'-'.join(str(value) for value in group_values.values())}"
+        dfs[key] = _sgt_feature_frame(
+            df.filter(*predicates),
+            sequence_id_col,
+            state_col,
+            time_col,
+            sgt_kwargs,
+            kind=key,
+            multi_seq_names=multi_seq_names,
+            keep_original_name=keep_original_name,
+        )
+    return dfs

{polars_sgt-0.2.0 → polars_sgt-0.2.5}/pyproject.toml RENAMED Viewed

@@ -22,6 +22,7 @@ dependencies = [
     "maturin>=1.11.5",
     "polars>=1.36.1",
     "pytest>=8.4.2",
+    "tqdm>=4.66.0",
 ]
 [project.urls]

polars_sgt-0.2.5/tests/verify_sgt.py ADDED Viewed

@@ -0,0 +1,103 @@
+import polars as pl
+import polars_sgt as xdt
+import pytest
+from datetime import date
+def test_sgt_transform_df_no_group():
+    """Without ``group_cols`` a single feature DataFrame keyed by the original id column is returned."""
+    df = pl.DataFrame({
+        "user_id": [1, 1, 2, 2],
+        "action": ["A", "B", "A", "C"],
+        "date": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 1), date(2023, 1, 3)]
+    })
+    result = xdt.sgt_transform_df(
+        df,
+        sequence_id_col="user_id",
+        state_col="action",
+        time_col="date",
+        group_cols=None,
+        use_tqdm=False
+    )
+    assert isinstance(result, pl.DataFrame)
+    assert "user_id" in result.columns
+    # sgt_transform_df lower-cases every output column name, so an upper-case
+    # "A" column can never exist; check the feature columns that are there.
+    feature_cols = [c for c in result.columns if c != "user_id"]
+    assert feature_cols
+    assert all(c == c.lower() for c in feature_cols)
+def test_sgt_transform_df_with_group():
+    """Grouping by one column yields a dict of DataFrames keyed by prefix + group value."""
+    frame = pl.DataFrame(
+        {
+            "group": ["X", "X", "Y", "Y"],
+            "user_id": [1, 1, 2, 2],
+            "action": ["A", "B", "A", "C"],
+            "date": [date(2023, 1, day) for day in (1, 2, 1, 3)],
+        }
+    )
+    per_group = xdt.sgt_transform_df(
+        frame,
+        sequence_id_col="user_id",
+        state_col="action",
+        time_col="date",
+        group_cols="group",
+        use_tqdm=False,
+        group_name="test_group_",
+    )
+    assert isinstance(per_group, dict)
+    # One entry per distinct value of the "group" column.
+    for expected_key in ("test_group_X", "test_group_Y"):
+        assert expected_key in per_group
+    assert isinstance(per_group["test_group_X"], pl.DataFrame)
+def test_sgt_transform_df_multi_seq_id():
+    """A list of sequence-id columns is split back into its original columns in the result."""
+    id_columns = ["id_part1", "id_part2"]
+    frame = pl.DataFrame(
+        {
+            "id_part1": ["u1", "u1", "u2", "u2"],
+            "id_part2": ["01", "01", "02", "02"],
+            "action": ["A", "B", "A", "C"],
+            "date": [date(2023, 1, day) for day in (1, 2, 1, 3)],
+        }
+    )
+    features = xdt.sgt_transform_df(
+        frame,
+        sequence_id_col=id_columns,
+        state_col="action",
+        time_col="date",
+        group_cols=None,
+        use_tqdm=False,
+        keep_original_name=True,
+    )
+    assert isinstance(features, pl.DataFrame)
+    for column in id_columns:
+        assert column in features.columns
+    # At least one row must carry the restored "u1" identifier.
+    u1_rows = features.filter(pl.col("id_part1") == "u1")
+    assert u1_rows.height > 0
+def test_sgt_transform_df_keep_original_name_false():
+    """With ``keep_original_name=False`` the id column keeps the generic ``sequence_id`` name."""
+    df = pl.DataFrame({
+        "user_id": [1, 1, 2, 2],
+        "action": ["A", "B", "A", "C"],
+        "date": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 1), date(2023, 1, 3)]
+    })
+    result = xdt.sgt_transform_df(
+        df,
+        sequence_id_col="user_id",
+        state_col="action",
+        time_col="date",
+        group_cols=None,
+        use_tqdm=False,
+        keep_original_name=False
+    )
+    assert isinstance(result, pl.DataFrame)
+    assert "sequence_id" in result.columns
+    # The rename back to "user_id" only happens when keep_original_name is True,
+    # so the original name must be absent here.
+    assert "user_id" not in result.columns
+if __name__ == "__main__":
+    # Allow running the checks directly, without the pytest runner.
+    for check in (
+        test_sgt_transform_df_no_group,
+        test_sgt_transform_df_with_group,
+        test_sgt_transform_df_multi_seq_id,
+        test_sgt_transform_df_keep_original_name_false,
+    ):
+        check()
+    print("All verification tests passed!")

{polars_sgt-0.2.0 → polars_sgt-0.2.5}/uv.lock RENAMED Viewed

@@ -168,6 +168,7 @@ dependencies = [
     { name = "polars", version = "1.37.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" },
     { name = "pytest", version = "8.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
     { name = "pytest", version = "9.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" },
+    { name = "tqdm" },
 ]
 [package.metadata]
@@ -175,6 +176,7 @@ requires-dist = [
     { name = "maturin", specifier = ">=1.11.5" },
     { name = "polars", specifier = ">=1.36.1" },
     { name = "pytest", specifier = ">=8.4.2" },
+    { name = "tqdm", specifier = ">=4.66.0" },
 ]
 [[package]]
@@ -282,6 +284,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/23/d1/136eb2cb77520a31e1f64cbae9d33ec6df0d78bdf4160398e86eec8a8754/tomli-2.4.0-py3-none-any.whl", hash = "sha256:1f776e7d669ebceb01dee46484485f43a4048746235e683bcdffacdf1fb4785a", size = 14477, upload-time = "2026-01-11T11:22:37.446Z" },
 ]
+[[package]]
+name = "tqdm"
+version = "4.67.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "colorama", marker = "sys_platform == 'win32'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/09/a9/6ba95a270c6f1fbcd8dac228323f2777d886cb206987444e4bce66338dd4/tqdm-4.67.3.tar.gz", hash = "sha256:7d825f03f89244ef73f1d4ce193cb1774a8179fd96f31d7e1dcde62092b960bb", size = 169598, upload-time = "2026-02-03T17:35:53.048Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/16/e1/3079a9ff9b8e11b846c6ac5c8b5bfb7ff225eee721825310c91b3b50304f/tqdm-4.67.3-py3-none-any.whl", hash = "sha256:ee1e4c0e59148062281c49d80b25b67771a127c85fc9676d3be5f243206826bf", size = 78374, upload-time = "2026-02-03T17:35:50.982Z" },
+]
 [[package]]
 name = "typing-extensions"
 version = "4.15.0"