polars-sgt 0.2.0__tar.gz → 0.2.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/CHANGELOG.md +11 -0
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/Cargo.lock +1 -1
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/Cargo.toml +1 -1
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/PKG-INFO +2 -1
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/polars_sgt/__init__.py +2 -0
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/polars_sgt/functions.py +208 -4
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/pyproject.toml +1 -0
- polars_sgt-0.2.5/tests/verify_sgt.py +103 -0
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/uv.lock +14 -0
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/.github/workflows/CI.yml +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/.gitignore +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/.python-version +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/.readthedocs.yaml +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/CODE_OF_CONDUCT.md +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/LICENSE +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/Makefile +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/README.md +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/assets/.DS_Store +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/assets/polars-business.png +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/bump_version.py +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/docs/API.rst +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/docs/Makefile +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/docs/conf.py +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/docs/index.rst +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/docs/installation.rst +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/docs/requirements-docs.txt +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/dprint.json +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/licenses/NUMPY_LICENSE.txt +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/licenses/PANDAS_LICENSE.txt +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/polars_sgt/.mypy.ini +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/polars_sgt/_internal.pyi +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/polars_sgt/namespace.py +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/polars_sgt/py.typed +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/polars_sgt/ranges.py +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/polars_sgt/typing.py +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/polars_sgt/utils.py +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/requirements.txt +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/rust-toolchain.toml +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/src/arg_previous_greater.rs +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/src/expressions.rs +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/src/format_localized.rs +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/src/lib.rs +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/src/month_delta.rs +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/src/sgt_transform.rs +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/src/timezone.rs +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/src/to_julian.rs +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/tests/__init__.py +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/tests/ceil_test.py +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/tests/julian_date_test.py +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/tests/test_benchmark.py +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/tests/test_date_range.py +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/tests/test_format_localized.py +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/tests/test_is_busday.py +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/tests/test_month_delta.py +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/tests/test_sgt_transform.py +0 -0
- {polars_sgt-0.2.0 → polars_sgt-0.2.5}/tests/test_timezone.py +0 -0
|
@@ -5,6 +5,17 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [0.2.5] - 2026-02-04
|
|
9
|
+
|
|
10
|
+
### Added
|
|
11
|
+
- `use_tqdm` parameter to `sgt_transform_df` to control progress bar visibility.
|
|
12
|
+
- `keep_original_name` parameter to `sgt_transform_df` to optionally restore original sequence ID names.
|
|
13
|
+
- Support for multiple columns in `sequence_id_col` in `sgt_transform_df` (automatically concatenates and splits).
|
|
14
|
+
|
|
15
|
+
### Fixed
|
|
16
|
+
- `sgt_transform_df` now correctly handles `group_cols=None` by processing the entire DataFrame.
|
|
17
|
+
- `sgt_transform_df` now correctly filters subsets dynamically based on unique values of `group_cols` instead of hardcoded columns.
|
|
18
|
+
|
|
8
19
|
## [0.2.0] - 2026-02-02
|
|
9
20
|
|
|
10
21
|
### Added
|
|
@@ -1,12 +1,13 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: polars-sgt
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.2.5
|
|
4
4
|
Classifier: Programming Language :: Rust
|
|
5
5
|
Classifier: Programming Language :: Python :: Implementation :: CPython
|
|
6
6
|
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
|
7
7
|
Requires-Dist: maturin>=1.11.5
|
|
8
8
|
Requires-Dist: polars>=1.36.1
|
|
9
9
|
Requires-Dist: pytest>=8.4.2
|
|
10
|
+
Requires-Dist: tqdm>=4.66.0
|
|
10
11
|
License-File: LICENSE
|
|
11
12
|
Summary: Sequence Graph Transform (SGT) for Polars - Transform sequential data into weighted n-gram representations
|
|
12
13
|
Author-email: Zedd <lytran14789@gmail.com>, Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com>
|
|
@@ -11,6 +11,7 @@ from polars_sgt.functions import (
|
|
|
11
11
|
month_delta,
|
|
12
12
|
month_name,
|
|
13
13
|
sgt_transform,
|
|
14
|
+
sgt_transform_df,
|
|
14
15
|
to_julian_date,
|
|
15
16
|
to_local_datetime,
|
|
16
17
|
)
|
|
@@ -30,6 +31,7 @@ __all__ = [
|
|
|
30
31
|
"month_delta",
|
|
31
32
|
"month_name",
|
|
32
33
|
"sgt_transform",
|
|
34
|
+
"sgt_transform_df",
|
|
33
35
|
"to_julian_date",
|
|
34
36
|
"to_local_datetime",
|
|
35
37
|
]
|
|
@@ -3,7 +3,7 @@ from __future__ import annotations
|
|
|
3
3
|
import sys
|
|
4
4
|
from datetime import date
|
|
5
5
|
from pathlib import Path
|
|
6
|
-
from typing import TYPE_CHECKING, Literal
|
|
6
|
+
from typing import TYPE_CHECKING, Literal, Union, Any, Iterable
|
|
7
7
|
|
|
8
8
|
import polars as pl
|
|
9
9
|
from polars.plugins import register_plugin_function
|
|
@@ -675,7 +675,7 @@ def arg_previous_greater(expr: IntoExprColumn) -> pl.Expr:
|
|
|
675
675
|
|
|
676
676
|
|
|
677
677
|
def sgt_transform(
|
|
678
|
-
sequence_id_col: IntoExprColumn,
|
|
678
|
+
sequence_id_col: Union[IntoExprColumn, Iterable[IntoExprColumn]],
|
|
679
679
|
state_col: IntoExprColumn,
|
|
680
680
|
time_col: IntoExprColumn | None = None,
|
|
681
681
|
*,
|
|
@@ -696,7 +696,7 @@ def sgt_transform(
|
|
|
696
696
|
Parameters
|
|
697
697
|
----------
|
|
698
698
|
sequence_id_col
|
|
699
|
-
Column name containing sequence identifiers (groups)
|
|
699
|
+
Column or list of name/pl.col/pl.series containing sequence identifiers (groups)
|
|
700
700
|
state_col
|
|
701
701
|
Column name containing state/event values
|
|
702
702
|
time_col
|
|
@@ -833,7 +833,12 @@ def sgt_transform(
|
|
|
833
833
|
- Missing values in time columns are treated as 0
|
|
834
834
|
|
|
835
835
|
"""
|
|
836
|
-
|
|
836
|
+
# check if col is iterable
|
|
837
|
+
if isinstance(sequence_id_col, Iterable) and not isinstance(sequence_id_col, str):
|
|
838
|
+
sequence_id_cols = [parse_into_expr(col) for col in sequence_id_col]
|
|
839
|
+
sequence_id_col = pl.concat_str(sequence_id_cols, separator="--")
|
|
840
|
+
else:
|
|
841
|
+
sequence_id_col = parse_into_expr(sequence_id_col)
|
|
837
842
|
state_col = parse_into_expr(state_col)
|
|
838
843
|
|
|
839
844
|
if time_col is not None:
|
|
@@ -859,3 +864,202 @@ def sgt_transform(
|
|
|
859
864
|
"state_name": None,
|
|
860
865
|
},
|
|
861
866
|
)
|
|
867
|
+
|
|
868
|
+
from tqdm import tqdm
|
|
869
|
+
def clean_name(n:str):
|
|
870
|
+
new_n = n.strip("{}").replace('"', '').replace(',', '--').replace(" ", "").lower()
|
|
871
|
+
return new_n
|
|
872
|
+
def clean_column_name(df: pl.DataFrame)->pl.DataFrame:
|
|
873
|
+
cols = [clean_name(c) for c in df.columns]
|
|
874
|
+
return df.rename(
|
|
875
|
+
{
|
|
876
|
+
oc: c for oc, c in zip(df.columns, cols)
|
|
877
|
+
}
|
|
878
|
+
)
|
|
879
|
+
def sgt_transform_df(
|
|
880
|
+
df: Union[pl.DataFrame, pl.LazyFrame],
|
|
881
|
+
sequence_id_col: Union[IntoExprColumn, Iterable[IntoExprColumn]],
|
|
882
|
+
state_col: IntoExprColumn,
|
|
883
|
+
time_col: IntoExprColumn | None = None,
|
|
884
|
+
group_cols: Union[IntoExprColumn, Iterable[IntoExprColumn]] | None = None,
|
|
885
|
+
*,
|
|
886
|
+
kappa: int = 1,
|
|
887
|
+
length_sensitive: bool = False,
|
|
888
|
+
mode: Literal["l1", "l2", "none"] = "l1",
|
|
889
|
+
time_penalty: Literal["inverse", "exponential", "linear", "power", "none"] = "inverse",
|
|
890
|
+
alpha: float = 1.0,
|
|
891
|
+
beta: float = 2.0,
|
|
892
|
+
deltatime: Literal["s", "m", "h", "d", "w", "month", "q", "y"] | None = None,
|
|
893
|
+
group_name: str = "sgt_",
|
|
894
|
+
use_tqdm: bool = True,
|
|
895
|
+
keep_original_name: bool = True,
|
|
896
|
+
) -> Union[pl.DataFrame, dict[Any, pl.DataFrame]]:
|
|
897
|
+
"""
|
|
898
|
+
Apply SGT transform to a DataFrame, optionally grouped by columns.
|
|
899
|
+
|
|
900
|
+
Parameters
|
|
901
|
+
----------
|
|
902
|
+
df
|
|
903
|
+
Input DataFrame or LazyFrame
|
|
904
|
+
sequence_id_col
|
|
905
|
+
Column(s) identifying sequences. If multiple columns are provided,
|
|
906
|
+
they will be concatenated for processing and optionally restored.
|
|
907
|
+
state_col
|
|
908
|
+
Column containing states/events
|
|
909
|
+
time_col
|
|
910
|
+
Optional column containing timestamps
|
|
911
|
+
group_cols
|
|
912
|
+
Column(s) to group by before applying SGT.
|
|
913
|
+
If None, applies SGT to the whole DataFrame (or by existing sequence_id).
|
|
914
|
+
If provided, the DataFrame is split into subsets based on unique values of these columns.
|
|
915
|
+
kappa
|
|
916
|
+
SGT kappa parameter
|
|
917
|
+
length_sensitive
|
|
918
|
+
SGT length_sensitive parameter
|
|
919
|
+
mode
|
|
920
|
+
SGT mode parameter
|
|
921
|
+
time_penalty
|
|
922
|
+
SGT time_penalty parameter
|
|
923
|
+
alpha
|
|
924
|
+
SGT alpha parameter
|
|
925
|
+
beta
|
|
926
|
+
SGT beta parameter
|
|
927
|
+
deltatime
|
|
928
|
+
SGT deltatime parameter
|
|
929
|
+
group_name
|
|
930
|
+
Prefix for keys in the returned dictionary when group_cols is used.
|
|
931
|
+
use_tqdm
|
|
932
|
+
Whether to show a progress bar when iterating over groups.
|
|
933
|
+
keep_original_name
|
|
934
|
+
If True, and sequence_id_col was multiple columns, split the concatenated ID
|
|
935
|
+
back into original columns in the result.
|
|
936
|
+
|
|
937
|
+
Returns
|
|
938
|
+
-------
|
|
939
|
+
Union[pl.DataFrame, dict[Any, pl.DataFrame]]
|
|
940
|
+
If group_cols is None, returns a single DataFrame with SGT features.
|
|
941
|
+
If group_cols is provided, returns a dictionary where keys map to group values
|
|
942
|
+
and values are DataFrames with SGT features.
|
|
943
|
+
"""
|
|
944
|
+
|
|
945
|
+
# Handle multiple sequence ID columns
|
|
946
|
+
is_multi_seq = False
|
|
947
|
+
original_seq_cols = []
|
|
948
|
+
|
|
949
|
+
if isinstance(sequence_id_col, (list, tuple)) and not isinstance(sequence_id_col, str):
|
|
950
|
+
is_multi_seq = True
|
|
951
|
+
original_seq_cols = [str(c) for c in sequence_id_col]
|
|
952
|
+
|
|
953
|
+
# If no grouping is requested, just run sgt_transform on the whole DF
|
|
954
|
+
if group_cols is None:
|
|
955
|
+
result = df.select(
|
|
956
|
+
sgt_transform(
|
|
957
|
+
sequence_id_col,
|
|
958
|
+
state_col,
|
|
959
|
+
time_col=time_col,
|
|
960
|
+
deltatime=deltatime,
|
|
961
|
+
kappa=kappa,
|
|
962
|
+
length_sensitive=length_sensitive,
|
|
963
|
+
mode=mode,
|
|
964
|
+
time_penalty=time_penalty,
|
|
965
|
+
alpha=alpha,
|
|
966
|
+
beta=beta,
|
|
967
|
+
).alias("struct_type")
|
|
968
|
+
)
|
|
969
|
+
|
|
970
|
+
out = (
|
|
971
|
+
result
|
|
972
|
+
.unnest("struct_type")
|
|
973
|
+
.explode(["ngram_keys", "value"])
|
|
974
|
+
)
|
|
975
|
+
|
|
976
|
+
# Pivot to get features as columns
|
|
977
|
+
# Note: sequence_id in output is named "sequence_id" from the struct
|
|
978
|
+
df_sub = out.pivot(on="ngram_keys", index="sequence_id", values="value")
|
|
979
|
+
df_sub = clean_column_name(df_sub)
|
|
980
|
+
|
|
981
|
+
if keep_original_name:
|
|
982
|
+
# Identify the sequence id column in the result
|
|
983
|
+
# It comes out as "sequence_id" from the struct
|
|
984
|
+
|
|
985
|
+
if is_multi_seq:
|
|
986
|
+
# Split the "sequence_id" column back into original cols
|
|
987
|
+
# Assuming "--" separator as used in sgt_transform
|
|
988
|
+
split_exprs = [
|
|
989
|
+
pl.col("sequence_id").str.split_exact("--", len(original_seq_cols) - 1)
|
|
990
|
+
.struct.field(f"field_{i}")
|
|
991
|
+
.alias(col_name)
|
|
992
|
+
for i, col_name in enumerate(original_seq_cols)
|
|
993
|
+
]
|
|
994
|
+
df_sub = df_sub.with_columns(split_exprs).drop("sequence_id")
|
|
995
|
+
elif isinstance(sequence_id_col, str) and sequence_id_col != "sequence_id":
|
|
996
|
+
# Rename back to original name if it's a single string col
|
|
997
|
+
df_sub = df_sub.rename({"sequence_id": sequence_id_col})
|
|
998
|
+
|
|
999
|
+
return df_sub
|
|
1000
|
+
|
|
1001
|
+
# If grouping is requested
|
|
1002
|
+
if isinstance(group_cols, str):
|
|
1003
|
+
group_cols = [group_cols]
|
|
1004
|
+
|
|
1005
|
+
# Get unique combinations of group columns
|
|
1006
|
+
subset_filters = df.select(group_cols).unique().to_dicts()
|
|
1007
|
+
|
|
1008
|
+
dfs = {}
|
|
1009
|
+
|
|
1010
|
+
iterator = tqdm(subset_filters, desc=f"Calculate SGT for each sub df in {group_name}") if use_tqdm else subset_filters
|
|
1011
|
+
|
|
1012
|
+
for i in iterator:
|
|
1013
|
+
# Create filter expression
|
|
1014
|
+
filter_expr = pl.lit(True)
|
|
1015
|
+
key_parts = []
|
|
1016
|
+
for col_name, val in i.items():
|
|
1017
|
+
filter_expr &= (pl.col(col_name) == val)
|
|
1018
|
+
key_parts.append(str(val))
|
|
1019
|
+
|
|
1020
|
+
key = f"{group_name}{'-'.join(key_parts)}"
|
|
1021
|
+
|
|
1022
|
+
dfsub = df.filter(filter_expr)
|
|
1023
|
+
|
|
1024
|
+
result = dfsub.select(
|
|
1025
|
+
sgt_transform(
|
|
1026
|
+
sequence_id_col,
|
|
1027
|
+
state_col,
|
|
1028
|
+
time_col=time_col,
|
|
1029
|
+
deltatime=deltatime,
|
|
1030
|
+
kappa=kappa,
|
|
1031
|
+
length_sensitive=length_sensitive,
|
|
1032
|
+
mode=mode,
|
|
1033
|
+
time_penalty=time_penalty,
|
|
1034
|
+
alpha=alpha,
|
|
1035
|
+
beta=beta,
|
|
1036
|
+
).alias("struct_type")
|
|
1037
|
+
)
|
|
1038
|
+
|
|
1039
|
+
out = (
|
|
1040
|
+
result
|
|
1041
|
+
.unnest("struct_type")
|
|
1042
|
+
.explode(["ngram_keys", "value"])
|
|
1043
|
+
.with_columns(pl.lit(key).alias("kind"))
|
|
1044
|
+
)
|
|
1045
|
+
|
|
1046
|
+
# Pivot
|
|
1047
|
+
df_sub = out.pivot(on="ngram_keys", index=["kind", "sequence_id"], values="value")
|
|
1048
|
+
df_sub = clean_column_name(df_sub)
|
|
1049
|
+
|
|
1050
|
+
if keep_original_name:
|
|
1051
|
+
if is_multi_seq:
|
|
1052
|
+
# Split the "sequence_id" column back into original cols
|
|
1053
|
+
split_exprs = [
|
|
1054
|
+
pl.col("sequence_id").str.split_exact("--", len(original_seq_cols) - 1)
|
|
1055
|
+
.struct.field(f"field_{i}")
|
|
1056
|
+
.alias(col_name)
|
|
1057
|
+
for i, col_name in enumerate(original_seq_cols)
|
|
1058
|
+
]
|
|
1059
|
+
df_sub = df_sub.with_columns(split_exprs).drop("sequence_id")
|
|
1060
|
+
elif isinstance(sequence_id_col, str) and sequence_id_col != "sequence_id":
|
|
1061
|
+
df_sub = df_sub.rename({"sequence_id": sequence_id_col})
|
|
1062
|
+
|
|
1063
|
+
dfs[key] = df_sub
|
|
1064
|
+
|
|
1065
|
+
return dfs
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
|
|
2
|
+
import polars as pl
|
|
3
|
+
import polars_sgt as xdt
|
|
4
|
+
import pytest
|
|
5
|
+
from datetime import date
|
|
6
|
+
|
|
7
|
+
def test_sgt_transform_df_no_group():
|
|
8
|
+
df = pl.DataFrame({
|
|
9
|
+
"user_id": [1, 1, 2, 2],
|
|
10
|
+
"action": ["A", "B", "A", "C"],
|
|
11
|
+
"date": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 1), date(2023, 1, 3)]
|
|
12
|
+
})
|
|
13
|
+
|
|
14
|
+
result = xdt.sgt_transform_df(
|
|
15
|
+
df,
|
|
16
|
+
sequence_id_col="user_id",
|
|
17
|
+
state_col="action",
|
|
18
|
+
time_col="date",
|
|
19
|
+
group_cols=None,
|
|
20
|
+
use_tqdm=False
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
assert isinstance(result, pl.DataFrame)
|
|
24
|
+
assert "user_id" in result.columns
|
|
25
|
+
assert "A" in result.columns # Assuming unigrams are generated as column names
|
|
26
|
+
|
|
27
|
+
def test_sgt_transform_df_with_group():
|
|
28
|
+
df = pl.DataFrame({
|
|
29
|
+
"group": ["X", "X", "Y", "Y"],
|
|
30
|
+
"user_id": [1, 1, 2, 2],
|
|
31
|
+
"action": ["A", "B", "A", "C"],
|
|
32
|
+
"date": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 1), date(2023, 1, 3)]
|
|
33
|
+
})
|
|
34
|
+
|
|
35
|
+
result = xdt.sgt_transform_df(
|
|
36
|
+
df,
|
|
37
|
+
sequence_id_col="user_id",
|
|
38
|
+
state_col="action",
|
|
39
|
+
time_col="date",
|
|
40
|
+
group_cols="group",
|
|
41
|
+
use_tqdm=False,
|
|
42
|
+
group_name="test_group_"
|
|
43
|
+
)
|
|
44
|
+
|
|
45
|
+
assert isinstance(result, dict)
|
|
46
|
+
assert "test_group_X" in result
|
|
47
|
+
assert "test_group_Y" in result
|
|
48
|
+
assert isinstance(result["test_group_X"], pl.DataFrame)
|
|
49
|
+
|
|
50
|
+
def test_sgt_transform_df_multi_seq_id():
|
|
51
|
+
df = pl.DataFrame({
|
|
52
|
+
"id_part1": ["u1", "u1", "u2", "u2"],
|
|
53
|
+
"id_part2": ["01", "01", "02", "02"],
|
|
54
|
+
"action": ["A", "B", "A", "C"],
|
|
55
|
+
"date": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 1), date(2023, 1, 3)]
|
|
56
|
+
})
|
|
57
|
+
|
|
58
|
+
result = xdt.sgt_transform_df(
|
|
59
|
+
df,
|
|
60
|
+
sequence_id_col=["id_part1", "id_part2"],
|
|
61
|
+
state_col="action",
|
|
62
|
+
time_col="date",
|
|
63
|
+
group_cols=None,
|
|
64
|
+
use_tqdm=False,
|
|
65
|
+
keep_original_name=True
|
|
66
|
+
)
|
|
67
|
+
|
|
68
|
+
assert isinstance(result, pl.DataFrame)
|
|
69
|
+
assert "id_part1" in result.columns
|
|
70
|
+
assert "id_part2" in result.columns
|
|
71
|
+
# Verify values are correct
|
|
72
|
+
assert result.filter(pl.col("id_part1") == "u1").height > 0
|
|
73
|
+
|
|
74
|
+
def test_sgt_transform_df_keep_original_name_false():
|
|
75
|
+
df = pl.DataFrame({
|
|
76
|
+
"user_id": [1, 1, 2, 2],
|
|
77
|
+
"action": ["A", "B", "A", "C"],
|
|
78
|
+
"date": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 1), date(2023, 1, 3)]
|
|
79
|
+
})
|
|
80
|
+
|
|
81
|
+
result = xdt.sgt_transform_df(
|
|
82
|
+
df,
|
|
83
|
+
sequence_id_col="user_id",
|
|
84
|
+
state_col="action",
|
|
85
|
+
time_col="date",
|
|
86
|
+
group_cols=None,
|
|
87
|
+
use_tqdm=False,
|
|
88
|
+
keep_original_name=False
|
|
89
|
+
)
|
|
90
|
+
|
|
91
|
+
assert isinstance(result, pl.DataFrame)
|
|
92
|
+
assert "sequence_id" in result.columns
|
|
93
|
+
# user_id should NOT be in columns unless it's an alias, but function renames it if keep_original_name is True.
|
|
94
|
+
# If False, it should stay as sequence_id (or whatever the pivot output is).
|
|
95
|
+
# Current implementation outputs "sequence_id" if keep_original_name is False.
|
|
96
|
+
|
|
97
|
+
if __name__ == "__main__":
|
|
98
|
+
# Manually running tests
|
|
99
|
+
test_sgt_transform_df_no_group()
|
|
100
|
+
test_sgt_transform_df_with_group()
|
|
101
|
+
test_sgt_transform_df_multi_seq_id()
|
|
102
|
+
test_sgt_transform_df_keep_original_name_false()
|
|
103
|
+
print("All verification tests passed!")
|
|
@@ -168,6 +168,7 @@ dependencies = [
|
|
|
168
168
|
{ name = "polars", version = "1.37.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" },
|
|
169
169
|
{ name = "pytest", version = "8.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
|
|
170
170
|
{ name = "pytest", version = "9.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" },
|
|
171
|
+
{ name = "tqdm" },
|
|
171
172
|
]
|
|
172
173
|
|
|
173
174
|
[package.metadata]
|
|
@@ -175,6 +176,7 @@ requires-dist = [
|
|
|
175
176
|
{ name = "maturin", specifier = ">=1.11.5" },
|
|
176
177
|
{ name = "polars", specifier = ">=1.36.1" },
|
|
177
178
|
{ name = "pytest", specifier = ">=8.4.2" },
|
|
179
|
+
{ name = "tqdm", specifier = ">=4.66.0" },
|
|
178
180
|
]
|
|
179
181
|
|
|
180
182
|
[[package]]
|
|
@@ -282,6 +284,18 @@ wheels = [
|
|
|
282
284
|
{ url = "https://files.pythonhosted.org/packages/23/d1/136eb2cb77520a31e1f64cbae9d33ec6df0d78bdf4160398e86eec8a8754/tomli-2.4.0-py3-none-any.whl", hash = "sha256:1f776e7d669ebceb01dee46484485f43a4048746235e683bcdffacdf1fb4785a", size = 14477, upload-time = "2026-01-11T11:22:37.446Z" },
|
|
283
285
|
]
|
|
284
286
|
|
|
287
|
+
[[package]]
|
|
288
|
+
name = "tqdm"
|
|
289
|
+
version = "4.67.3"
|
|
290
|
+
source = { registry = "https://pypi.org/simple" }
|
|
291
|
+
dependencies = [
|
|
292
|
+
{ name = "colorama", marker = "sys_platform == 'win32'" },
|
|
293
|
+
]
|
|
294
|
+
sdist = { url = "https://files.pythonhosted.org/packages/09/a9/6ba95a270c6f1fbcd8dac228323f2777d886cb206987444e4bce66338dd4/tqdm-4.67.3.tar.gz", hash = "sha256:7d825f03f89244ef73f1d4ce193cb1774a8179fd96f31d7e1dcde62092b960bb", size = 169598, upload-time = "2026-02-03T17:35:53.048Z" }
|
|
295
|
+
wheels = [
|
|
296
|
+
{ url = "https://files.pythonhosted.org/packages/16/e1/3079a9ff9b8e11b846c6ac5c8b5bfb7ff225eee721825310c91b3b50304f/tqdm-4.67.3-py3-none-any.whl", hash = "sha256:ee1e4c0e59148062281c49d80b25b67771a127c85fc9676d3be5f243206826bf", size = 78374, upload-time = "2026-02-03T17:35:50.982Z" },
|
|
297
|
+
]
|
|
298
|
+
|
|
285
299
|
[[package]]
|
|
286
300
|
name = "typing-extensions"
|
|
287
301
|
version = "4.15.0"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|