gsppy 4.1.0__tar.gz → 5.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gsppy-4.1.0 → gsppy-5.0.0}/CHANGELOG.md +31 -0
- {gsppy-4.1.0 → gsppy-5.0.0}/PKG-INFO +146 -1
- {gsppy-4.1.0 → gsppy-5.0.0}/README.md +145 -0
- {gsppy-4.1.0 → gsppy-5.0.0}/gsppy/cli.py +129 -18
- {gsppy-4.1.0 → gsppy-5.0.0}/gsppy/gsp.py +485 -127
- {gsppy-4.1.0 → gsppy-5.0.0}/gsppy/utils.py +333 -33
- {gsppy-4.1.0 → gsppy-5.0.0}/pyproject.toml +1 -1
- gsppy-5.0.0/tests/test_cli_hooks.py +197 -0
- {gsppy-4.1.0 → gsppy-5.0.0}/tests/test_gsp.py +6 -1
- gsppy-5.0.0/tests/test_hooks.py +510 -0
- gsppy-5.0.0/tests/test_itemsets.py +300 -0
- {gsppy-4.1.0 → gsppy-5.0.0}/tox.ini +1 -0
- {gsppy-4.1.0 → gsppy-5.0.0}/.gitignore +0 -0
- {gsppy-4.1.0 → gsppy-5.0.0}/CONTRIBUTING.md +0 -0
- {gsppy-4.1.0 → gsppy-5.0.0}/LICENSE +0 -0
- {gsppy-4.1.0 → gsppy-5.0.0}/SECURITY.md +0 -0
- {gsppy-4.1.0 → gsppy-5.0.0}/gsppy/__init__.py +0 -0
- {gsppy-4.1.0 → gsppy-5.0.0}/gsppy/accelerate.py +0 -0
- {gsppy-4.1.0 → gsppy-5.0.0}/gsppy/dataframe_adapters.py +0 -0
- {gsppy-4.1.0 → gsppy-5.0.0}/gsppy/enums.py +0 -0
- {gsppy-4.1.0 → gsppy-5.0.0}/gsppy/pruning.py +0 -0
- {gsppy-4.1.0 → gsppy-5.0.0}/gsppy/py.typed +0 -0
- {gsppy-4.1.0 → gsppy-5.0.0}/gsppy/sequence.py +0 -0
- {gsppy-4.1.0 → gsppy-5.0.0}/gsppy/token_mapper.py +0 -0
- {gsppy-4.1.0 → gsppy-5.0.0}/rust/Cargo.lock +0 -0
- {gsppy-4.1.0 → gsppy-5.0.0}/rust/Cargo.toml +0 -0
- {gsppy-4.1.0 → gsppy-5.0.0}/rust/src/lib.rs +0 -0
- {gsppy-4.1.0 → gsppy-5.0.0}/tests/__init__.py +0 -0
- {gsppy-4.1.0 → gsppy-5.0.0}/tests/test_cli.py +0 -0
- {gsppy-4.1.0 → gsppy-5.0.0}/tests/test_dataframe.py +0 -0
- {gsppy-4.1.0 → gsppy-5.0.0}/tests/test_gsp_fuzzing.py +0 -0
- {gsppy-4.1.0 → gsppy-5.0.0}/tests/test_gsp_sequence_integration.py +0 -0
- {gsppy-4.1.0 → gsppy-5.0.0}/tests/test_pruning.py +0 -0
- {gsppy-4.1.0 → gsppy-5.0.0}/tests/test_sequence.py +0 -0
- {gsppy-4.1.0 → gsppy-5.0.0}/tests/test_spm_format.py +0 -0
- {gsppy-4.1.0 → gsppy-5.0.0}/tests/test_temporal_constraints.py +0 -0
- {gsppy-4.1.0 → gsppy-5.0.0}/tests/test_utils.py +0 -0
|
@@ -1,6 +1,37 @@
|
|
|
1
1
|
# CHANGELOG
|
|
2
2
|
|
|
3
3
|
|
|
4
|
+
## v5.0.0 (2026-02-06)
|
|
5
|
+
|
|
6
|
+
### Chores
|
|
7
|
+
|
|
8
|
+
- Adds support for optional types in item filters and ignores types in metadata printouts.
|
|
9
|
+
([`79111b4`](https://github.com/jacksonpradolima/gsp-py/commit/79111b4c781a65b21a17f85b0b507b41ba6e51f9))
|
|
10
|
+
|
|
11
|
+
- Update uv.lock for version 4.2.0
|
|
12
|
+
([`f8f690f`](https://github.com/jacksonpradolima/gsp-py/commit/f8f690f7f0304dc4331c17c68487fe3411436149))
|
|
13
|
+
|
|
14
|
+
### Features
|
|
15
|
+
|
|
16
|
+
- Add preprocessing, postprocessing, and candidate filtering hooks to GSP algorithm
|
|
17
|
+
([`495d290`](https://github.com/jacksonpradolima/gsp-py/commit/495d29009abe862bf992831bd276181efa40c99d))
|
|
18
|
+
|
|
19
|
+
feat!: add preprocessing, postprocessing, and candidate filtering hooks to GSP algorithm
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
## v4.2.0 (2026-02-01)
|
|
23
|
+
|
|
24
|
+
### Chores
|
|
25
|
+
|
|
26
|
+
- Update uv.lock for version 4.1.0
|
|
27
|
+
([`5ed3d9e`](https://github.com/jacksonpradolima/gsp-py/commit/5ed3d9e46cf158a2261462cb8974b6bbb452f32e))
|
|
28
|
+
|
|
29
|
+
### Features
|
|
30
|
+
|
|
31
|
+
- Add itemset support for co-occurrence semantics in sequence mining
|
|
32
|
+
([`90805b1`](https://github.com/jacksonpradolima/gsp-py/commit/90805b190f40ebf34a72da0bbe949cb627140337))
|
|
33
|
+
|
|
34
|
+
|
|
4
35
|
## v4.1.0 (2026-02-01)
|
|
5
36
|
|
|
6
37
|
### Bug Fixes
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: gsppy
|
|
3
|
-
Version: 4.1.0
|
|
3
|
+
Version: 5.0.0
|
|
4
4
|
Summary: GSP (Generalized Sequence Pattern) algorithm in Python
|
|
5
5
|
Project-URL: Homepage, https://github.com/jacksonpradolima/gsp-py
|
|
6
6
|
Author-email: Jackson Antonio do Prado Lima <jacksonpradolima@gmail.com>
|
|
@@ -112,6 +112,7 @@ Sequence Pattern (GSP)** algorithm. Ideal for market basket analysis, temporal m
|
|
|
112
112
|
- [✅ Example: Analyzing Sales Data](#example-analyzing-sales-data)
|
|
113
113
|
- [📊 Explanation: Support and Results](#explanation-support-and-results)
|
|
114
114
|
- [📊 DataFrame Input Support](#dataframe-input-support)
|
|
115
|
+
- [🔗 Itemset Support](#itemset-support)
|
|
115
116
|
- [⏱️ Temporal Constraints](#temporal-constraints)
|
|
116
117
|
7. [⌨️ Typing](#typing)
|
|
117
118
|
8. [🌟 Planned Features](#planned-features)
|
|
@@ -980,6 +981,150 @@ For complete examples and edge cases, see:
|
|
|
980
981
|
|
|
981
982
|
---
|
|
982
983
|
|
|
984
|
+
## 🔗 Itemset Support
|
|
985
|
+
|
|
986
|
+
GSP-Py supports **itemsets** within sequence elements, enabling you to capture **co-occurrence** of multiple items at the same time step. This is crucial for applications where items occur together rather than in strict sequential order.
|
|
987
|
+
|
|
988
|
+
### What are Itemsets?
|
|
989
|
+
|
|
990
|
+
- **Flat sequences**: `['A', 'B', 'C']` - each item occurs at a separate time step
|
|
991
|
+
- **Itemset sequences**: `[['A', 'B'], ['C']]` - items A and B occur together at the first time step, then C occurs later
|
|
992
|
+
|
|
993
|
+
### Why Use Itemsets?
|
|
994
|
+
|
|
995
|
+
Itemsets are essential when temporal co-occurrence matters in your domain:
|
|
996
|
+
|
|
997
|
+
- **Market basket analysis**: Customers buy multiple items in a single shopping trip, then return for more items later
|
|
998
|
+
- **Web analytics**: Users open multiple pages in parallel tabs before moving to the next set of pages
|
|
999
|
+
- **Event logs**: Multiple events can occur simultaneously in complex systems
|
|
1000
|
+
- **Purchase patterns**: Items bought together vs. items bought in sequence
|
|
1001
|
+
|
|
1002
|
+
### Using Itemsets
|
|
1003
|
+
|
|
1004
|
+
#### Basic Example
|
|
1005
|
+
|
|
1006
|
+
```python
|
|
1007
|
+
from gsppy import GSP
|
|
1008
|
+
|
|
1009
|
+
# Itemset format: nested lists where inner lists are items that occur together
|
|
1010
|
+
transactions = [
|
|
1011
|
+
[['Bread', 'Milk'], ['Eggs']], # Bought Bread & Milk together, then Eggs later
|
|
1012
|
+
[['Bread', 'Milk', 'Butter']], # Bought all three items together
|
|
1013
|
+
[['Bread', 'Milk'], ['Eggs']], # Same pattern as customer 1
|
|
1014
|
+
]
|
|
1015
|
+
|
|
1016
|
+
gsp = GSP(transactions)
|
|
1017
|
+
patterns = gsp.search(min_support=0.5)
|
|
1018
|
+
|
|
1019
|
+
# Pattern ('Bread',) will match any itemset containing Bread
|
|
1020
|
+
# Pattern ('Bread', 'Eggs') will match sequences where Bread appears before Eggs
|
|
1021
|
+
# (even if they're in different itemsets)
|
|
1022
|
+
```
|
|
1023
|
+
|
|
1024
|
+
#### Backward Compatibility with Flat Sequences
|
|
1025
|
+
|
|
1026
|
+
GSP-Py automatically normalizes flat sequences to itemsets internally, ensuring full backward compatibility:
|
|
1027
|
+
|
|
1028
|
+
```python
|
|
1029
|
+
from gsppy import GSP
|
|
1030
|
+
|
|
1031
|
+
# These are equivalent after normalization:
|
|
1032
|
+
flat_transactions = [['A', 'B', 'C']] # Flat format
|
|
1033
|
+
itemset_transactions = [[['A'], ['B'], ['C']]] # Equivalent itemset format
|
|
1034
|
+
|
|
1035
|
+
# Both produce the same results
|
|
1036
|
+
gsp1 = GSP(flat_transactions)
|
|
1037
|
+
gsp2 = GSP(itemset_transactions)
|
|
1038
|
+
|
|
1039
|
+
# Patterns are identical
|
|
1040
|
+
patterns1 = gsp1.search(min_support=0.5)
|
|
1041
|
+
patterns2 = gsp2.search(min_support=0.5)
|
|
1042
|
+
```
|
|
1043
|
+
|
|
1044
|
+
### Itemset Matching Semantics
|
|
1045
|
+
|
|
1046
|
+
Pattern matching with itemsets uses **subset semantics**:
|
|
1047
|
+
|
|
1048
|
+
- A pattern element matches a sequence element if all items in the pattern element are present in the sequence element
|
|
1049
|
+
- Example: Pattern element `['A', 'B']` matches sequence element `['A', 'B', 'C']` because {A, B} ⊆ {A, B, C}
|
|
1050
|
+
- Pattern elements must appear in order across the sequence
|
|
1051
|
+
|
|
1052
|
+
```python
|
|
1053
|
+
from gsppy import GSP
|
|
1054
|
+
|
|
1055
|
+
transactions = [
|
|
1056
|
+
[['A', 'B', 'D'], ['E'], ['C', 'F']], # A,B,D together, then E, then C,F together
|
|
1057
|
+
]
|
|
1058
|
+
|
|
1059
|
+
gsp = GSP(transactions)
|
|
1060
|
+
|
|
1061
|
+
# Pattern ('A', 'C') will match because:
|
|
1062
|
+
# - 'A' is in first itemset ['A', 'B', 'D'] ✓
|
|
1063
|
+
# - 'C' appears later in third itemset ['C', 'F'] ✓
|
|
1064
|
+
# - Order is preserved ✓
|
|
1065
|
+
```
|
|
1066
|
+
|
|
1067
|
+
### Reading Itemsets from SPM Format
|
|
1068
|
+
|
|
1069
|
+
The SPM/GSP format supports itemsets using delimiters:
|
|
1070
|
+
|
|
1071
|
+
- `-1`: End of itemset
|
|
1072
|
+
- `-2`: End of sequence
|
|
1073
|
+
|
|
1074
|
+
```python
|
|
1075
|
+
from gsppy.utils import read_transactions_from_spm
|
|
1076
|
+
|
|
1077
|
+
# SPM file content:
|
|
1078
|
+
# 1 2 -1 3 -1 -2
|
|
1079
|
+
# 1 -1 3 4 -1 -2
|
|
1080
|
+
|
|
1081
|
+
# Read with itemsets preserved
|
|
1082
|
+
transactions = read_transactions_from_spm("data.txt", preserve_itemsets=True)
|
|
1083
|
+
# Result: [[['1', '2'], ['3']], [['1'], ['3', '4']]]
|
|
1084
|
+
|
|
1085
|
+
# Read with itemsets flattened (backward compatible)
|
|
1086
|
+
transactions = read_transactions_from_spm("data.txt", preserve_itemsets=False)
|
|
1087
|
+
# Result: [['1', '2', '3'], ['1', '3', '4']]
|
|
1088
|
+
```
|
|
1089
|
+
|
|
1090
|
+
### Itemsets with Timestamps
|
|
1091
|
+
|
|
1092
|
+
Itemsets work seamlessly with temporal constraints:
|
|
1093
|
+
|
|
1094
|
+
```python
|
|
1095
|
+
from gsppy import GSP
|
|
1096
|
+
|
|
1097
|
+
# Itemsets with timestamps: [(item, timestamp), ...]
|
|
1098
|
+
transactions = [
|
|
1099
|
+
[[('Login', 0), ('Home', 0)], [('Product', 5)], [('Checkout', 10)]],
|
|
1100
|
+
[[('Login', 0)], [('Home', 2), ('Product', 2)], [('Checkout', 15)]],
|
|
1101
|
+
]
|
|
1102
|
+
|
|
1103
|
+
# Find patterns where events in the same itemset occur together
|
|
1104
|
+
# and subsequent itemsets occur within maxgap time units
|
|
1105
|
+
gsp = GSP(transactions, maxgap=10)
|
|
1106
|
+
patterns = gsp.search(min_support=0.5)
|
|
1107
|
+
```
|
|
1108
|
+
|
|
1109
|
+
### Complete Example
|
|
1110
|
+
|
|
1111
|
+
See [examples/itemset_example.py](examples/itemset_example.py) for comprehensive examples including:
|
|
1112
|
+
|
|
1113
|
+
- Market basket analysis with itemsets
|
|
1114
|
+
- Web clickstream with parallel page views
|
|
1115
|
+
- Comparison of flat vs. itemset semantics
|
|
1116
|
+
- Reading and processing SPM format files
|
|
1117
|
+
|
|
1118
|
+
### Key Takeaways
|
|
1119
|
+
|
|
1120
|
+
✓ **Itemsets capture co-occurrence** of items at the same time step
|
|
1121
|
+
✓ **Flat sequences are automatically normalized** to itemsets internally
|
|
1122
|
+
✓ **Both formats work seamlessly** with GSP-Py
|
|
1123
|
+
✓ **Use itemsets when temporal co-occurrence matters** in your domain
|
|
1124
|
+
✓ **SPM format supports** both flat and itemset representations
|
|
1125
|
+
|
|
1126
|
+
---
|
|
1127
|
+
|
|
983
1128
|
## ⏱️ Temporal Constraints
|
|
984
1129
|
|
|
985
1130
|
GSP-Py supports **time-constrained sequential pattern mining** with three powerful temporal constraints: `mingap`, `maxgap`, and `maxspan`. These constraints enable domain-specific applications such as medical event mining, retail analytics, and temporal user journey discovery.
|
|
@@ -39,6 +39,7 @@ Sequence Pattern (GSP)** algorithm. Ideal for market basket analysis, temporal m
|
|
|
39
39
|
- [✅ Example: Analyzing Sales Data](#example-analyzing-sales-data)
|
|
40
40
|
- [📊 Explanation: Support and Results](#explanation-support-and-results)
|
|
41
41
|
- [📊 DataFrame Input Support](#dataframe-input-support)
|
|
42
|
+
- [🔗 Itemset Support](#itemset-support)
|
|
42
43
|
- [⏱️ Temporal Constraints](#temporal-constraints)
|
|
43
44
|
7. [⌨️ Typing](#typing)
|
|
44
45
|
8. [🌟 Planned Features](#planned-features)
|
|
@@ -907,6 +908,150 @@ For complete examples and edge cases, see:
|
|
|
907
908
|
|
|
908
909
|
---
|
|
909
910
|
|
|
911
|
+
## 🔗 Itemset Support
|
|
912
|
+
|
|
913
|
+
GSP-Py supports **itemsets** within sequence elements, enabling you to capture **co-occurrence** of multiple items at the same time step. This is crucial for applications where items occur together rather than in strict sequential order.
|
|
914
|
+
|
|
915
|
+
### What are Itemsets?
|
|
916
|
+
|
|
917
|
+
- **Flat sequences**: `['A', 'B', 'C']` - each item occurs at a separate time step
|
|
918
|
+
- **Itemset sequences**: `[['A', 'B'], ['C']]` - items A and B occur together at the first time step, then C occurs later
|
|
919
|
+
|
|
920
|
+
### Why Use Itemsets?
|
|
921
|
+
|
|
922
|
+
Itemsets are essential when temporal co-occurrence matters in your domain:
|
|
923
|
+
|
|
924
|
+
- **Market basket analysis**: Customers buy multiple items in a single shopping trip, then return for more items later
|
|
925
|
+
- **Web analytics**: Users open multiple pages in parallel tabs before moving to the next set of pages
|
|
926
|
+
- **Event logs**: Multiple events can occur simultaneously in complex systems
|
|
927
|
+
- **Purchase patterns**: Items bought together vs. items bought in sequence
|
|
928
|
+
|
|
929
|
+
### Using Itemsets
|
|
930
|
+
|
|
931
|
+
#### Basic Example
|
|
932
|
+
|
|
933
|
+
```python
|
|
934
|
+
from gsppy import GSP
|
|
935
|
+
|
|
936
|
+
# Itemset format: nested lists where inner lists are items that occur together
|
|
937
|
+
transactions = [
|
|
938
|
+
[['Bread', 'Milk'], ['Eggs']], # Bought Bread & Milk together, then Eggs later
|
|
939
|
+
[['Bread', 'Milk', 'Butter']], # Bought all three items together
|
|
940
|
+
[['Bread', 'Milk'], ['Eggs']], # Same pattern as customer 1
|
|
941
|
+
]
|
|
942
|
+
|
|
943
|
+
gsp = GSP(transactions)
|
|
944
|
+
patterns = gsp.search(min_support=0.5)
|
|
945
|
+
|
|
946
|
+
# Pattern ('Bread',) will match any itemset containing Bread
|
|
947
|
+
# Pattern ('Bread', 'Eggs') will match sequences where Bread appears before Eggs
|
|
948
|
+
# (even if they're in different itemsets)
|
|
949
|
+
```
|
|
950
|
+
|
|
951
|
+
#### Backward Compatibility with Flat Sequences
|
|
952
|
+
|
|
953
|
+
GSP-Py automatically normalizes flat sequences to itemsets internally, ensuring full backward compatibility:
|
|
954
|
+
|
|
955
|
+
```python
|
|
956
|
+
from gsppy import GSP
|
|
957
|
+
|
|
958
|
+
# These are equivalent after normalization:
|
|
959
|
+
flat_transactions = [['A', 'B', 'C']] # Flat format
|
|
960
|
+
itemset_transactions = [[['A'], ['B'], ['C']]] # Equivalent itemset format
|
|
961
|
+
|
|
962
|
+
# Both produce the same results
|
|
963
|
+
gsp1 = GSP(flat_transactions)
|
|
964
|
+
gsp2 = GSP(itemset_transactions)
|
|
965
|
+
|
|
966
|
+
# Patterns are identical
|
|
967
|
+
patterns1 = gsp1.search(min_support=0.5)
|
|
968
|
+
patterns2 = gsp2.search(min_support=0.5)
|
|
969
|
+
```
|
|
970
|
+
|
|
971
|
+
### Itemset Matching Semantics
|
|
972
|
+
|
|
973
|
+
Pattern matching with itemsets uses **subset semantics**:
|
|
974
|
+
|
|
975
|
+
- A pattern element matches a sequence element if all items in the pattern element are present in the sequence element
|
|
976
|
+
- Example: Pattern element `['A', 'B']` matches sequence element `['A', 'B', 'C']` because {A, B} ⊆ {A, B, C}
|
|
977
|
+
- Pattern elements must appear in order across the sequence
|
|
978
|
+
|
|
979
|
+
```python
|
|
980
|
+
from gsppy import GSP
|
|
981
|
+
|
|
982
|
+
transactions = [
|
|
983
|
+
[['A', 'B', 'D'], ['E'], ['C', 'F']], # A,B,D together, then E, then C,F together
|
|
984
|
+
]
|
|
985
|
+
|
|
986
|
+
gsp = GSP(transactions)
|
|
987
|
+
|
|
988
|
+
# Pattern ('A', 'C') will match because:
|
|
989
|
+
# - 'A' is in first itemset ['A', 'B', 'D'] ✓
|
|
990
|
+
# - 'C' appears later in third itemset ['C', 'F'] ✓
|
|
991
|
+
# - Order is preserved ✓
|
|
992
|
+
```
|
|
993
|
+
|
|
994
|
+
### Reading Itemsets from SPM Format
|
|
995
|
+
|
|
996
|
+
The SPM/GSP format supports itemsets using delimiters:
|
|
997
|
+
|
|
998
|
+
- `-1`: End of itemset
|
|
999
|
+
- `-2`: End of sequence
|
|
1000
|
+
|
|
1001
|
+
```python
|
|
1002
|
+
from gsppy.utils import read_transactions_from_spm
|
|
1003
|
+
|
|
1004
|
+
# SPM file content:
|
|
1005
|
+
# 1 2 -1 3 -1 -2
|
|
1006
|
+
# 1 -1 3 4 -1 -2
|
|
1007
|
+
|
|
1008
|
+
# Read with itemsets preserved
|
|
1009
|
+
transactions = read_transactions_from_spm("data.txt", preserve_itemsets=True)
|
|
1010
|
+
# Result: [[['1', '2'], ['3']], [['1'], ['3', '4']]]
|
|
1011
|
+
|
|
1012
|
+
# Read with itemsets flattened (backward compatible)
|
|
1013
|
+
transactions = read_transactions_from_spm("data.txt", preserve_itemsets=False)
|
|
1014
|
+
# Result: [['1', '2', '3'], ['1', '3', '4']]
|
|
1015
|
+
```
|
|
1016
|
+
|
|
1017
|
+
### Itemsets with Timestamps
|
|
1018
|
+
|
|
1019
|
+
Itemsets work seamlessly with temporal constraints:
|
|
1020
|
+
|
|
1021
|
+
```python
|
|
1022
|
+
from gsppy import GSP
|
|
1023
|
+
|
|
1024
|
+
# Itemsets with timestamps: [(item, timestamp), ...]
|
|
1025
|
+
transactions = [
|
|
1026
|
+
[[('Login', 0), ('Home', 0)], [('Product', 5)], [('Checkout', 10)]],
|
|
1027
|
+
[[('Login', 0)], [('Home', 2), ('Product', 2)], [('Checkout', 15)]],
|
|
1028
|
+
]
|
|
1029
|
+
|
|
1030
|
+
# Find patterns where events in the same itemset occur together
|
|
1031
|
+
# and subsequent itemsets occur within maxgap time units
|
|
1032
|
+
gsp = GSP(transactions, maxgap=10)
|
|
1033
|
+
patterns = gsp.search(min_support=0.5)
|
|
1034
|
+
```
|
|
1035
|
+
|
|
1036
|
+
### Complete Example
|
|
1037
|
+
|
|
1038
|
+
See [examples/itemset_example.py](examples/itemset_example.py) for comprehensive examples including:
|
|
1039
|
+
|
|
1040
|
+
- Market basket analysis with itemsets
|
|
1041
|
+
- Web clickstream with parallel page views
|
|
1042
|
+
- Comparison of flat vs. itemset semantics
|
|
1043
|
+
- Reading and processing SPM format files
|
|
1044
|
+
|
|
1045
|
+
### Key Takeaways
|
|
1046
|
+
|
|
1047
|
+
✓ **Itemsets capture co-occurrence** of items at the same time step
|
|
1048
|
+
✓ **Flat sequences are automatically normalized** to itemsets internally
|
|
1049
|
+
✓ **Both formats work seamlessly** with GSP-Py
|
|
1050
|
+
✓ **Use itemsets when temporal co-occurrence matters** in your domain
|
|
1051
|
+
✓ **SPM format supports** both flat and itemset representations
|
|
1052
|
+
|
|
1053
|
+
---
|
|
1054
|
+
|
|
910
1055
|
## ⏱️ Temporal Constraints
|
|
911
1056
|
|
|
912
1057
|
GSP-Py supports **time-constrained sequential pattern mining** with three powerful temporal constraints: `mingap`, `maxgap`, and `maxspan`. These constraints enable domain-specific applications such as medical event mining, retail analytics, and temporal user journey discovery.
|
|
@@ -35,7 +35,8 @@ import csv
|
|
|
35
35
|
import sys
|
|
36
36
|
import json
|
|
37
37
|
import logging
|
|
38
|
-
|
|
38
|
+
import importlib
|
|
39
|
+
from typing import Any, List, Tuple, Union, Callable, Optional, cast
|
|
39
40
|
|
|
40
41
|
import click
|
|
41
42
|
|
|
@@ -51,6 +52,54 @@ from gsppy.enums import (
|
|
|
51
52
|
from gsppy.utils import has_timestamps
|
|
52
53
|
|
|
53
54
|
|
|
55
|
+
def _load_hook_function(import_path: str, hook_type: str) -> Callable[..., Any]:
    """
    Load a hook function from a Python module import path.

    The path is split at its last dot: the left part is imported as a module
    via ``importlib.import_module`` and the right part is looked up as an
    attribute of that module.

    Parameters:
        import_path (str): Import path in format 'module.submodule.function_name'
        hook_type (str): Type of hook for error messages ('preprocess', 'postprocess', 'candidate_filter')

    Returns:
        Callable: The loaded hook function

    Raises:
        ValueError: If the import path is malformed (no dot, or an empty module
            or function part), the module cannot be imported, the attribute is
            missing from the module, or the attribute is not callable.
    """
    try:
        # Split into module path and function name at the last dot.
        module_name, separator, function_name = import_path.rpartition(".")

        # Reject 'name', '.name' and 'name.' before importing anything, so a
        # malformed path never triggers a module import and always yields the
        # same, explicit format error.
        if not (separator and module_name and function_name):
            raise ValueError(f"Invalid import path format. Expected 'module.function', got '{import_path}'")

        # Import the module
        module = importlib.import_module(module_name)

        # Single attribute lookup; the sentinel distinguishes "missing" from
        # an attribute whose value happens to be None.
        missing = object()
        hook_fn = getattr(module, function_name, missing)
        if hook_fn is missing:
            raise ValueError(f"Function '{function_name}' not found in module '{module_name}'")

        # Verify it's callable
        if not callable(hook_fn):
            raise ValueError(f"'{import_path}' is not a callable function")

        return hook_fn

    except ImportError as e:
        # module_name is always bound here: the split above cannot raise ImportError.
        raise ValueError(f"Failed to import {hook_type} hook module '{module_name}': {e}") from e
    except ValueError:
        # Already the exception type callers expect; re-raise unchanged so the
        # generic handler below does not wrap it a second time.
        raise
    except Exception as e:
        raise ValueError(f"Failed to load {hook_type} hook function '{import_path}': {e}") from e
|
|
101
|
+
|
|
102
|
+
|
|
54
103
|
def setup_logging(verbose: bool) -> None:
|
|
55
104
|
"""
|
|
56
105
|
Configure logging with standardized format based on verbosity level.
|
|
@@ -515,20 +564,26 @@ def _load_transactions_by_format(
|
|
|
515
564
|
help="File format to use. 'auto' detects format from file extension.",
|
|
516
565
|
)
|
|
517
566
|
@click.option("--verbose", is_flag=True, help="Enable verbose output for debugging purposes.")
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
567
|
+
@click.option(
|
|
568
|
+
"--preprocess-hook",
|
|
569
|
+
type=str,
|
|
570
|
+
default=None,
|
|
571
|
+
help="Python import path to preprocessing hook function (e.g., 'mymodule.preprocess_fn').",
|
|
572
|
+
)
|
|
573
|
+
@click.option(
|
|
574
|
+
"--postprocess-hook",
|
|
575
|
+
type=str,
|
|
576
|
+
default=None,
|
|
577
|
+
help="Python import path to postprocessing hook function (e.g., 'mymodule.postprocess_fn').",
|
|
578
|
+
)
|
|
579
|
+
@click.option(
|
|
580
|
+
"--candidate-filter-hook",
|
|
581
|
+
type=str,
|
|
582
|
+
default=None,
|
|
583
|
+
help="Python import path to candidate filter hook function (e.g., 'mymodule.filter_fn').",
|
|
584
|
+
)
|
|
585
|
+
@click.pass_context
|
|
586
|
+
def main(ctx: click.Context, **kwargs: Any) -> None:
|
|
532
587
|
"""
|
|
533
588
|
Run the GSP algorithm on transactional data from a file.
|
|
534
589
|
|
|
@@ -573,9 +628,59 @@ def main(
|
|
|
573
628
|
```bash
|
|
574
629
|
gsppy --file data.txt --format spm --min_support 0.3
|
|
575
630
|
```
|
|
631
|
+
|
|
632
|
+
With custom hooks (requires Python module with hook functions):
|
|
633
|
+
|
|
634
|
+
```bash
|
|
635
|
+
# Create a hooks module first (hooks.py):
|
|
636
|
+
# def my_filter(candidate, support, context):
|
|
637
|
+
# return len(candidate) <= 2 # Keep only short patterns
|
|
638
|
+
#
|
|
639
|
+
# def my_postprocess(patterns):
|
|
640
|
+
# return patterns[:2] # Keep only first 2 levels
|
|
641
|
+
|
|
642
|
+
gsppy --file data.json --min_support 0.3 \
|
|
643
|
+
--candidate-filter-hook hooks.my_filter \
|
|
644
|
+
--postprocess-hook hooks.my_postprocess
|
|
645
|
+
```
|
|
576
646
|
"""
|
|
647
|
+
# Extract parameters from kwargs
|
|
648
|
+
file_path = kwargs['file_path']
|
|
649
|
+
min_support = kwargs['min_support']
|
|
650
|
+
backend = kwargs['backend']
|
|
651
|
+
mingap = kwargs.get('mingap')
|
|
652
|
+
maxgap = kwargs.get('maxgap')
|
|
653
|
+
maxspan = kwargs.get('maxspan')
|
|
654
|
+
transaction_col = kwargs.get('transaction_col')
|
|
655
|
+
item_col = kwargs.get('item_col')
|
|
656
|
+
timestamp_col = kwargs.get('timestamp_col')
|
|
657
|
+
sequence_col = kwargs.get('sequence_col')
|
|
658
|
+
file_format = kwargs['format']
|
|
659
|
+
verbose = kwargs['verbose']
|
|
660
|
+
preprocess_hook = kwargs.get('preprocess_hook')
|
|
661
|
+
postprocess_hook = kwargs.get('postprocess_hook')
|
|
662
|
+
candidate_filter_hook = kwargs.get('candidate_filter_hook')
|
|
663
|
+
|
|
577
664
|
setup_logging(verbose)
|
|
578
665
|
|
|
666
|
+
# Load hook functions if specified
|
|
667
|
+
try:
|
|
668
|
+
preprocess_fn = _load_hook_function(preprocess_hook, "preprocess") if preprocess_hook else None
|
|
669
|
+
postprocess_fn = _load_hook_function(postprocess_hook, "postprocess") if postprocess_hook else None
|
|
670
|
+
candidate_filter_fn = (
|
|
671
|
+
_load_hook_function(candidate_filter_hook, "candidate_filter") if candidate_filter_hook else None
|
|
672
|
+
)
|
|
673
|
+
|
|
674
|
+
if preprocess_fn:
|
|
675
|
+
logger.info(f"Loaded preprocessing hook: {preprocess_hook}")
|
|
676
|
+
if postprocess_fn:
|
|
677
|
+
logger.info(f"Loaded postprocessing hook: {postprocess_hook}")
|
|
678
|
+
if candidate_filter_fn:
|
|
679
|
+
logger.info(f"Loaded candidate filter hook: {candidate_filter_hook}")
|
|
680
|
+
except ValueError as e:
|
|
681
|
+
logger.error(f"Error loading hook function: {e}")
|
|
682
|
+
sys.exit(1)
|
|
683
|
+
|
|
579
684
|
# Detect file extension to determine if DataFrame column params are needed
|
|
580
685
|
_, file_extension = os.path.splitext(file_path)
|
|
581
686
|
file_extension = file_extension.lower()
|
|
@@ -583,10 +688,10 @@ def main(
|
|
|
583
688
|
|
|
584
689
|
# Automatically detect and load transactions
|
|
585
690
|
try:
|
|
586
|
-
|
|
691
|
+
file_format_lower = file_format.lower()
|
|
587
692
|
transactions = _load_transactions_by_format(
|
|
588
693
|
file_path,
|
|
589
|
-
|
|
694
|
+
file_format_lower,
|
|
590
695
|
file_extension,
|
|
591
696
|
is_dataframe_format,
|
|
592
697
|
transaction_col,
|
|
@@ -608,7 +713,13 @@ def main(
|
|
|
608
713
|
# Initialize and run GSP algorithm
|
|
609
714
|
try:
|
|
610
715
|
gsp = GSP(transactions, mingap=mingap, maxgap=maxgap, maxspan=maxspan, verbose=verbose)
|
|
611
|
-
patterns = gsp.search(
|
|
716
|
+
patterns = gsp.search(
|
|
717
|
+
min_support=min_support,
|
|
718
|
+
return_sequences=False,
|
|
719
|
+
preprocess_fn=preprocess_fn,
|
|
720
|
+
postprocess_fn=postprocess_fn,
|
|
721
|
+
candidate_filter_fn=candidate_filter_fn,
|
|
722
|
+
)
|
|
612
723
|
logger.info("Frequent Patterns Found:")
|
|
613
724
|
for i, level in enumerate(patterns, start=1):
|
|
614
725
|
logger.info(f"\n{i}-Sequence Patterns:")
|