gsppy 4.0.0__tar.gz → 4.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gsppy-4.0.0 → gsppy-4.2.0}/CHANGELOG.md +70 -0
- {gsppy-4.0.0 → gsppy-4.2.0}/PKG-INFO +222 -1
- {gsppy-4.0.0 → gsppy-4.2.0}/README.md +221 -0
- {gsppy-4.0.0 → gsppy-4.2.0}/gsppy/__init__.py +10 -0
- {gsppy-4.0.0 → gsppy-4.2.0}/gsppy/cli.py +2 -2
- {gsppy-4.0.0 → gsppy-4.2.0}/gsppy/gsp.py +290 -87
- gsppy-4.2.0/gsppy/sequence.py +371 -0
- {gsppy-4.0.0 → gsppy-4.2.0}/gsppy/utils.py +333 -33
- {gsppy-4.0.0 → gsppy-4.2.0}/pyproject.toml +1 -1
- {gsppy-4.0.0 → gsppy-4.2.0}/tests/test_gsp.py +6 -1
- gsppy-4.2.0/tests/test_gsp_sequence_integration.py +345 -0
- gsppy-4.2.0/tests/test_itemsets.py +300 -0
- gsppy-4.2.0/tests/test_sequence.py +466 -0
- {gsppy-4.0.0 → gsppy-4.2.0}/tox.ini +1 -0
- {gsppy-4.0.0 → gsppy-4.2.0}/.gitignore +0 -0
- {gsppy-4.0.0 → gsppy-4.2.0}/CONTRIBUTING.md +0 -0
- {gsppy-4.0.0 → gsppy-4.2.0}/LICENSE +0 -0
- {gsppy-4.0.0 → gsppy-4.2.0}/SECURITY.md +0 -0
- {gsppy-4.0.0 → gsppy-4.2.0}/gsppy/accelerate.py +0 -0
- {gsppy-4.0.0 → gsppy-4.2.0}/gsppy/dataframe_adapters.py +0 -0
- {gsppy-4.0.0 → gsppy-4.2.0}/gsppy/enums.py +0 -0
- {gsppy-4.0.0 → gsppy-4.2.0}/gsppy/pruning.py +0 -0
- {gsppy-4.0.0 → gsppy-4.2.0}/gsppy/py.typed +0 -0
- {gsppy-4.0.0 → gsppy-4.2.0}/gsppy/token_mapper.py +0 -0
- {gsppy-4.0.0 → gsppy-4.2.0}/rust/Cargo.lock +0 -0
- {gsppy-4.0.0 → gsppy-4.2.0}/rust/Cargo.toml +0 -0
- {gsppy-4.0.0 → gsppy-4.2.0}/rust/src/lib.rs +0 -0
- {gsppy-4.0.0 → gsppy-4.2.0}/tests/__init__.py +0 -0
- {gsppy-4.0.0 → gsppy-4.2.0}/tests/test_cli.py +0 -0
- {gsppy-4.0.0 → gsppy-4.2.0}/tests/test_dataframe.py +0 -0
- {gsppy-4.0.0 → gsppy-4.2.0}/tests/test_gsp_fuzzing.py +0 -0
- {gsppy-4.0.0 → gsppy-4.2.0}/tests/test_pruning.py +0 -0
- {gsppy-4.0.0 → gsppy-4.2.0}/tests/test_spm_format.py +0 -0
- {gsppy-4.0.0 → gsppy-4.2.0}/tests/test_temporal_constraints.py +0 -0
- {gsppy-4.0.0 → gsppy-4.2.0}/tests/test_utils.py +0 -0
|
@@ -1,6 +1,76 @@
|
|
|
1
1
|
# CHANGELOG
|
|
2
2
|
|
|
3
3
|
|
|
4
|
+
## v4.2.0 (2026-02-01)
|
|
5
|
+
|
|
6
|
+
### Chores
|
|
7
|
+
|
|
8
|
+
- Update uv.lock for version 4.1.0
|
|
9
|
+
([`5ed3d9e`](https://github.com/jacksonpradolima/gsp-py/commit/5ed3d9e46cf158a2261462cb8974b6bbb452f32e))
|
|
10
|
+
|
|
11
|
+
### Features
|
|
12
|
+
|
|
13
|
+
- Add itemset support for co-occurrence semantics in sequence mining
|
|
14
|
+
([`90805b1`](https://github.com/jacksonpradolima/gsp-py/commit/90805b190f40ebf34a72da0bbe949cb627140337))
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
## v4.1.0 (2026-02-01)
|
|
18
|
+
|
|
19
|
+
### Bug Fixes
|
|
20
|
+
|
|
21
|
+
- Address code review feedback - add type annotations and remove unused variables
|
|
22
|
+
([`bf62d14`](https://github.com/jacksonpradolima/gsp-py/commit/bf62d144d8f1be1e7716291d41af955450612c81))
|
|
23
|
+
|
|
24
|
+
Co-authored-by: jacksonpradolima <7774063+jacksonpradolima@users.noreply.github.com>
|
|
25
|
+
|
|
26
|
+
### Chores
|
|
27
|
+
|
|
28
|
+
- Update uv.lock for version 4.0.0
|
|
29
|
+
([`f1ae2af`](https://github.com/jacksonpradolima/gsp-py/commit/f1ae2af2aa71ea44b9d8625ed647da79259ec096))
|
|
30
|
+
|
|
31
|
+
### Documentation
|
|
32
|
+
|
|
33
|
+
- Add Sequence documentation and examples to README
|
|
34
|
+
([`62d0d02`](https://github.com/jacksonpradolima/gsp-py/commit/62d0d02c19c5751331df53e680cc0b9aee19677b))
|
|
35
|
+
|
|
36
|
+
Co-authored-by: jacksonpradolima <7774063+jacksonpradolima@users.noreply.github.com>
|
|
37
|
+
|
|
38
|
+
- Update docs/ with Sequence abstraction documentation
|
|
39
|
+
([`2368cf3`](https://github.com/jacksonpradolima/gsp-py/commit/2368cf30239139e8e2af5457ee6acf14db30ef06))
|
|
40
|
+
|
|
41
|
+
Co-authored-by: jacksonpradolima <7774063+jacksonpradolima@users.noreply.github.com>
|
|
42
|
+
|
|
43
|
+
### Features
|
|
44
|
+
|
|
45
|
+
- Add Sequence abstraction class with comprehensive tests
|
|
46
|
+
([`6011bdb`](https://github.com/jacksonpradolima/gsp-py/commit/6011bdb7104755d109b58261b36e1dd1c36b2d61))
|
|
47
|
+
|
|
48
|
+
Co-authored-by: jacksonpradolima <7774063+jacksonpradolima@users.noreply.github.com>
|
|
49
|
+
|
|
50
|
+
- Integrate Sequence objects with GSP.search() via return_sequences parameter
|
|
51
|
+
([`7476588`](https://github.com/jacksonpradolima/gsp-py/commit/7476588f2b277276748e0550366014f2a93d8ef5))
|
|
52
|
+
|
|
53
|
+
Co-authored-by: jacksonpradolima <7774063+jacksonpradolima@users.noreply.github.com>
|
|
54
|
+
|
|
55
|
+
- Introduce Sequence abstraction for typed pattern representation
|
|
56
|
+
([`01ca37b`](https://github.com/jacksonpradolima/gsp-py/commit/01ca37b9bc4572eb7b1c1eaf6fdf26ca2324a3c5))
|
|
57
|
+
|
|
58
|
+
### Refactoring
|
|
59
|
+
|
|
60
|
+
- Address code review feedback - remove redundant checks
|
|
61
|
+
([`621e940`](https://github.com/jacksonpradolima/gsp-py/commit/621e9403379ae0fd07bf45b97616b9979f2d4aa6))
|
|
62
|
+
|
|
63
|
+
Co-authored-by: jacksonpradolima <7774063+jacksonpradolima@users.noreply.github.com>
|
|
64
|
+
|
|
65
|
+
- Reduce cognitive complexity in sequence_example.py and fix f-string
|
|
66
|
+
([`63ac4f9`](https://github.com/jacksonpradolima/gsp-py/commit/63ac4f9ceb869a5228cdccdcf6a9d0b9f46f0350))
|
|
67
|
+
|
|
68
|
+
Co-authored-by: jacksonpradolima <7774063+jacksonpradolima@users.noreply.github.com>
|
|
69
|
+
|
|
70
|
+
- Update type annotations and improve search method in GSP class
|
|
71
|
+
([`e2e9a3f`](https://github.com/jacksonpradolima/gsp-py/commit/e2e9a3f473d1e0c5d6990c8b7c5837a251761032))
|
|
72
|
+
|
|
73
|
+
|
|
4
74
|
## v4.0.0 (2026-02-01)
|
|
5
75
|
|
|
6
76
|
### Chores
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: gsppy
|
|
3
|
-
Version: 4.0.0
|
|
3
|
+
Version: 4.2.0
|
|
4
4
|
Summary: GSP (Generalized Sequence Pattern) algorithm in Python
|
|
5
5
|
Project-URL: Homepage, https://github.com/jacksonpradolima/gsp-py
|
|
6
6
|
Author-email: Jackson Antonio do Prado Lima <jacksonpradolima@gmail.com>
|
|
@@ -112,6 +112,7 @@ Sequence Pattern (GSP)** algorithm. Ideal for market basket analysis, temporal m
|
|
|
112
112
|
- [✅ Example: Analyzing Sales Data](#example-analyzing-sales-data)
|
|
113
113
|
- [📊 Explanation: Support and Results](#explanation-support-and-results)
|
|
114
114
|
- [📊 DataFrame Input Support](#dataframe-input-support)
|
|
115
|
+
- [🔗 Itemset Support](#itemset-support)
|
|
115
116
|
- [⏱️ Temporal Constraints](#temporal-constraints)
|
|
116
117
|
7. [⌨️ Typing](#typing)
|
|
117
118
|
8. [🌟 Planned Features](#planned-features)
|
|
@@ -559,6 +560,82 @@ Verbose mode provides:
|
|
|
559
560
|
|
|
560
561
|
For complete documentation on logging, see [docs/logging.md](docs/logging.md).
|
|
561
562
|
|
|
563
|
+
### Using Sequence Objects for Rich Pattern Representation
|
|
564
|
+
|
|
565
|
+
GSP-Py 4.1+ introduces a **Sequence abstraction class** that provides a richer, more maintainable way to work with sequential patterns. The Sequence class encapsulates pattern items, support counts, and optional metadata in an immutable, hashable object.
|
|
566
|
+
|
|
567
|
+
#### Traditional Dict-based Output (Default)
|
|
568
|
+
|
|
569
|
+
```python
|
|
570
|
+
from gsppy import GSP
|
|
571
|
+
|
|
572
|
+
transactions = [
|
|
573
|
+
['Bread', 'Milk'],
|
|
574
|
+
['Bread', 'Diaper', 'Beer', 'Eggs'],
|
|
575
|
+
['Milk', 'Diaper', 'Beer', 'Coke']
|
|
576
|
+
]
|
|
577
|
+
|
|
578
|
+
gsp = GSP(transactions)
|
|
579
|
+
result = gsp.search(min_support=0.3)
|
|
580
|
+
|
|
581
|
+
# Returns one dict per pattern length, e.g. [{('Bread',): 2, ('Milk',): 2, ...}, {('Diaper', 'Beer'): 2, ...}, ...]
|
|
582
|
+
for level_patterns in result:
|
|
583
|
+
for pattern, support in level_patterns.items():
|
|
584
|
+
print(f"Pattern: {pattern}, Support: {support}")
|
|
585
|
+
```
|
|
586
|
+
|
|
587
|
+
#### Sequence Objects (New Feature)
|
|
588
|
+
|
|
589
|
+
```python
|
|
590
|
+
from gsppy import GSP
|
|
591
|
+
|
|
592
|
+
transactions = [
|
|
593
|
+
['Bread', 'Milk'],
|
|
594
|
+
['Bread', 'Diaper', 'Beer', 'Eggs'],
|
|
595
|
+
['Milk', 'Diaper', 'Beer', 'Coke']
|
|
596
|
+
]
|
|
597
|
+
|
|
598
|
+
gsp = GSP(transactions)
|
|
599
|
+
result = gsp.search(min_support=0.3, return_sequences=True)
|
|
600
|
+
|
|
601
|
+
# Returns one list per pattern length, e.g. [[Sequence(('Bread',), support=2), ...], [Sequence(('Diaper', 'Beer'), support=2), ...], ...]
|
|
602
|
+
for level_patterns in result:
|
|
603
|
+
for seq in level_patterns:
|
|
604
|
+
print(f"Pattern: {seq.items}, Support: {seq.support}, Length: {seq.length}")
|
|
605
|
+
# Access sequence properties
|
|
606
|
+
print(f" First item: {seq.first_item}, Last item: {seq.last_item}")
|
|
607
|
+
# Check if item is in sequence
|
|
608
|
+
if "Milk" in seq:
|
|
609
|
+
print(f" Contains Milk!")
|
|
610
|
+
```
|
|
611
|
+
|
|
612
|
+
#### Key Benefits of Sequence Objects
|
|
613
|
+
|
|
614
|
+
1. **Rich API**: Access pattern properties like `length`, `first_item`, `last_item`
|
|
615
|
+
2. **Type Safety**: IDE autocomplete and better type hints
|
|
616
|
+
3. **Immutable & Hashable**: Can be used as dictionary keys
|
|
617
|
+
4. **Extensible**: Add metadata for confidence, lift, or custom properties
|
|
618
|
+
5. **Backward Compatible**: Convert to/from dict format as needed
|
|
619
|
+
|
|
620
|
+
```python
|
|
621
|
+
from gsppy import Sequence, sequences_to_dict, dict_to_sequences
|
|
622
|
+
|
|
623
|
+
# Create custom sequences
|
|
624
|
+
seq = Sequence.from_tuple(("A", "B", "C"), support=5)
|
|
625
|
+
|
|
626
|
+
# Extend sequences
|
|
627
|
+
extended = seq.extend("D") # Creates Sequence(("A", "B", "C", "D"))
|
|
628
|
+
|
|
629
|
+
# Add metadata
|
|
630
|
+
seq_with_meta = seq.with_metadata(confidence=0.85, lift=1.5)
|
|
631
|
+
|
|
632
|
+
# Convert between formats for compatibility
|
|
633
|
+
seq_result = gsp.search(min_support=0.3, return_sequences=True)
|
|
634
|
+
dict_format = sequences_to_dict(seq_result[0]) # Convert to dict
|
|
635
|
+
```
|
|
636
|
+
|
|
637
|
+
For a complete example, see [examples/sequence_example.py](examples/sequence_example.py).
|
|
638
|
+
|
|
562
639
|
### Loading SPM/GSP Format Files
|
|
563
640
|
|
|
564
641
|
GSP-Py supports loading datasets in the classical SPM/GSP delimiter format, which is widely used in sequential pattern mining research. This format uses:
|
|
@@ -904,6 +981,150 @@ For complete examples and edge cases, see:
|
|
|
904
981
|
|
|
905
982
|
---
|
|
906
983
|
|
|
984
|
+
## 🔗 Itemset Support
|
|
985
|
+
|
|
986
|
+
GSP-Py supports **itemsets** within sequence elements, enabling you to capture **co-occurrence** of multiple items at the same time step. This is crucial for applications where items occur together rather than in strict sequential order.
|
|
987
|
+
|
|
988
|
+
### What are Itemsets?
|
|
989
|
+
|
|
990
|
+
- **Flat sequences**: `['A', 'B', 'C']` - each item occurs at a separate time step
|
|
991
|
+
- **Itemset sequences**: `[['A', 'B'], ['C']]` - items A and B occur together at the first time step, then C occurs later
|
|
992
|
+
|
|
993
|
+
### Why Use Itemsets?
|
|
994
|
+
|
|
995
|
+
Itemsets are essential when temporal co-occurrence matters in your domain:
|
|
996
|
+
|
|
997
|
+
- **Market basket analysis**: Customers buy multiple items in a single shopping trip, then return for more items later
|
|
998
|
+
- **Web analytics**: Users open multiple pages in parallel tabs before moving to the next set of pages
|
|
999
|
+
- **Event logs**: Multiple events can occur simultaneously in complex systems
|
|
1000
|
+
- **Purchase patterns**: Items bought together vs. items bought in sequence
|
|
1001
|
+
|
|
1002
|
+
### Using Itemsets
|
|
1003
|
+
|
|
1004
|
+
#### Basic Example
|
|
1005
|
+
|
|
1006
|
+
```python
|
|
1007
|
+
from gsppy import GSP
|
|
1008
|
+
|
|
1009
|
+
# Itemset format: nested lists where inner lists are items that occur together
|
|
1010
|
+
transactions = [
|
|
1011
|
+
[['Bread', 'Milk'], ['Eggs']], # Bought Bread & Milk together, then Eggs later
|
|
1012
|
+
[['Bread', 'Milk', 'Butter']], # Bought all three items together
|
|
1013
|
+
[['Bread', 'Milk'], ['Eggs']], # Same pattern as customer 1
|
|
1014
|
+
]
|
|
1015
|
+
|
|
1016
|
+
gsp = GSP(transactions)
|
|
1017
|
+
patterns = gsp.search(min_support=0.5)
|
|
1018
|
+
|
|
1019
|
+
# Pattern ('Bread',) will match any itemset containing Bread
|
|
1020
|
+
# Pattern ('Bread', 'Eggs') will match sequences where Bread appears before Eggs
|
|
1021
|
+
# (even if they're in different itemsets)
|
|
1022
|
+
```
|
|
1023
|
+
|
|
1024
|
+
#### Backward Compatibility with Flat Sequences
|
|
1025
|
+
|
|
1026
|
+
GSP-Py automatically normalizes flat sequences to itemsets internally, ensuring full backward compatibility:
|
|
1027
|
+
|
|
1028
|
+
```python
|
|
1029
|
+
from gsppy import GSP
|
|
1030
|
+
|
|
1031
|
+
# These are equivalent after normalization:
|
|
1032
|
+
flat_transactions = [['A', 'B', 'C']] # Flat format
|
|
1033
|
+
itemset_transactions = [[['A'], ['B'], ['C']]] # Equivalent itemset format
|
|
1034
|
+
|
|
1035
|
+
# Both produce the same results
|
|
1036
|
+
gsp1 = GSP(flat_transactions)
|
|
1037
|
+
gsp2 = GSP(itemset_transactions)
|
|
1038
|
+
|
|
1039
|
+
# Patterns are identical
|
|
1040
|
+
patterns1 = gsp1.search(min_support=0.5)
|
|
1041
|
+
patterns2 = gsp2.search(min_support=0.5)
|
|
1042
|
+
```
|
|
1043
|
+
|
|
1044
|
+
### Itemset Matching Semantics
|
|
1045
|
+
|
|
1046
|
+
Pattern matching with itemsets uses **subset semantics**:
|
|
1047
|
+
|
|
1048
|
+
- A pattern element matches a sequence element if all items in the pattern element are present in the sequence element
|
|
1049
|
+
- Example: Pattern `[['A', 'B']]` matches sequence element `['A', 'B', 'C']` because {A, B} ⊆ {A, B, C}
|
|
1050
|
+
- Pattern elements must appear in order across the sequence
|
|
1051
|
+
|
|
1052
|
+
```python
|
|
1053
|
+
from gsppy import GSP
|
|
1054
|
+
|
|
1055
|
+
transactions = [
|
|
1056
|
+
[['A', 'B', 'D'], ['E'], ['C', 'F']], # A,B,D together, then E, then C,F together
|
|
1057
|
+
]
|
|
1058
|
+
|
|
1059
|
+
gsp = GSP(transactions)
|
|
1060
|
+
|
|
1061
|
+
# Pattern ('A', 'C') will match because:
|
|
1062
|
+
# - 'A' is in first itemset ['A', 'B', 'D'] ✓
|
|
1063
|
+
# - 'C' appears later in third itemset ['C', 'F'] ✓
|
|
1064
|
+
# - Order is preserved ✓
|
|
1065
|
+
```
|
|
1066
|
+
|
|
1067
|
+
### Reading Itemsets from SPM Format
|
|
1068
|
+
|
|
1069
|
+
The SPM/GSP format supports itemsets using delimiters:
|
|
1070
|
+
|
|
1071
|
+
- `-1`: End of itemset
|
|
1072
|
+
- `-2`: End of sequence
|
|
1073
|
+
|
|
1074
|
+
```python
|
|
1075
|
+
from gsppy.utils import read_transactions_from_spm
|
|
1076
|
+
|
|
1077
|
+
# SPM file content:
|
|
1078
|
+
# 1 2 -1 3 -1 -2
|
|
1079
|
+
# 1 -1 3 4 -1 -2
|
|
1080
|
+
|
|
1081
|
+
# Read with itemsets preserved
|
|
1082
|
+
transactions = read_transactions_from_spm("data.txt", preserve_itemsets=True)
|
|
1083
|
+
# Result: [[['1', '2'], ['3']], [['1'], ['3', '4']]]
|
|
1084
|
+
|
|
1085
|
+
# Read with itemsets flattened (backward compatible)
|
|
1086
|
+
transactions = read_transactions_from_spm("data.txt", preserve_itemsets=False)
|
|
1087
|
+
# Result: [['1', '2', '3'], ['1', '3', '4']]
|
|
1088
|
+
```
|
|
1089
|
+
|
|
1090
|
+
### Itemsets with Timestamps
|
|
1091
|
+
|
|
1092
|
+
Itemsets work seamlessly with temporal constraints:
|
|
1093
|
+
|
|
1094
|
+
```python
|
|
1095
|
+
from gsppy import GSP
|
|
1096
|
+
|
|
1097
|
+
# Itemsets with timestamps: [(item, timestamp), ...]
|
|
1098
|
+
transactions = [
|
|
1099
|
+
[[('Login', 0), ('Home', 0)], [('Product', 5)], [('Checkout', 10)]],
|
|
1100
|
+
[[('Login', 0)], [('Home', 2), ('Product', 2)], [('Checkout', 15)]],
|
|
1101
|
+
]
|
|
1102
|
+
|
|
1103
|
+
# Find patterns where events in the same itemset occur together
|
|
1104
|
+
# and subsequent itemsets occur within maxgap time units
|
|
1105
|
+
gsp = GSP(transactions, maxgap=10)
|
|
1106
|
+
patterns = gsp.search(min_support=0.5)
|
|
1107
|
+
```
|
|
1108
|
+
|
|
1109
|
+
### Complete Example
|
|
1110
|
+
|
|
1111
|
+
See [examples/itemset_example.py](examples/itemset_example.py) for comprehensive examples including:
|
|
1112
|
+
|
|
1113
|
+
- Market basket analysis with itemsets
|
|
1114
|
+
- Web clickstream with parallel page views
|
|
1115
|
+
- Comparison of flat vs. itemset semantics
|
|
1116
|
+
- Reading and processing SPM format files
|
|
1117
|
+
|
|
1118
|
+
### Key Takeaways
|
|
1119
|
+
|
|
1120
|
+
✓ **Itemsets capture co-occurrence** of items at the same time step
|
|
1121
|
+
✓ **Flat sequences are automatically normalized** to itemsets internally
|
|
1122
|
+
✓ **Both formats work seamlessly** with GSP-Py
|
|
1123
|
+
✓ **Use itemsets when temporal co-occurrence matters** in your domain
|
|
1124
|
+
✓ **SPM format supports** both flat and itemset representations
|
|
1125
|
+
|
|
1126
|
+
---
|
|
1127
|
+
|
|
907
1128
|
## ⏱️ Temporal Constraints
|
|
908
1129
|
|
|
909
1130
|
GSP-Py supports **time-constrained sequential pattern mining** with three powerful temporal constraints: `mingap`, `maxgap`, and `maxspan`. These constraints enable domain-specific applications such as medical event mining, retail analytics, and temporal user journey discovery.
|
|
@@ -39,6 +39,7 @@ Sequence Pattern (GSP)** algorithm. Ideal for market basket analysis, temporal m
|
|
|
39
39
|
- [✅ Example: Analyzing Sales Data](#example-analyzing-sales-data)
|
|
40
40
|
- [📊 Explanation: Support and Results](#explanation-support-and-results)
|
|
41
41
|
- [📊 DataFrame Input Support](#dataframe-input-support)
|
|
42
|
+
- [🔗 Itemset Support](#itemset-support)
|
|
42
43
|
- [⏱️ Temporal Constraints](#temporal-constraints)
|
|
43
44
|
7. [⌨️ Typing](#typing)
|
|
44
45
|
8. [🌟 Planned Features](#planned-features)
|
|
@@ -486,6 +487,82 @@ Verbose mode provides:
|
|
|
486
487
|
|
|
487
488
|
For complete documentation on logging, see [docs/logging.md](docs/logging.md).
|
|
488
489
|
|
|
490
|
+
### Using Sequence Objects for Rich Pattern Representation
|
|
491
|
+
|
|
492
|
+
GSP-Py 4.1+ introduces a **Sequence abstraction class** that provides a richer, more maintainable way to work with sequential patterns. The Sequence class encapsulates pattern items, support counts, and optional metadata in an immutable, hashable object.
|
|
493
|
+
|
|
494
|
+
#### Traditional Dict-based Output (Default)
|
|
495
|
+
|
|
496
|
+
```python
|
|
497
|
+
from gsppy import GSP
|
|
498
|
+
|
|
499
|
+
transactions = [
|
|
500
|
+
['Bread', 'Milk'],
|
|
501
|
+
['Bread', 'Diaper', 'Beer', 'Eggs'],
|
|
502
|
+
['Milk', 'Diaper', 'Beer', 'Coke']
|
|
503
|
+
]
|
|
504
|
+
|
|
505
|
+
gsp = GSP(transactions)
|
|
506
|
+
result = gsp.search(min_support=0.3)
|
|
507
|
+
|
|
508
|
+
# Returns one dict per pattern length, e.g. [{('Bread',): 2, ('Milk',): 2, ...}, {('Diaper', 'Beer'): 2, ...}, ...]
|
|
509
|
+
for level_patterns in result:
|
|
510
|
+
for pattern, support in level_patterns.items():
|
|
511
|
+
print(f"Pattern: {pattern}, Support: {support}")
|
|
512
|
+
```
|
|
513
|
+
|
|
514
|
+
#### Sequence Objects (New Feature)
|
|
515
|
+
|
|
516
|
+
```python
|
|
517
|
+
from gsppy import GSP
|
|
518
|
+
|
|
519
|
+
transactions = [
|
|
520
|
+
['Bread', 'Milk'],
|
|
521
|
+
['Bread', 'Diaper', 'Beer', 'Eggs'],
|
|
522
|
+
['Milk', 'Diaper', 'Beer', 'Coke']
|
|
523
|
+
]
|
|
524
|
+
|
|
525
|
+
gsp = GSP(transactions)
|
|
526
|
+
result = gsp.search(min_support=0.3, return_sequences=True)
|
|
527
|
+
|
|
528
|
+
# Returns one list per pattern length, e.g. [[Sequence(('Bread',), support=2), ...], [Sequence(('Diaper', 'Beer'), support=2), ...], ...]
|
|
529
|
+
for level_patterns in result:
|
|
530
|
+
for seq in level_patterns:
|
|
531
|
+
print(f"Pattern: {seq.items}, Support: {seq.support}, Length: {seq.length}")
|
|
532
|
+
# Access sequence properties
|
|
533
|
+
print(f" First item: {seq.first_item}, Last item: {seq.last_item}")
|
|
534
|
+
# Check if item is in sequence
|
|
535
|
+
if "Milk" in seq:
|
|
536
|
+
print(f" Contains Milk!")
|
|
537
|
+
```
|
|
538
|
+
|
|
539
|
+
#### Key Benefits of Sequence Objects
|
|
540
|
+
|
|
541
|
+
1. **Rich API**: Access pattern properties like `length`, `first_item`, `last_item`
|
|
542
|
+
2. **Type Safety**: IDE autocomplete and better type hints
|
|
543
|
+
3. **Immutable & Hashable**: Can be used as dictionary keys
|
|
544
|
+
4. **Extensible**: Add metadata for confidence, lift, or custom properties
|
|
545
|
+
5. **Backward Compatible**: Convert to/from dict format as needed
|
|
546
|
+
|
|
547
|
+
```python
|
|
548
|
+
from gsppy import Sequence, sequences_to_dict, dict_to_sequences
|
|
549
|
+
|
|
550
|
+
# Create custom sequences
|
|
551
|
+
seq = Sequence.from_tuple(("A", "B", "C"), support=5)
|
|
552
|
+
|
|
553
|
+
# Extend sequences
|
|
554
|
+
extended = seq.extend("D") # Creates Sequence(("A", "B", "C", "D"))
|
|
555
|
+
|
|
556
|
+
# Add metadata
|
|
557
|
+
seq_with_meta = seq.with_metadata(confidence=0.85, lift=1.5)
|
|
558
|
+
|
|
559
|
+
# Convert between formats for compatibility
|
|
560
|
+
seq_result = gsp.search(min_support=0.3, return_sequences=True)
|
|
561
|
+
dict_format = sequences_to_dict(seq_result[0]) # Convert to dict
|
|
562
|
+
```
|
|
563
|
+
|
|
564
|
+
For a complete example, see [examples/sequence_example.py](examples/sequence_example.py).
|
|
565
|
+
|
|
489
566
|
### Loading SPM/GSP Format Files
|
|
490
567
|
|
|
491
568
|
GSP-Py supports loading datasets in the classical SPM/GSP delimiter format, which is widely used in sequential pattern mining research. This format uses:
|
|
@@ -831,6 +908,150 @@ For complete examples and edge cases, see:
|
|
|
831
908
|
|
|
832
909
|
---
|
|
833
910
|
|
|
911
|
+
## 🔗 Itemset Support
|
|
912
|
+
|
|
913
|
+
GSP-Py supports **itemsets** within sequence elements, enabling you to capture **co-occurrence** of multiple items at the same time step. This is crucial for applications where items occur together rather than in strict sequential order.
|
|
914
|
+
|
|
915
|
+
### What are Itemsets?
|
|
916
|
+
|
|
917
|
+
- **Flat sequences**: `['A', 'B', 'C']` - each item occurs at a separate time step
|
|
918
|
+
- **Itemset sequences**: `[['A', 'B'], ['C']]` - items A and B occur together at the first time step, then C occurs later
|
|
919
|
+
|
|
920
|
+
### Why Use Itemsets?
|
|
921
|
+
|
|
922
|
+
Itemsets are essential when temporal co-occurrence matters in your domain:
|
|
923
|
+
|
|
924
|
+
- **Market basket analysis**: Customers buy multiple items in a single shopping trip, then return for more items later
|
|
925
|
+
- **Web analytics**: Users open multiple pages in parallel tabs before moving to the next set of pages
|
|
926
|
+
- **Event logs**: Multiple events can occur simultaneously in complex systems
|
|
927
|
+
- **Purchase patterns**: Items bought together vs. items bought in sequence
|
|
928
|
+
|
|
929
|
+
### Using Itemsets
|
|
930
|
+
|
|
931
|
+
#### Basic Example
|
|
932
|
+
|
|
933
|
+
```python
|
|
934
|
+
from gsppy import GSP
|
|
935
|
+
|
|
936
|
+
# Itemset format: nested lists where inner lists are items that occur together
|
|
937
|
+
transactions = [
|
|
938
|
+
[['Bread', 'Milk'], ['Eggs']], # Bought Bread & Milk together, then Eggs later
|
|
939
|
+
[['Bread', 'Milk', 'Butter']], # Bought all three items together
|
|
940
|
+
[['Bread', 'Milk'], ['Eggs']], # Same pattern as customer 1
|
|
941
|
+
]
|
|
942
|
+
|
|
943
|
+
gsp = GSP(transactions)
|
|
944
|
+
patterns = gsp.search(min_support=0.5)
|
|
945
|
+
|
|
946
|
+
# Pattern ('Bread',) will match any itemset containing Bread
|
|
947
|
+
# Pattern ('Bread', 'Eggs') will match sequences where Bread appears before Eggs
|
|
948
|
+
# (even if they're in different itemsets)
|
|
949
|
+
```
|
|
950
|
+
|
|
951
|
+
#### Backward Compatibility with Flat Sequences
|
|
952
|
+
|
|
953
|
+
GSP-Py automatically normalizes flat sequences to itemsets internally, ensuring full backward compatibility:
|
|
954
|
+
|
|
955
|
+
```python
|
|
956
|
+
from gsppy import GSP
|
|
957
|
+
|
|
958
|
+
# These are equivalent after normalization:
|
|
959
|
+
flat_transactions = [['A', 'B', 'C']] # Flat format
|
|
960
|
+
itemset_transactions = [[['A'], ['B'], ['C']]] # Equivalent itemset format
|
|
961
|
+
|
|
962
|
+
# Both produce the same results
|
|
963
|
+
gsp1 = GSP(flat_transactions)
|
|
964
|
+
gsp2 = GSP(itemset_transactions)
|
|
965
|
+
|
|
966
|
+
# Patterns are identical
|
|
967
|
+
patterns1 = gsp1.search(min_support=0.5)
|
|
968
|
+
patterns2 = gsp2.search(min_support=0.5)
|
|
969
|
+
```
|
|
970
|
+
|
|
971
|
+
### Itemset Matching Semantics
|
|
972
|
+
|
|
973
|
+
Pattern matching with itemsets uses **subset semantics**:
|
|
974
|
+
|
|
975
|
+
- A pattern element matches a sequence element if all items in the pattern element are present in the sequence element
|
|
976
|
+
- Example: Pattern `[['A', 'B']]` matches sequence element `['A', 'B', 'C']` because {A, B} ⊆ {A, B, C}
|
|
977
|
+
- Pattern elements must appear in order across the sequence
|
|
978
|
+
|
|
979
|
+
```python
|
|
980
|
+
from gsppy import GSP
|
|
981
|
+
|
|
982
|
+
transactions = [
|
|
983
|
+
[['A', 'B', 'D'], ['E'], ['C', 'F']], # A,B,D together, then E, then C,F together
|
|
984
|
+
]
|
|
985
|
+
|
|
986
|
+
gsp = GSP(transactions)
|
|
987
|
+
|
|
988
|
+
# Pattern ('A', 'C') will match because:
|
|
989
|
+
# - 'A' is in first itemset ['A', 'B', 'D'] ✓
|
|
990
|
+
# - 'C' appears later in third itemset ['C', 'F'] ✓
|
|
991
|
+
# - Order is preserved ✓
|
|
992
|
+
```
|
|
993
|
+
|
|
994
|
+
### Reading Itemsets from SPM Format
|
|
995
|
+
|
|
996
|
+
The SPM/GSP format supports itemsets using delimiters:
|
|
997
|
+
|
|
998
|
+
- `-1`: End of itemset
|
|
999
|
+
- `-2`: End of sequence
|
|
1000
|
+
|
|
1001
|
+
```python
|
|
1002
|
+
from gsppy.utils import read_transactions_from_spm
|
|
1003
|
+
|
|
1004
|
+
# SPM file content:
|
|
1005
|
+
# 1 2 -1 3 -1 -2
|
|
1006
|
+
# 1 -1 3 4 -1 -2
|
|
1007
|
+
|
|
1008
|
+
# Read with itemsets preserved
|
|
1009
|
+
transactions = read_transactions_from_spm("data.txt", preserve_itemsets=True)
|
|
1010
|
+
# Result: [[['1', '2'], ['3']], [['1'], ['3', '4']]]
|
|
1011
|
+
|
|
1012
|
+
# Read with itemsets flattened (backward compatible)
|
|
1013
|
+
transactions = read_transactions_from_spm("data.txt", preserve_itemsets=False)
|
|
1014
|
+
# Result: [['1', '2', '3'], ['1', '3', '4']]
|
|
1015
|
+
```
|
|
1016
|
+
|
|
1017
|
+
### Itemsets with Timestamps
|
|
1018
|
+
|
|
1019
|
+
Itemsets work seamlessly with temporal constraints:
|
|
1020
|
+
|
|
1021
|
+
```python
|
|
1022
|
+
from gsppy import GSP
|
|
1023
|
+
|
|
1024
|
+
# Itemsets with timestamps: [(item, timestamp), ...]
|
|
1025
|
+
transactions = [
|
|
1026
|
+
[[('Login', 0), ('Home', 0)], [('Product', 5)], [('Checkout', 10)]],
|
|
1027
|
+
[[('Login', 0)], [('Home', 2), ('Product', 2)], [('Checkout', 15)]],
|
|
1028
|
+
]
|
|
1029
|
+
|
|
1030
|
+
# Find patterns where events in the same itemset occur together
|
|
1031
|
+
# and subsequent itemsets occur within maxgap time units
|
|
1032
|
+
gsp = GSP(transactions, maxgap=10)
|
|
1033
|
+
patterns = gsp.search(min_support=0.5)
|
|
1034
|
+
```
|
|
1035
|
+
|
|
1036
|
+
### Complete Example
|
|
1037
|
+
|
|
1038
|
+
See [examples/itemset_example.py](examples/itemset_example.py) for comprehensive examples including:
|
|
1039
|
+
|
|
1040
|
+
- Market basket analysis with itemsets
|
|
1041
|
+
- Web clickstream with parallel page views
|
|
1042
|
+
- Comparison of flat vs. itemset semantics
|
|
1043
|
+
- Reading and processing SPM format files
|
|
1044
|
+
|
|
1045
|
+
### Key Takeaways
|
|
1046
|
+
|
|
1047
|
+
✓ **Itemsets capture co-occurrence** of items at the same time step
|
|
1048
|
+
✓ **Flat sequences are automatically normalized** to itemsets internally
|
|
1049
|
+
✓ **Both formats work seamlessly** with GSP-Py
|
|
1050
|
+
✓ **Use itemsets when temporal co-occurrence matters** in your domain
|
|
1051
|
+
✓ **SPM format supports** both flat and itemset representations
|
|
1052
|
+
|
|
1053
|
+
---
|
|
1054
|
+
|
|
834
1055
|
## ⏱️ Temporal Constraints
|
|
835
1056
|
|
|
836
1057
|
GSP-Py supports **time-constrained sequential pattern mining** with three powerful temporal constraints: `mingap`, `maxgap`, and `maxspan`. These constraints enable domain-specific applications such as medical event mining, retail analytics, and temporal user journey discovery.
|
|
@@ -24,6 +24,12 @@ from gsppy.pruning import (
|
|
|
24
24
|
FrequencyBasedPruning,
|
|
25
25
|
create_default_pruning_strategy,
|
|
26
26
|
)
|
|
27
|
+
from gsppy.sequence import (
|
|
28
|
+
Sequence,
|
|
29
|
+
sequences_to_dict,
|
|
30
|
+
dict_to_sequences,
|
|
31
|
+
to_sequence,
|
|
32
|
+
)
|
|
27
33
|
from gsppy.token_mapper import TokenMapper
|
|
28
34
|
|
|
29
35
|
# DataFrame adapters are optional - import only if dependencies are available
|
|
@@ -63,6 +69,10 @@ __all__ = [
|
|
|
63
69
|
"TemporalAwarePruning",
|
|
64
70
|
"CombinedPruning",
|
|
65
71
|
"create_default_pruning_strategy",
|
|
72
|
+
"Sequence",
|
|
73
|
+
"sequences_to_dict",
|
|
74
|
+
"dict_to_sequences",
|
|
75
|
+
"to_sequence",
|
|
66
76
|
"TokenMapper",
|
|
67
77
|
]
|
|
68
78
|
|
|
@@ -35,7 +35,7 @@ import csv
|
|
|
35
35
|
import sys
|
|
36
36
|
import json
|
|
37
37
|
import logging
|
|
38
|
-
from typing import Any,
|
|
38
|
+
from typing import Any, List, Tuple, Union, Optional, cast
|
|
39
39
|
|
|
40
40
|
import click
|
|
41
41
|
|
|
@@ -608,7 +608,7 @@ def main(
|
|
|
608
608
|
# Initialize and run GSP algorithm
|
|
609
609
|
try:
|
|
610
610
|
gsp = GSP(transactions, mingap=mingap, maxgap=maxgap, maxspan=maxspan, verbose=verbose)
|
|
611
|
-
patterns
|
|
611
|
+
patterns = gsp.search(min_support=min_support, return_sequences=False)
|
|
612
612
|
logger.info("Frequent Patterns Found:")
|
|
613
613
|
for i, level in enumerate(patterns, start=1):
|
|
614
614
|
logger.info(f"\n{i}-Sequence Patterns:")
|