gsppy 3.0.1__py3-none-any.whl → 3.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gsppy/utils.py +23 -6
- {gsppy-3.0.1.dist-info → gsppy-3.1.1.dist-info}/METADATA +50 -16
- gsppy-3.1.1.dist-info/RECORD +10 -0
- {gsppy-3.0.1.dist-info → gsppy-3.1.1.dist-info}/WHEEL +1 -1
- gsppy-3.0.1.dist-info/RECORD +0 -10
- {gsppy-3.0.1.dist-info → gsppy-3.1.1.dist-info}/entry_points.txt +0 -0
- {gsppy-3.0.1.dist-info → gsppy-3.1.1.dist-info}/licenses/LICENSE +0 -0
gsppy/utils.py
CHANGED
|
@@ -5,14 +5,14 @@ and generating candidate patterns from previously frequent patterns.
|
|
|
5
5
|
|
|
6
6
|
The key functionalities include:
|
|
7
7
|
1. Splitting a list of items into smaller batches for easier processing.
|
|
8
|
-
2. Checking for the existence of
|
|
8
|
+
2. Checking for the existence of an ordered (non-contiguous) subsequence within a sequence,
|
|
9
9
|
with caching to optimize repeated comparisons.
|
|
10
10
|
3. Generating candidate patterns from a dictionary of frequent patterns
|
|
11
11
|
to support pattern generation tasks in algorithms like sequence mining.
|
|
12
12
|
|
|
13
13
|
Main functionalities:
|
|
14
14
|
- `split_into_batches`: Splits a list of items into smaller batches based on a specified batch size.
|
|
15
|
-
- `is_subsequence_in_list`: Determines if a subsequence exists within another sequence,
|
|
15
|
+
- `is_subsequence_in_list`: Determines if a subsequence exists within another sequence in order,
|
|
16
16
|
using caching to improve performance.
|
|
17
17
|
- `generate_candidates_from_previous`: Generates candidate patterns by joining previously
|
|
18
18
|
identified frequent patterns.
|
|
@@ -46,7 +46,10 @@ def split_into_batches(
|
|
|
46
46
|
@lru_cache(maxsize=None)
|
|
47
47
|
def is_subsequence_in_list(subsequence: Tuple[str, ...], sequence: Tuple[str, ...]) -> bool:
|
|
48
48
|
"""
|
|
49
|
-
Check if a subsequence exists within a sequence as
|
|
49
|
+
Check if a subsequence exists within a sequence as an ordered (non-contiguous) subsequence.
|
|
50
|
+
|
|
51
|
+
This function implements the standard GSP semantics where items in the subsequence
|
|
52
|
+
must appear in the same order in the sequence, but not necessarily contiguously.
|
|
50
53
|
|
|
51
54
|
Parameters:
|
|
52
55
|
subsequence (tuple): The subsequence to search for.
|
|
@@ -54,6 +57,14 @@ def is_subsequence_in_list(subsequence: Tuple[str, ...], sequence: Tuple[str, ..
|
|
|
54
57
|
|
|
55
58
|
Returns:
|
|
56
59
|
bool: True if the subsequence is found, False otherwise.
|
|
60
|
+
|
|
61
|
+
Examples:
|
|
62
|
+
>>> is_subsequence_in_list(('a', 'c'), ('a', 'b', 'c'))
|
|
63
|
+
True
|
|
64
|
+
>>> is_subsequence_in_list(('a', 'c'), ('c', 'a'))
|
|
65
|
+
False
|
|
66
|
+
>>> is_subsequence_in_list(('a', 'b'), ('a', 'b', 'c'))
|
|
67
|
+
True
|
|
57
68
|
"""
|
|
58
69
|
# Handle the case where the subsequence is empty - it should not exist in any sequence
|
|
59
70
|
if not subsequence:
|
|
@@ -61,12 +72,18 @@ def is_subsequence_in_list(subsequence: Tuple[str, ...], sequence: Tuple[str, ..
|
|
|
61
72
|
|
|
62
73
|
len_sub, len_seq = len(subsequence), len(sequence)
|
|
63
74
|
|
|
64
|
-
# Return False if the
|
|
75
|
+
# Return False if the subsequence is longer than the sequence
|
|
65
76
|
if len_sub > len_seq:
|
|
66
77
|
return False
|
|
67
78
|
|
|
68
|
-
# Use
|
|
69
|
-
|
|
79
|
+
# Use two-pointer approach to check if subsequence exists in order
|
|
80
|
+
sub_idx = 0
|
|
81
|
+
for seq_idx in range(len_seq):
|
|
82
|
+
if sequence[seq_idx] == subsequence[sub_idx]:
|
|
83
|
+
sub_idx += 1
|
|
84
|
+
if sub_idx == len_sub:
|
|
85
|
+
return True
|
|
86
|
+
return False
|
|
70
87
|
|
|
71
88
|
|
|
72
89
|
def generate_candidates_from_previous(prev_patterns: Dict[Tuple[str, ...], int]) -> List[Tuple[str, ...]]:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: gsppy
|
|
3
|
-
Version: 3.
|
|
3
|
+
Version: 3.1.1
|
|
4
4
|
Summary: GSP (Generalized Sequence Pattern) algorithm in Python
|
|
5
5
|
Project-URL: Homepage, https://github.com/jacksonpradolima/gsp-py
|
|
6
6
|
Author-email: Jackson Antonio do Prado Lima <jacksonpradolima@gmail.com>
|
|
@@ -41,21 +41,21 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
|
41
41
|
Requires-Python: >=3.10
|
|
42
42
|
Requires-Dist: click>=8.0.0
|
|
43
43
|
Provides-Extra: dev
|
|
44
|
-
Requires-Dist: cython==3.1.
|
|
45
|
-
Requires-Dist: hatch==1.
|
|
44
|
+
Requires-Dist: cython==3.1.4; extra == 'dev'
|
|
45
|
+
Requires-Dist: hatch==1.15.1; extra == 'dev'
|
|
46
46
|
Requires-Dist: hatchling==1.27.0; extra == 'dev'
|
|
47
|
-
Requires-Dist: mypy==1.18.
|
|
48
|
-
Requires-Dist: pylint==
|
|
49
|
-
Requires-Dist: pyright==1.1.
|
|
47
|
+
Requires-Dist: mypy==1.18.2; extra == 'dev'
|
|
48
|
+
Requires-Dist: pylint==4.0.2; extra == 'dev'
|
|
49
|
+
Requires-Dist: pyright==1.1.406; extra == 'dev'
|
|
50
50
|
Requires-Dist: pytest-benchmark==5.1.0; extra == 'dev'
|
|
51
51
|
Requires-Dist: pytest-cov==7.0.0; extra == 'dev'
|
|
52
52
|
Requires-Dist: pytest==8.4.2; extra == 'dev'
|
|
53
|
-
Requires-Dist: ruff==0.13.
|
|
54
|
-
Requires-Dist: tox==4.
|
|
53
|
+
Requires-Dist: ruff==0.13.3; extra == 'dev'
|
|
54
|
+
Requires-Dist: tox==4.32.0; extra == 'dev'
|
|
55
55
|
Provides-Extra: gpu
|
|
56
56
|
Requires-Dist: cupy<14,>=11; extra == 'gpu'
|
|
57
57
|
Provides-Extra: rust
|
|
58
|
-
Requires-Dist: maturin==1.9.
|
|
58
|
+
Requires-Dist: maturin==1.9.6; extra == 'rust'
|
|
59
59
|
Description-Content-Type: text/markdown
|
|
60
60
|
|
|
61
61
|
[]()
|
|
@@ -104,14 +104,15 @@ principles**. Using support thresholds, GSP identifies frequent sequences of ite
|
|
|
104
104
|
|
|
105
105
|
### Key Features:
|
|
106
106
|
|
|
107
|
+
- **Ordered (non-contiguous) matching**: Detects patterns where items appear in order but not necessarily adjacent, following standard GSP semantics. For example, the pattern `('A', 'C')` is found in the sequence `['A', 'B', 'C']`.
|
|
107
108
|
- **Support-based pruning**: Only retains sequences that meet the minimum support threshold.
|
|
108
109
|
- **Candidate generation**: Iteratively generates candidate sequences of increasing length.
|
|
109
110
|
- **General-purpose**: Useful in retail, web analytics, social networks, temporal sequence mining, and more.
|
|
110
111
|
|
|
111
112
|
For example:
|
|
112
113
|
|
|
113
|
-
- In a shopping dataset, GSP can identify patterns like "Customers who buy bread and milk often purchase diapers next."
|
|
114
|
-
- In a website clickstream, GSP might find patterns like "Users visit A, then go to
|
|
114
|
+
- In a shopping dataset, GSP can identify patterns like "Customers who buy bread and milk often purchase diapers next" - even if other items appear between bread and milk.
|
|
115
|
+
- In a website clickstream, GSP might find patterns like "Users visit A, then eventually go to C" - capturing user journeys with intermediate steps.
|
|
115
116
|
|
|
116
117
|
---
|
|
117
118
|
|
|
@@ -427,24 +428,57 @@ Sample Output:
|
|
|
427
428
|
```python
|
|
428
429
|
[
|
|
429
430
|
{('Bread',): 4, ('Milk',): 4, ('Diaper',): 4, ('Beer',): 3, ('Coke',): 2},
|
|
430
|
-
{('Bread', 'Milk'): 3, ('Milk', 'Diaper'): 3, ('Diaper', 'Beer'): 3},
|
|
431
|
-
{('Bread', 'Milk', 'Diaper'): 2, ('Milk', 'Diaper', 'Beer'): 2}
|
|
431
|
+
{('Bread', 'Milk'): 3, ('Bread', 'Diaper'): 3, ('Bread', 'Beer'): 2, ('Milk', 'Diaper'): 3, ('Milk', 'Beer'): 2, ('Milk', 'Coke'): 2, ('Diaper', 'Beer'): 3, ('Diaper', 'Coke'): 2},
|
|
432
|
+
{('Bread', 'Milk', 'Diaper'): 2, ('Bread', 'Diaper', 'Beer'): 2, ('Milk', 'Diaper', 'Beer'): 2, ('Milk', 'Diaper', 'Coke'): 2}
|
|
432
433
|
]
|
|
433
434
|
```
|
|
434
435
|
|
|
435
436
|
- The **first dictionary** contains single-item sequences with their frequencies (e.g., `('Bread',): 4` means "Bread"
|
|
436
437
|
appears in 4 transactions).
|
|
437
438
|
- The **second dictionary** contains 2-item sequential patterns (e.g., `('Bread', 'Milk'): 3` means the sequence "
|
|
438
|
-
Bread → Milk" appears in 3 transactions).
|
|
439
|
+
Bread → Milk" appears in 3 transactions). Note that patterns like `('Bread', 'Beer')` are detected even when their items are not adjacent in a transaction - the items only need to appear in order.
|
|
439
440
|
- The **third dictionary** contains 3-item sequential patterns (e.g., `('Bread', 'Milk', 'Diaper'): 2` means the
|
|
440
441
|
sequence "Bread → Milk → Diaper" appears in 2 transactions).
|
|
441
442
|
|
|
442
443
|
> [!NOTE]
|
|
443
|
-
> The **support** of a sequence is calculated as the fraction of transactions containing the sequence, e.g.,
|
|
444
|
-
`
|
|
444
|
+
> The **support** of a sequence is calculated as the fraction of transactions containing the sequence **in order** (not necessarily contiguously), e.g.,
|
|
445
|
+
`('Bread', 'Milk')` appears in 3 out of 5 transactions → Support = `3 / 5 = 0.6` (60%).
|
|
445
446
|
> This insight helps identify frequently occurring sequential patterns in datasets, such as shopping trends or user
|
|
446
447
|
> behavior.
|
|
447
448
|
|
|
449
|
+
> [!IMPORTANT]
|
|
450
|
+
> **Non-contiguous (ordered) matching**: GSP-Py detects patterns where items appear in the specified order but not necessarily adjacent. For example, the pattern `('Bread', 'Beer')` matches the transaction `['Bread', 'Milk', 'Diaper', 'Beer']` because Bread appears before Beer, even though they are not adjacent. This follows the standard GSP algorithm semantics for sequential pattern mining.
|
|
451
|
+
|
|
452
|
+
### Understanding Non-Contiguous Pattern Matching
|
|
453
|
+
|
|
454
|
+
GSP-Py follows the standard GSP algorithm semantics by detecting **ordered (non-contiguous)** subsequences. This means:
|
|
455
|
+
|
|
456
|
+
- ✅ **Order matters**: Items must appear in the specified sequence order
|
|
457
|
+
- ✅ **Gaps allowed**: Items don't need to be adjacent
|
|
458
|
+
- ❌ **Wrong order rejected**: Items appearing in different order won't match
|
|
459
|
+
|
|
460
|
+
**Example:**
|
|
461
|
+
|
|
462
|
+
```python
|
|
463
|
+
from gsppy.gsp import GSP
|
|
464
|
+
|
|
465
|
+
sequences = [
|
|
466
|
+
['a', 'b', 'c'], # Contains: (a,b), (a,c), (b,c), (a,b,c)
|
|
467
|
+
['a', 'c'], # Contains: (a,c)
|
|
468
|
+
['b', 'c', 'a'], # Contains: (b,c), (b,a), (c,a)
|
|
469
|
+
['a', 'b', 'c', 'd'], # Contains: (a,b), (a,c), (a,d), (b,c), (b,d), (c,d), etc.
|
|
470
|
+
]
|
|
471
|
+
|
|
472
|
+
gsp = GSP(sequences)
|
|
473
|
+
result = gsp.search(min_support=0.5) # Need at least 2/4 sequences
|
|
474
|
+
|
|
475
|
+
# Pattern ('a', 'c') is found with a support count of 3 because:
|
|
476
|
+
# - It appears in ['a', 'b', 'c'] (with 'b' in between)
|
|
477
|
+
# - It appears in ['a', 'c'] (adjacent)
|
|
478
|
+
# - It appears in ['a', 'b', 'c', 'd'] (with 'b' in between)
|
|
479
|
+
# Total: 3 out of 4 sequences = 75% support ✅
|
|
480
|
+
```
|
|
481
|
+
|
|
448
482
|
|
|
449
483
|
> [!TIP]
|
|
450
484
|
> For more complex examples, find example scripts in the [`gsppy/tests`](gsppy/tests) folder.
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
gsppy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
gsppy/accelerate.py,sha256=YO3YQFzo2VAC6IXOTnQnOajkZO7SabkieGb1IPgWdSI,10407
|
|
3
|
+
gsppy/cli.py,sha256=wsGoc_utxpRfgCF9vPOAyLDTOJZ8NaiwiUny5VyIYvQ,6567
|
|
4
|
+
gsppy/gsp.py,sha256=GCHFhOu-DyHEPsse_OXzf9IaZoigF8ouRqgn_OsZBvA,14855
|
|
5
|
+
gsppy/utils.py,sha256=KtjfDgsTwvwxIyA2KCQmgu8cFkBqQvMZN8Ct5NB60Tc,3952
|
|
6
|
+
gsppy-3.1.1.dist-info/METADATA,sha256=uN-rN-CzsrwW_uh4s60DUevIKjm5CuiYyHRh5cgyKqQ,19819
|
|
7
|
+
gsppy-3.1.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
8
|
+
gsppy-3.1.1.dist-info/entry_points.txt,sha256=smvmcIWk424ARIGKOC_BM42hpT_SptKPcIeqs-8u8lM,41
|
|
9
|
+
gsppy-3.1.1.dist-info/licenses/LICENSE,sha256=AlXanKSqFzo_o-87gp3Qw3XzbmnfxYy7O0xJOcQGWJo,1086
|
|
10
|
+
gsppy-3.1.1.dist-info/RECORD,,
|
gsppy-3.0.1.dist-info/RECORD
DELETED
|
@@ -1,10 +0,0 @@
|
|
|
1
|
-
gsppy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
-
gsppy/accelerate.py,sha256=YO3YQFzo2VAC6IXOTnQnOajkZO7SabkieGb1IPgWdSI,10407
|
|
3
|
-
gsppy/cli.py,sha256=wsGoc_utxpRfgCF9vPOAyLDTOJZ8NaiwiUny5VyIYvQ,6567
|
|
4
|
-
gsppy/gsp.py,sha256=GCHFhOu-DyHEPsse_OXzf9IaZoigF8ouRqgn_OsZBvA,14855
|
|
5
|
-
gsppy/utils.py,sha256=YlV0F64lnd2Xymf6XnYr6mMLYWV2f2yjaHkZbAS1Qs0,3362
|
|
6
|
-
gsppy-3.0.1.dist-info/METADATA,sha256=vt35btl69hnEM4R1Kz-U5m-2MHEN5hdkHBk-uJ2eKAw,17670
|
|
7
|
-
gsppy-3.0.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
8
|
-
gsppy-3.0.1.dist-info/entry_points.txt,sha256=smvmcIWk424ARIGKOC_BM42hpT_SptKPcIeqs-8u8lM,41
|
|
9
|
-
gsppy-3.0.1.dist-info/licenses/LICENSE,sha256=AlXanKSqFzo_o-87gp3Qw3XzbmnfxYy7O0xJOcQGWJo,1086
|
|
10
|
-
gsppy-3.0.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|