gsppy 3.0.1__tar.gz → 3.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gsppy-3.0.1 → gsppy-3.1.1}/PKG-INFO +50 -16
- {gsppy-3.0.1 → gsppy-3.1.1}/README.md +41 -7
- {gsppy-3.0.1 → gsppy-3.1.1}/gsppy/utils.py +23 -6
- {gsppy-3.0.1 → gsppy-3.1.1}/pyproject.toml +9 -9
- {gsppy-3.0.1 → gsppy-3.1.1}/tests/test_gsp.py +142 -2
- {gsppy-3.0.1 → gsppy-3.1.1}/tests/test_utils.py +68 -4
- {gsppy-3.0.1 → gsppy-3.1.1}/.gitignore +0 -0
- {gsppy-3.0.1 → gsppy-3.1.1}/CHANGELOG.md +0 -0
- {gsppy-3.0.1 → gsppy-3.1.1}/CONTRIBUTING.md +0 -0
- {gsppy-3.0.1 → gsppy-3.1.1}/LICENSE +0 -0
- {gsppy-3.0.1 → gsppy-3.1.1}/SECURITY.md +0 -0
- {gsppy-3.0.1 → gsppy-3.1.1}/gsppy/__init__.py +0 -0
- {gsppy-3.0.1 → gsppy-3.1.1}/gsppy/accelerate.py +0 -0
- {gsppy-3.0.1 → gsppy-3.1.1}/gsppy/cli.py +0 -0
- {gsppy-3.0.1 → gsppy-3.1.1}/gsppy/gsp.py +0 -0
- {gsppy-3.0.1 → gsppy-3.1.1}/mypy.ini +0 -0
- {gsppy-3.0.1 → gsppy-3.1.1}/rust/Cargo.lock +0 -0
- {gsppy-3.0.1 → gsppy-3.1.1}/rust/Cargo.toml +0 -0
- {gsppy-3.0.1 → gsppy-3.1.1}/rust/src/lib.rs +0 -0
- {gsppy-3.0.1 → gsppy-3.1.1}/tests/__init__.py +0 -0
- {gsppy-3.0.1 → gsppy-3.1.1}/tests/test_cli.py +0 -0
- {gsppy-3.0.1 → gsppy-3.1.1}/tox.ini +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: gsppy
|
|
3
|
-
Version: 3.0.1
|
|
3
|
+
Version: 3.1.1
|
|
4
4
|
Summary: GSP (Generalized Sequence Pattern) algorithm in Python
|
|
5
5
|
Project-URL: Homepage, https://github.com/jacksonpradolima/gsp-py
|
|
6
6
|
Author-email: Jackson Antonio do Prado Lima <jacksonpradolima@gmail.com>
|
|
@@ -41,21 +41,21 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
|
41
41
|
Requires-Python: >=3.10
|
|
42
42
|
Requires-Dist: click>=8.0.0
|
|
43
43
|
Provides-Extra: dev
|
|
44
|
-
Requires-Dist: cython==3.1.
|
|
45
|
-
Requires-Dist: hatch==1.
|
|
44
|
+
Requires-Dist: cython==3.1.4; extra == 'dev'
|
|
45
|
+
Requires-Dist: hatch==1.15.1; extra == 'dev'
|
|
46
46
|
Requires-Dist: hatchling==1.27.0; extra == 'dev'
|
|
47
|
-
Requires-Dist: mypy==1.18.
|
|
48
|
-
Requires-Dist: pylint==
|
|
49
|
-
Requires-Dist: pyright==1.1.
|
|
47
|
+
Requires-Dist: mypy==1.18.2; extra == 'dev'
|
|
48
|
+
Requires-Dist: pylint==4.0.2; extra == 'dev'
|
|
49
|
+
Requires-Dist: pyright==1.1.406; extra == 'dev'
|
|
50
50
|
Requires-Dist: pytest-benchmark==5.1.0; extra == 'dev'
|
|
51
51
|
Requires-Dist: pytest-cov==7.0.0; extra == 'dev'
|
|
52
52
|
Requires-Dist: pytest==8.4.2; extra == 'dev'
|
|
53
|
-
Requires-Dist: ruff==0.13.
|
|
54
|
-
Requires-Dist: tox==4.
|
|
53
|
+
Requires-Dist: ruff==0.13.3; extra == 'dev'
|
|
54
|
+
Requires-Dist: tox==4.32.0; extra == 'dev'
|
|
55
55
|
Provides-Extra: gpu
|
|
56
56
|
Requires-Dist: cupy<14,>=11; extra == 'gpu'
|
|
57
57
|
Provides-Extra: rust
|
|
58
|
-
Requires-Dist: maturin==1.9.
|
|
58
|
+
Requires-Dist: maturin==1.9.6; extra == 'rust'
|
|
59
59
|
Description-Content-Type: text/markdown
|
|
60
60
|
|
|
61
61
|
[]()
|
|
@@ -104,14 +104,15 @@ principles**. Using support thresholds, GSP identifies frequent sequences of ite
|
|
|
104
104
|
|
|
105
105
|
### Key Features:
|
|
106
106
|
|
|
107
|
+
- **Ordered (non-contiguous) matching**: Detects patterns where items appear in order but not necessarily adjacent, following standard GSP semantics. For example, the pattern `('A', 'C')` is found in the sequence `['A', 'B', 'C']`.
|
|
107
108
|
- **Support-based pruning**: Only retains sequences that meet the minimum support threshold.
|
|
108
109
|
- **Candidate generation**: Iteratively generates candidate sequences of increasing length.
|
|
109
110
|
- **General-purpose**: Useful in retail, web analytics, social networks, temporal sequence mining, and more.
|
|
110
111
|
|
|
111
112
|
For example:
|
|
112
113
|
|
|
113
|
-
- In a shopping dataset, GSP can identify patterns like "Customers who buy bread and milk often purchase diapers next."
|
|
114
|
-
- In a website clickstream, GSP might find patterns like "Users visit A, then go to
|
|
114
|
+
- In a shopping dataset, GSP can identify patterns like "Customers who buy bread and milk often purchase diapers next" - even if other items appear between bread and milk.
|
|
115
|
+
- In a website clickstream, GSP might find patterns like "Users visit A, then eventually go to C" - capturing user journeys with intermediate steps.
|
|
115
116
|
|
|
116
117
|
---
|
|
117
118
|
|
|
@@ -427,24 +428,57 @@ Sample Output:
|
|
|
427
428
|
```python
|
|
428
429
|
[
|
|
429
430
|
{('Bread',): 4, ('Milk',): 4, ('Diaper',): 4, ('Beer',): 3, ('Coke',): 2},
|
|
430
|
-
{('Bread', 'Milk'): 3, ('Milk', 'Diaper'): 3, ('Diaper', 'Beer'): 3},
|
|
431
|
-
{('Bread', 'Milk', 'Diaper'): 2, ('Milk', 'Diaper', 'Beer'): 2}
|
|
431
|
+
{('Bread', 'Milk'): 3, ('Bread', 'Diaper'): 3, ('Bread', 'Beer'): 2, ('Milk', 'Diaper'): 3, ('Milk', 'Beer'): 2, ('Milk', 'Coke'): 2, ('Diaper', 'Beer'): 3, ('Diaper', 'Coke'): 2},
|
|
432
|
+
{('Bread', 'Milk', 'Diaper'): 2, ('Bread', 'Diaper', 'Beer'): 2, ('Milk', 'Diaper', 'Beer'): 2, ('Milk', 'Diaper', 'Coke'): 2}
|
|
432
433
|
]
|
|
433
434
|
```
|
|
434
435
|
|
|
435
436
|
- The **first dictionary** contains single-item sequences with their frequencies (e.g., `('Bread',): 4` means "Bread"
|
|
436
437
|
appears in 4 transactions).
|
|
437
438
|
- The **second dictionary** contains 2-item sequential patterns (e.g., `('Bread', 'Milk'): 3` means the sequence "
|
|
438
|
-
Bread → Milk" appears in 3 transactions).
|
|
439
|
+
Bread → Milk" appears in 3 transactions). Note that patterns like `('Bread', 'Beer')` are detected even when they don't appear adjacent in transactions - they just need to appear in order.
|
|
439
440
|
- The **third dictionary** contains 3-item sequential patterns (e.g., `('Bread', 'Milk', 'Diaper'): 2` means the
|
|
440
441
|
sequence "Bread → Milk → Diaper" appears in 2 transactions).
|
|
441
442
|
|
|
442
443
|
> [!NOTE]
|
|
443
|
-
> The **support** of a sequence is calculated as the fraction of transactions containing the sequence, e.g.,
|
|
444
|
-
`
|
|
444
|
+
> The **support** of a sequence is calculated as the fraction of transactions containing the sequence **in order** (not necessarily contiguously), e.g.,
|
|
445
|
+
`('Bread', 'Milk')` appears in 3 out of 5 transactions → Support = `3 / 5 = 0.6` (60%).
|
|
445
446
|
> This insight helps identify frequently occurring sequential patterns in datasets, such as shopping trends or user
|
|
446
447
|
> behavior.
|
|
447
448
|
|
|
449
|
+
> [!IMPORTANT]
|
|
450
|
+
> **Non-contiguous (ordered) matching**: GSP-Py detects patterns where items appear in the specified order but not necessarily adjacent. For example, the pattern `('Bread', 'Beer')` matches the transaction `['Bread', 'Milk', 'Diaper', 'Beer']` because Bread appears before Beer, even though they are not adjacent. This follows the standard GSP algorithm semantics for sequential pattern mining.
|
|
451
|
+
|
|
452
|
+
### Understanding Non-Contiguous Pattern Matching
|
|
453
|
+
|
|
454
|
+
GSP-Py follows the standard GSP algorithm semantics by detecting **ordered (non-contiguous)** subsequences. This means:
|
|
455
|
+
|
|
456
|
+
- ✅ **Order matters**: Items must appear in the specified sequence order
|
|
457
|
+
- ✅ **Gaps allowed**: Items don't need to be adjacent
|
|
458
|
+
- ❌ **Wrong order rejected**: Items appearing in different order won't match
|
|
459
|
+
|
|
460
|
+
**Example:**
|
|
461
|
+
|
|
462
|
+
```python
|
|
463
|
+
from gsppy.gsp import GSP
|
|
464
|
+
|
|
465
|
+
sequences = [
|
|
466
|
+
['a', 'b', 'c'], # Contains: (a,b), (a,c), (b,c), (a,b,c)
|
|
467
|
+
['a', 'c'], # Contains: (a,c)
|
|
468
|
+
['b', 'c', 'a'], # Contains: (b,c), (b,a), (c,a)
|
|
469
|
+
['a', 'b', 'c', 'd'], # Contains: (a,b), (a,c), (a,d), (b,c), (b,d), (c,d), etc.
|
|
470
|
+
]
|
|
471
|
+
|
|
472
|
+
gsp = GSP(sequences)
|
|
473
|
+
result = gsp.search(min_support=0.5) # Need at least 2/4 sequences
|
|
474
|
+
|
|
475
|
+
# Pattern ('a', 'c') is found with support=3 because:
|
|
476
|
+
# - It appears in ['a', 'b', 'c'] (with 'b' in between)
|
|
477
|
+
# - It appears in ['a', 'c'] (adjacent)
|
|
478
|
+
# - It appears in ['a', 'b', 'c', 'd'] (with 'b' in between)
|
|
479
|
+
# Total: 3 out of 4 sequences = 75% support ✅
|
|
480
|
+
```
|
|
481
|
+
|
|
448
482
|
|
|
449
483
|
> [!TIP]
|
|
450
484
|
> For more complex examples, find example scripts in the [`tests`](tests) folder.
|
|
@@ -44,14 +44,15 @@ principles**. Using support thresholds, GSP identifies frequent sequences of ite
|
|
|
44
44
|
|
|
45
45
|
### Key Features:
|
|
46
46
|
|
|
47
|
+
- **Ordered (non-contiguous) matching**: Detects patterns where items appear in order but not necessarily adjacent, following standard GSP semantics. For example, the pattern `('A', 'C')` is found in the sequence `['A', 'B', 'C']`.
|
|
47
48
|
- **Support-based pruning**: Only retains sequences that meet the minimum support threshold.
|
|
48
49
|
- **Candidate generation**: Iteratively generates candidate sequences of increasing length.
|
|
49
50
|
- **General-purpose**: Useful in retail, web analytics, social networks, temporal sequence mining, and more.
|
|
50
51
|
|
|
51
52
|
For example:
|
|
52
53
|
|
|
53
|
-
- In a shopping dataset, GSP can identify patterns like "Customers who buy bread and milk often purchase diapers next."
|
|
54
|
-
- In a website clickstream, GSP might find patterns like "Users visit A, then go to
|
|
54
|
+
- In a shopping dataset, GSP can identify patterns like "Customers who buy bread and milk often purchase diapers next" - even if other items appear between bread and milk.
|
|
55
|
+
- In a website clickstream, GSP might find patterns like "Users visit A, then eventually go to C" - capturing user journeys with intermediate steps.
|
|
55
56
|
|
|
56
57
|
---
|
|
57
58
|
|
|
@@ -367,24 +368,57 @@ Sample Output:
|
|
|
367
368
|
```python
|
|
368
369
|
[
|
|
369
370
|
{('Bread',): 4, ('Milk',): 4, ('Diaper',): 4, ('Beer',): 3, ('Coke',): 2},
|
|
370
|
-
{('Bread', 'Milk'): 3, ('Milk', 'Diaper'): 3, ('Diaper', 'Beer'): 3},
|
|
371
|
-
{('Bread', 'Milk', 'Diaper'): 2, ('Milk', 'Diaper', 'Beer'): 2}
|
|
371
|
+
{('Bread', 'Milk'): 3, ('Bread', 'Diaper'): 3, ('Bread', 'Beer'): 2, ('Milk', 'Diaper'): 3, ('Milk', 'Beer'): 2, ('Milk', 'Coke'): 2, ('Diaper', 'Beer'): 3, ('Diaper', 'Coke'): 2},
|
|
372
|
+
{('Bread', 'Milk', 'Diaper'): 2, ('Bread', 'Diaper', 'Beer'): 2, ('Milk', 'Diaper', 'Beer'): 2, ('Milk', 'Diaper', 'Coke'): 2}
|
|
372
373
|
]
|
|
373
374
|
```
|
|
374
375
|
|
|
375
376
|
- The **first dictionary** contains single-item sequences with their frequencies (e.g., `('Bread',): 4` means "Bread"
|
|
376
377
|
appears in 4 transactions).
|
|
377
378
|
- The **second dictionary** contains 2-item sequential patterns (e.g., `('Bread', 'Milk'): 3` means the sequence "
|
|
378
|
-
Bread → Milk" appears in 3 transactions).
|
|
379
|
+
Bread → Milk" appears in 3 transactions). Note that patterns like `('Bread', 'Beer')` are detected even when they don't appear adjacent in transactions - they just need to appear in order.
|
|
379
380
|
- The **third dictionary** contains 3-item sequential patterns (e.g., `('Bread', 'Milk', 'Diaper'): 2` means the
|
|
380
381
|
sequence "Bread → Milk → Diaper" appears in 2 transactions).
|
|
381
382
|
|
|
382
383
|
> [!NOTE]
|
|
383
|
-
> The **support** of a sequence is calculated as the fraction of transactions containing the sequence, e.g.,
|
|
384
|
-
`
|
|
384
|
+
> The **support** of a sequence is calculated as the fraction of transactions containing the sequence **in order** (not necessarily contiguously), e.g.,
|
|
385
|
+
`('Bread', 'Milk')` appears in 3 out of 5 transactions → Support = `3 / 5 = 0.6` (60%).
|
|
385
386
|
> This insight helps identify frequently occurring sequential patterns in datasets, such as shopping trends or user
|
|
386
387
|
> behavior.
|
|
387
388
|
|
|
389
|
+
> [!IMPORTANT]
|
|
390
|
+
> **Non-contiguous (ordered) matching**: GSP-Py detects patterns where items appear in the specified order but not necessarily adjacent. For example, the pattern `('Bread', 'Beer')` matches the transaction `['Bread', 'Milk', 'Diaper', 'Beer']` because Bread appears before Beer, even though they are not adjacent. This follows the standard GSP algorithm semantics for sequential pattern mining.
|
|
391
|
+
|
|
392
|
+
### Understanding Non-Contiguous Pattern Matching
|
|
393
|
+
|
|
394
|
+
GSP-Py follows the standard GSP algorithm semantics by detecting **ordered (non-contiguous)** subsequences. This means:
|
|
395
|
+
|
|
396
|
+
- ✅ **Order matters**: Items must appear in the specified sequence order
|
|
397
|
+
- ✅ **Gaps allowed**: Items don't need to be adjacent
|
|
398
|
+
- ❌ **Wrong order rejected**: Items appearing in different order won't match
|
|
399
|
+
|
|
400
|
+
**Example:**
|
|
401
|
+
|
|
402
|
+
```python
|
|
403
|
+
from gsppy.gsp import GSP
|
|
404
|
+
|
|
405
|
+
sequences = [
|
|
406
|
+
['a', 'b', 'c'], # Contains: (a,b), (a,c), (b,c), (a,b,c)
|
|
407
|
+
['a', 'c'], # Contains: (a,c)
|
|
408
|
+
['b', 'c', 'a'], # Contains: (b,c), (b,a), (c,a)
|
|
409
|
+
['a', 'b', 'c', 'd'], # Contains: (a,b), (a,c), (a,d), (b,c), (b,d), (c,d), etc.
|
|
410
|
+
]
|
|
411
|
+
|
|
412
|
+
gsp = GSP(sequences)
|
|
413
|
+
result = gsp.search(min_support=0.5) # Need at least 2/4 sequences
|
|
414
|
+
|
|
415
|
+
# Pattern ('a', 'c') is found with support=3 because:
|
|
416
|
+
# - It appears in ['a', 'b', 'c'] (with 'b' in between)
|
|
417
|
+
# - It appears in ['a', 'c'] (adjacent)
|
|
418
|
+
# - It appears in ['a', 'b', 'c', 'd'] (with 'b' in between)
|
|
419
|
+
# Total: 3 out of 4 sequences = 75% support ✅
|
|
420
|
+
```
|
|
421
|
+
|
|
388
422
|
|
|
389
423
|
> [!TIP]
|
|
390
424
|
> For more complex examples, find example scripts in the [`tests`](tests) folder.
|
|
@@ -5,14 +5,14 @@ and generating candidate patterns from previously frequent patterns.
|
|
|
5
5
|
|
|
6
6
|
The key functionalities include:
|
|
7
7
|
1. Splitting a list of items into smaller batches for easier processing.
|
|
8
|
-
2. Checking for the existence of
|
|
8
|
+
2. Checking for the existence of an ordered (non-contiguous) subsequence within a sequence,
|
|
9
9
|
with caching to optimize repeated comparisons.
|
|
10
10
|
3. Generating candidate patterns from a dictionary of frequent patterns
|
|
11
11
|
to support pattern generation tasks in algorithms like sequence mining.
|
|
12
12
|
|
|
13
13
|
Main functionalities:
|
|
14
14
|
- `split_into_batches`: Splits a list of items into smaller batches based on a specified batch size.
|
|
15
|
-
- `is_subsequence_in_list`: Determines if a subsequence exists within another sequence,
|
|
15
|
+
- `is_subsequence_in_list`: Determines if a subsequence exists within another sequence in order,
|
|
16
16
|
using caching to improve performance.
|
|
17
17
|
- `generate_candidates_from_previous`: Generates candidate patterns by joining previously
|
|
18
18
|
identified frequent patterns.
|
|
@@ -46,7 +46,10 @@ def split_into_batches(
|
|
|
46
46
|
@lru_cache(maxsize=None)
|
|
47
47
|
def is_subsequence_in_list(subsequence: Tuple[str, ...], sequence: Tuple[str, ...]) -> bool:
|
|
48
48
|
"""
|
|
49
|
-
Check if a subsequence exists within a sequence as
|
|
49
|
+
Check if a subsequence exists within a sequence as an ordered (non-contiguous) subsequence.
|
|
50
|
+
|
|
51
|
+
This function implements the standard GSP semantics where items in the subsequence
|
|
52
|
+
must appear in the same order in the sequence, but not necessarily contiguously.
|
|
50
53
|
|
|
51
54
|
Parameters:
|
|
52
55
|
subsequence: (tuple): The sequence to search for.
|
|
@@ -54,6 +57,14 @@ def is_subsequence_in_list(subsequence: Tuple[str, ...], sequence: Tuple[str, ..
|
|
|
54
57
|
|
|
55
58
|
Returns:
|
|
56
59
|
bool: True if the subsequence is found, False otherwise.
|
|
60
|
+
|
|
61
|
+
Examples:
|
|
62
|
+
>>> is_subsequence_in_list(('a', 'c'), ('a', 'b', 'c'))
|
|
63
|
+
True
|
|
64
|
+
>>> is_subsequence_in_list(('a', 'c'), ('c', 'a'))
|
|
65
|
+
False
|
|
66
|
+
>>> is_subsequence_in_list(('a', 'b'), ('a', 'b', 'c'))
|
|
67
|
+
True
|
|
57
68
|
"""
|
|
58
69
|
# Handle the case where the subsequence is empty - it should not exist in any sequence
|
|
59
70
|
if not subsequence:
|
|
@@ -61,12 +72,18 @@ def is_subsequence_in_list(subsequence: Tuple[str, ...], sequence: Tuple[str, ..
|
|
|
61
72
|
|
|
62
73
|
len_sub, len_seq = len(subsequence), len(sequence)
|
|
63
74
|
|
|
64
|
-
# Return False if the
|
|
75
|
+
# Return False if the subsequence is longer than the sequence
|
|
65
76
|
if len_sub > len_seq:
|
|
66
77
|
return False
|
|
67
78
|
|
|
68
|
-
# Use
|
|
69
|
-
|
|
79
|
+
# Use two-pointer approach to check if subsequence exists in order
|
|
80
|
+
sub_idx = 0
|
|
81
|
+
for seq_idx in range(len_seq):
|
|
82
|
+
if sequence[seq_idx] == subsequence[sub_idx]:
|
|
83
|
+
sub_idx += 1
|
|
84
|
+
if sub_idx == len_sub:
|
|
85
|
+
return True
|
|
86
|
+
return False
|
|
70
87
|
|
|
71
88
|
|
|
72
89
|
def generate_candidates_from_previous(prev_patterns: Dict[Tuple[str, ...], int]) -> List[Tuple[str, ...]]:
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "gsppy"
|
|
7
|
-
version = "3.0.1"
|
|
7
|
+
version = "3.1.1"
|
|
8
8
|
description = "GSP (Generalized Sequence Pattern) algorithm in Python"
|
|
9
9
|
keywords = ["GSP", "sequential patterns", "data analysis", "sequence mining"]
|
|
10
10
|
license = { file = "LICENSE" }
|
|
@@ -39,20 +39,20 @@ gsppy = "gsppy.cli:main"
|
|
|
39
39
|
|
|
40
40
|
[project.optional-dependencies]
|
|
41
41
|
dev = [
|
|
42
|
-
"cython==3.1.
|
|
43
|
-
"hatch==1.
|
|
42
|
+
"cython==3.1.4",
|
|
43
|
+
"hatch==1.15.1",
|
|
44
44
|
"hatchling==1.27.0",
|
|
45
|
-
"mypy==1.18.
|
|
46
|
-
"pylint==
|
|
47
|
-
"pyright==1.1.
|
|
45
|
+
"mypy==1.18.2",
|
|
46
|
+
"pylint==4.0.2",
|
|
47
|
+
"pyright==1.1.406",
|
|
48
48
|
"pytest==8.4.2",
|
|
49
49
|
"pytest-benchmark==5.1.0",
|
|
50
50
|
"pytest-cov==7.0.0",
|
|
51
|
-
"ruff==0.13.
|
|
52
|
-
"tox==4.
|
|
51
|
+
"ruff==0.13.3",
|
|
52
|
+
"tox==4.32.0",
|
|
53
53
|
]
|
|
54
54
|
rust = [
|
|
55
|
-
"maturin==1.9.
|
|
55
|
+
"maturin==1.9.6"
|
|
56
56
|
]
|
|
57
57
|
gpu = [
|
|
58
58
|
"cupy>=11,<14"
|
|
@@ -168,13 +168,28 @@ def test_frequent_patterns(supermarket_transactions: List[List[str]]) -> None:
|
|
|
168
168
|
|
|
169
169
|
Asserts:
|
|
170
170
|
- The frequent patterns should match the expected result.
|
|
171
|
+
- Non-contiguous patterns are correctly detected.
|
|
171
172
|
"""
|
|
172
173
|
gsp = GSP(supermarket_transactions)
|
|
173
174
|
result = gsp.search(min_support=0.3)
|
|
174
175
|
expected = [
|
|
175
176
|
{("Bread",): 4, ("Milk",): 4, ("Diaper",): 4, ("Beer",): 3, ("Coke",): 2},
|
|
176
|
-
{
|
|
177
|
-
|
|
177
|
+
{
|
|
178
|
+
("Bread", "Milk"): 3,
|
|
179
|
+
("Bread", "Diaper"): 3,
|
|
180
|
+
("Bread", "Beer"): 2,
|
|
181
|
+
("Milk", "Diaper"): 3,
|
|
182
|
+
("Milk", "Beer"): 2,
|
|
183
|
+
("Milk", "Coke"): 2,
|
|
184
|
+
("Diaper", "Beer"): 3,
|
|
185
|
+
("Diaper", "Coke"): 2,
|
|
186
|
+
},
|
|
187
|
+
{
|
|
188
|
+
("Bread", "Milk", "Diaper"): 2,
|
|
189
|
+
("Bread", "Diaper", "Beer"): 2,
|
|
190
|
+
("Milk", "Diaper", "Beer"): 2,
|
|
191
|
+
("Milk", "Diaper", "Coke"): 2,
|
|
192
|
+
},
|
|
178
193
|
]
|
|
179
194
|
assert result == expected, "Frequent patterns do not match expected results."
|
|
180
195
|
|
|
@@ -231,6 +246,131 @@ def test_partial_match(supermarket_transactions: List[List[str]]) -> None:
|
|
|
231
246
|
assert result_level_2 >= expected_patterns_level_2, f"Level 2 patterns mismatch. Got {result_level_2}"
|
|
232
247
|
|
|
233
248
|
|
|
249
|
+
def test_non_contiguous_subsequences() -> None:
|
|
250
|
+
"""
|
|
251
|
+
Test the GSP algorithm correctly detects non-contiguous subsequences (Issue #115).
|
|
252
|
+
|
|
253
|
+
This test validates that patterns like ('a', 'c') are detected even when
|
|
254
|
+
they appear with gaps in sequences like ['a', 'b', 'c'].
|
|
255
|
+
|
|
256
|
+
Asserts:
|
|
257
|
+
- Non-contiguous patterns are correctly identified with proper support.
|
|
258
|
+
"""
|
|
259
|
+
sequences = [
|
|
260
|
+
["a", "b", "c"],
|
|
261
|
+
["a", "c"],
|
|
262
|
+
["b", "c", "a"],
|
|
263
|
+
["a", "b", "c", "d"],
|
|
264
|
+
]
|
|
265
|
+
|
|
266
|
+
gsp = GSP(sequences)
|
|
267
|
+
result = gsp.search(min_support=0.5)
|
|
268
|
+
|
|
269
|
+
# Expected: ('a', 'c') should be found with support = 3
|
|
270
|
+
# It appears in: ['a', 'b', 'c'], ['a', 'c'], ['a', 'b', 'c', 'd']
|
|
271
|
+
assert len(result) >= 2, "Expected at least 2 levels of patterns"
|
|
272
|
+
|
|
273
|
+
level_2_patterns = result[1]
|
|
274
|
+
assert ("a", "c") in level_2_patterns, f"Pattern ('a', 'c') not found in level 2. Got {level_2_patterns}"
|
|
275
|
+
assert level_2_patterns[("a", "c")] == 3, f"Expected support 3 for ('a', 'c'), got {level_2_patterns[('a', 'c')]}"
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
def test_contiguous_vs_non_contiguous_patterns() -> None:
|
|
279
|
+
"""
|
|
280
|
+
Comprehensive test demonstrating the difference between contiguous and non-contiguous patterns.
|
|
281
|
+
|
|
282
|
+
This test shows patterns that would ONLY be found in non-contiguous matching (current implementation)
|
|
283
|
+
vs patterns that would be found in BOTH contiguous and non-contiguous matching.
|
|
284
|
+
|
|
285
|
+
The current implementation uses non-contiguous (ordered) matching, which is the standard GSP behavior.
|
|
286
|
+
"""
|
|
287
|
+
sequences = [
|
|
288
|
+
["X", "Y", "Z"], # Contains X->Y, Y->Z, X->Z (contiguous: X->Y, Y->Z only)
|
|
289
|
+
["X", "Z"], # Contains X->Z (contiguous: X->Z)
|
|
290
|
+
["Y", "Z", "X"], # Contains Y->Z, Y->X, Z->X (contiguous: Y->Z, Z->X only)
|
|
291
|
+
["X", "Y", "Z", "W"], # Contains many patterns
|
|
292
|
+
]
|
|
293
|
+
|
|
294
|
+
gsp = GSP(sequences)
|
|
295
|
+
result = gsp.search(min_support=0.5) # Need at least 2/4 sequences
|
|
296
|
+
|
|
297
|
+
# Level 2 patterns
|
|
298
|
+
level_2_patterns = result[1] if len(result) >= 2 else {}
|
|
299
|
+
|
|
300
|
+
# Patterns that would be found in BOTH contiguous and non-contiguous:
|
|
301
|
+
# ('X', 'Y') appears contiguously in: ['X', 'Y', 'Z'], ['X', 'Y', 'Z', 'W']
|
|
302
|
+
# ('Y', 'Z') appears contiguously in: ['X', 'Y', 'Z'], ['Y', 'Z', 'X'], ['X', 'Y', 'Z', 'W']
|
|
303
|
+
assert ("X", "Y") in level_2_patterns, "('X', 'Y') should be found (contiguous in 2 sequences)"
|
|
304
|
+
assert ("Y", "Z") in level_2_patterns, "('Y', 'Z') should be found (contiguous in 3 sequences)"
|
|
305
|
+
|
|
306
|
+
# Pattern that would ONLY be found in non-contiguous matching:
|
|
307
|
+
# ('X', 'Z') appears with gap in: ['X', 'Y', 'Z'], ['X', 'Y', 'Z', 'W']
|
|
308
|
+
# and contiguously in: ['X', 'Z']
|
|
309
|
+
# Total support = 3 (>= 2 threshold)
|
|
310
|
+
assert ("X", "Z") in level_2_patterns, (
|
|
311
|
+
"('X', 'Z') should be found with non-contiguous matching. "
|
|
312
|
+
"This pattern has gaps in some sequences but is still ordered."
|
|
313
|
+
)
|
|
314
|
+
assert level_2_patterns[("X", "Z")] == 3, f"Expected support 3 for ('X', 'Z'), got {level_2_patterns[('X', 'Z')]}"
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
def test_non_contiguous_with_longer_gaps() -> None:
|
|
318
|
+
"""
|
|
319
|
+
Test non-contiguous matching with longer gaps between elements.
|
|
320
|
+
|
|
321
|
+
This demonstrates that the algorithm correctly finds patterns even when
|
|
322
|
+
there are multiple elements between the pattern elements.
|
|
323
|
+
"""
|
|
324
|
+
sequences = [
|
|
325
|
+
["A", "B", "C", "D", "E"], # Contains A->E with 3 elements in between
|
|
326
|
+
["A", "X", "Y", "Z", "E"], # Contains A->E with 3 different elements in between
|
|
327
|
+
["A", "E"], # Contains A->E with no gap
|
|
328
|
+
["E", "A"], # Does NOT contain A->E (wrong order)
|
|
329
|
+
]
|
|
330
|
+
|
|
331
|
+
gsp = GSP(sequences)
|
|
332
|
+
result = gsp.search(min_support=0.5) # Need at least 2/4 sequences
|
|
333
|
+
|
|
334
|
+
# ('A', 'E') should be found with support = 3
|
|
335
|
+
level_2_patterns = result[1] if len(result) >= 2 else {}
|
|
336
|
+
assert ("A", "E") in level_2_patterns, "('A', 'E') should be found despite large gaps"
|
|
337
|
+
assert level_2_patterns[("A", "E")] == 3, f"Expected support 3 for ('A', 'E'), got {level_2_patterns[('A', 'E')]}"
|
|
338
|
+
|
|
339
|
+
# ('E', 'A') should NOT be found (wrong order)
|
|
340
|
+
assert ("E", "A") not in level_2_patterns, "('E', 'A') should not be found (wrong order)"
|
|
341
|
+
|
|
342
|
+
|
|
343
|
+
def test_order_sensitivity() -> None:
|
|
344
|
+
"""
|
|
345
|
+
Test that the algorithm is sensitive to order - patterns must appear in sequence order.
|
|
346
|
+
|
|
347
|
+
This verifies that even with non-contiguous matching, the order of elements matters.
|
|
348
|
+
"""
|
|
349
|
+
sequences = [
|
|
350
|
+
["P", "Q", "R"], # Contains P->Q, P->R, Q->R
|
|
351
|
+
["P", "R", "Q"], # Contains P->R, P->Q, R->Q
|
|
352
|
+
["Q", "P", "R"], # Contains Q->P, Q->R, P->R
|
|
353
|
+
["R", "Q", "P"], # Contains R->Q, R->P, Q->P
|
|
354
|
+
]
|
|
355
|
+
|
|
356
|
+
gsp = GSP(sequences)
|
|
357
|
+
result = gsp.search(min_support=0.5) # Need at least 2/4 sequences
|
|
358
|
+
|
|
359
|
+
level_2_patterns = result[1] if len(result) >= 2 else {}
|
|
360
|
+
|
|
361
|
+
# ('P', 'R') appears in correct order in: ['P', 'Q', 'R'], ['P', 'R', 'Q'], ['Q', 'P', 'R']
|
|
362
|
+
assert ("P", "R") in level_2_patterns, "('P', 'R') should be found (support = 3)"
|
|
363
|
+
assert level_2_patterns[("P", "R")] == 3
|
|
364
|
+
|
|
365
|
+
# ('Q', 'P') appears in correct order in: ['Q', 'P', 'R'], ['R', 'Q', 'P']
|
|
366
|
+
assert ("Q", "P") in level_2_patterns, "('Q', 'P') should be found (support = 2)"
|
|
367
|
+
assert level_2_patterns[("Q", "P")] == 2
|
|
368
|
+
|
|
369
|
+
# ('R', 'P') appears in correct order in: ['R', 'Q', 'P']
|
|
370
|
+
# Support = 1, below threshold of 2
|
|
371
|
+
assert ("R", "P") not in level_2_patterns, "('R', 'P') should not be found (support = 1, below threshold)"
|
|
372
|
+
|
|
373
|
+
|
|
234
374
|
@pytest.mark.parametrize("min_support", [0.1, 0.2, 0.3, 0.4, 0.5])
|
|
235
375
|
def test_benchmark(benchmark: BenchmarkFixture, supermarket_transactions: List[List[str]], min_support: float) -> None:
|
|
236
376
|
"""
|
|
@@ -45,13 +45,19 @@ def test_is_subsequence_in_list():
|
|
|
45
45
|
"""
|
|
46
46
|
Test the `is_subsequence_in_list` utility function.
|
|
47
47
|
"""
|
|
48
|
-
# Test when the subsequence is present
|
|
49
|
-
assert is_subsequence_in_list((1, 2), (0, 1, 2, 3)), "Failed to find subsequence"
|
|
48
|
+
# Test when the subsequence is present (contiguous)
|
|
49
|
+
assert is_subsequence_in_list((1, 2), (0, 1, 2, 3)), "Failed to find contiguous subsequence"
|
|
50
50
|
assert is_subsequence_in_list((3,), (0, 1, 2, 3)), "Failed single-element subsequence"
|
|
51
51
|
|
|
52
|
-
# Test when the subsequence is
|
|
53
|
-
assert
|
|
52
|
+
# Test when the subsequence is present (non-contiguous)
|
|
53
|
+
assert is_subsequence_in_list((1, 3), (0, 1, 2, 3)), "Failed to find non-contiguous subsequence"
|
|
54
|
+
assert is_subsequence_in_list((0, 2), (0, 1, 2, 3)), "Failed to find non-contiguous subsequence"
|
|
55
|
+
assert is_subsequence_in_list((0, 3), (0, 1, 2, 3)), "Failed to find non-contiguous subsequence"
|
|
56
|
+
|
|
57
|
+
# Test when the subsequence is not present (wrong order or missing elements)
|
|
58
|
+
assert not is_subsequence_in_list((3, 1), (0, 1, 2, 3)), "Incorrectly found reversed subsequence"
|
|
54
59
|
assert not is_subsequence_in_list((4,), (0, 1, 2, 3)), "Incorrectly found non-existent subsequence"
|
|
60
|
+
assert not is_subsequence_in_list((2, 1), (0, 1, 2, 3)), "Incorrectly found out-of-order subsequence"
|
|
55
61
|
|
|
56
62
|
# Test when input sequence or subsequence is empty
|
|
57
63
|
assert not is_subsequence_in_list((), (0, 1, 2, 3)), "Incorrect positive result for empty subsequence"
|
|
@@ -61,6 +67,64 @@ def test_is_subsequence_in_list():
|
|
|
61
67
|
assert not is_subsequence_in_list((1, 2, 3, 4), (1, 2, 3)), "Failed to reject long subsequence"
|
|
62
68
|
|
|
63
69
|
|
|
70
|
+
def test_is_subsequence_contiguous_vs_non_contiguous():
|
|
71
|
+
"""
|
|
72
|
+
Test cases that demonstrate the difference between contiguous and non-contiguous matching.
|
|
73
|
+
|
|
74
|
+
The current implementation uses non-contiguous (ordered) matching.
|
|
75
|
+
This test documents patterns that would differ between the two approaches.
|
|
76
|
+
"""
|
|
77
|
+
# Pattern that appears with gaps (non-contiguous)
|
|
78
|
+
# In contiguous mode: would NOT match
|
|
79
|
+
# In non-contiguous mode: DOES match
|
|
80
|
+
assert is_subsequence_in_list(("a", "c"), ("a", "b", "c")), (
|
|
81
|
+
"Non-contiguous: ('a', 'c') should match in ('a', 'b', 'c')"
|
|
82
|
+
)
|
|
83
|
+
assert is_subsequence_in_list(("a", "d"), ("a", "b", "c", "d")), (
|
|
84
|
+
"Non-contiguous: ('a', 'd') should match in ('a', 'b', 'c', 'd')"
|
|
85
|
+
)
|
|
86
|
+
assert is_subsequence_in_list((1, 4), (1, 2, 3, 4, 5)), (
|
|
87
|
+
"Non-contiguous: (1, 4) should match in (1, 2, 3, 4, 5)"
|
|
88
|
+
)
|
|
89
|
+
|
|
90
|
+
# Pattern that appears contiguously (would match in both modes)
|
|
91
|
+
assert is_subsequence_in_list(("a", "b"), ("a", "b", "c")), (
|
|
92
|
+
"Contiguous: ('a', 'b') should match in ('a', 'b', 'c')"
|
|
93
|
+
)
|
|
94
|
+
assert is_subsequence_in_list((2, 3), (1, 2, 3, 4)), (
|
|
95
|
+
"Contiguous: (2, 3) should match in (1, 2, 3, 4)"
|
|
96
|
+
)
|
|
97
|
+
|
|
98
|
+
# Pattern with wrong order (would NOT match in either mode)
|
|
99
|
+
assert not is_subsequence_in_list(("c", "a"), ("a", "b", "c")), (
|
|
100
|
+
"Wrong order: ('c', 'a') should NOT match in ('a', 'b', 'c')"
|
|
101
|
+
)
|
|
102
|
+
assert not is_subsequence_in_list((3, 1), (1, 2, 3, 4)), (
|
|
103
|
+
"Wrong order: (3, 1) should NOT match in (1, 2, 3, 4)"
|
|
104
|
+
)
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def test_is_subsequence_with_gaps():
|
|
108
|
+
"""
|
|
109
|
+
Test non-contiguous matching with various gap sizes.
|
|
110
|
+
"""
|
|
111
|
+
# Small gap
|
|
112
|
+
assert is_subsequence_in_list(("x", "z"), ("x", "y", "z")), "Failed with 1 element gap"
|
|
113
|
+
|
|
114
|
+
# Medium gap
|
|
115
|
+
assert is_subsequence_in_list(("a", "e"), ("a", "b", "c", "d", "e")), "Failed with 3 element gap"
|
|
116
|
+
|
|
117
|
+
# Large gap
|
|
118
|
+
assert is_subsequence_in_list((1, 10), (1, 2, 3, 4, 5, 6, 7, 8, 9, 10)), "Failed with 8 element gap"
|
|
119
|
+
|
|
120
|
+
# Multiple gaps in longer pattern
|
|
121
|
+
assert is_subsequence_in_list((1, 3, 5), (1, 2, 3, 4, 5)), "Failed with multiple gaps"
|
|
122
|
+
assert is_subsequence_in_list(("a", "c", "e"), ("a", "b", "c", "d", "e")), "Failed with multiple gaps"
|
|
123
|
+
|
|
124
|
+
# No gap (adjacent elements still work)
|
|
125
|
+
assert is_subsequence_in_list((1, 2), (1, 2, 3)), "Failed with no gap (contiguous)"
|
|
126
|
+
|
|
127
|
+
|
|
64
128
|
def test_generate_candidates_from_previous():
|
|
65
129
|
"""
|
|
66
130
|
Test the `generate_candidates_from_previous` utility function.
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|