gsppy 3.0.1__py3-none-any.whl → 3.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gsppy/utils.py CHANGED
@@ -5,14 +5,14 @@ and generating candidate patterns from previously frequent patterns.
5
5
 
6
6
  The key functionalities include:
7
7
  1. Splitting a list of items into smaller batches for easier processing.
8
- 2. Checking for the existence of a contiguous subsequence within a sequence,
8
+ 2. Checking for the existence of an ordered (not necessarily contiguous) subsequence within a sequence,
9
9
  with caching to optimize repeated comparisons.
10
10
  3. Generating candidate patterns from a dictionary of frequent patterns
11
11
  to support pattern generation tasks in algorithms like sequence mining.
12
12
 
13
13
  Main functionalities:
14
14
  - `split_into_batches`: Splits a list of items into smaller batches based on a specified batch size.
15
- - `is_subsequence_in_list`: Determines if a subsequence exists within another sequence,
15
+ - `is_subsequence_in_list`: Determines if a subsequence exists within another sequence in order,
16
16
  using caching to improve performance.
17
17
  - `generate_candidates_from_previous`: Generates candidate patterns by joining previously
18
18
  identified frequent patterns.
@@ -46,7 +46,10 @@ def split_into_batches(
46
46
  @lru_cache(maxsize=None)
47
47
  def is_subsequence_in_list(subsequence: Tuple[str, ...], sequence: Tuple[str, ...]) -> bool:
48
48
  """
49
- Check if a subsequence exists within a sequence as a contiguous subsequence.
49
+ Check if a subsequence exists within a sequence as an ordered (non-contiguous) subsequence.
50
+
51
+ This function implements the standard GSP semantics where items in the subsequence
52
+ must appear in the same order in the sequence, but not necessarily contiguously.
50
53
 
51
54
  Parameters:
52
55
  subsequence: (tuple): The sequence to search for.
@@ -54,6 +57,14 @@ def is_subsequence_in_list(subsequence: Tuple[str, ...], sequence: Tuple[str, ..
54
57
 
55
58
  Returns:
56
59
  bool: True if the subsequence is found, False otherwise.
60
+
61
+ Examples:
62
+ >>> is_subsequence_in_list(('a', 'c'), ('a', 'b', 'c'))
63
+ True
64
+ >>> is_subsequence_in_list(('a', 'c'), ('c', 'a'))
65
+ False
66
+ >>> is_subsequence_in_list(('a', 'b'), ('a', 'b', 'c'))
67
+ True
57
68
  """
58
69
  # Handle the case where the subsequence is empty - it should not exist in any sequence
59
70
  if not subsequence:
@@ -61,12 +72,18 @@ def is_subsequence_in_list(subsequence: Tuple[str, ...], sequence: Tuple[str, ..
61
72
 
62
73
  len_sub, len_seq = len(subsequence), len(sequence)
63
74
 
64
- # Return False if the sequence is longer than the list
75
+ # Return False if the subsequence is longer than the sequence
65
76
  if len_sub > len_seq:
66
77
  return False
67
78
 
68
- # Use any to check if any slice matches the sequence
69
- return any(sequence[i : i + len_sub] == subsequence for i in range(len_seq - len_sub + 1))
79
+ # Use two-pointer approach to check if subsequence exists in order
80
+ sub_idx = 0
81
+ for seq_idx in range(len_seq):
82
+ if sequence[seq_idx] == subsequence[sub_idx]:
83
+ sub_idx += 1
84
+ if sub_idx == len_sub:
85
+ return True
86
+ return False
70
87
 
71
88
 
72
89
  def generate_candidates_from_previous(prev_patterns: Dict[Tuple[str, ...], int]) -> List[Tuple[str, ...]]:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gsppy
3
- Version: 3.0.1
3
+ Version: 3.1.1
4
4
  Summary: GSP (Generalized Sequence Pattern) algorithm in Python
5
5
  Project-URL: Homepage, https://github.com/jacksonpradolima/gsp-py
6
6
  Author-email: Jackson Antonio do Prado Lima <jacksonpradolima@gmail.com>
@@ -41,21 +41,21 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
41
41
  Requires-Python: >=3.10
42
42
  Requires-Dist: click>=8.0.0
43
43
  Provides-Extra: dev
44
- Requires-Dist: cython==3.1.3; extra == 'dev'
45
- Requires-Dist: hatch==1.14.1; extra == 'dev'
44
+ Requires-Dist: cython==3.1.4; extra == 'dev'
45
+ Requires-Dist: hatch==1.15.1; extra == 'dev'
46
46
  Requires-Dist: hatchling==1.27.0; extra == 'dev'
47
- Requires-Dist: mypy==1.18.1; extra == 'dev'
48
- Requires-Dist: pylint==3.3.8; extra == 'dev'
49
- Requires-Dist: pyright==1.1.405; extra == 'dev'
47
+ Requires-Dist: mypy==1.18.2; extra == 'dev'
48
+ Requires-Dist: pylint==4.0.2; extra == 'dev'
49
+ Requires-Dist: pyright==1.1.406; extra == 'dev'
50
50
  Requires-Dist: pytest-benchmark==5.1.0; extra == 'dev'
51
51
  Requires-Dist: pytest-cov==7.0.0; extra == 'dev'
52
52
  Requires-Dist: pytest==8.4.2; extra == 'dev'
53
- Requires-Dist: ruff==0.13.0; extra == 'dev'
54
- Requires-Dist: tox==4.30.2; extra == 'dev'
53
+ Requires-Dist: ruff==0.13.3; extra == 'dev'
54
+ Requires-Dist: tox==4.32.0; extra == 'dev'
55
55
  Provides-Extra: gpu
56
56
  Requires-Dist: cupy<14,>=11; extra == 'gpu'
57
57
  Provides-Extra: rust
58
- Requires-Dist: maturin==1.9.4; extra == 'rust'
58
+ Requires-Dist: maturin==1.9.6; extra == 'rust'
59
59
  Description-Content-Type: text/markdown
60
60
 
61
61
  [![PyPI License](https://img.shields.io/pypi/l/gsppy.svg?style=flat-square)]()
@@ -104,14 +104,15 @@ principles**. Using support thresholds, GSP identifies frequent sequences of ite
104
104
 
105
105
  ### Key Features:
106
106
 
107
+ - **Ordered (not necessarily contiguous) matching**: Detects patterns whose items appear in order but are not necessarily adjacent, following standard GSP semantics. For example, the pattern `('A', 'C')` is found in the sequence `['A', 'B', 'C']`.
107
108
  - **Support-based pruning**: Only retains sequences that meet the minimum support threshold.
108
109
  - **Candidate generation**: Iteratively generates candidate sequences of increasing length.
109
110
  - **General-purpose**: Useful in retail, web analytics, social networks, temporal sequence mining, and more.
110
111
 
111
112
  For example:
112
113
 
113
- - In a shopping dataset, GSP can identify patterns like "Customers who buy bread and milk often purchase diapers next."
114
- - In a website clickstream, GSP might find patterns like "Users visit A, then go to B, and later proceed to C."
114
+ - In a shopping dataset, GSP can identify patterns like "Customers who buy bread and then milk often purchase diapers later" - even if other items are bought in between.
115
+ - In a website clickstream, GSP might find patterns like "Users visit A, then eventually go to C" - capturing user journeys with intermediate steps.
115
116
 
116
117
  ---
117
118
 
@@ -427,24 +428,57 @@ Sample Output:
427
428
  ```python
428
429
  [
429
430
  {('Bread',): 4, ('Milk',): 4, ('Diaper',): 4, ('Beer',): 3, ('Coke',): 2},
430
- {('Bread', 'Milk'): 3, ('Milk', 'Diaper'): 3, ('Diaper', 'Beer'): 3},
431
- {('Bread', 'Milk', 'Diaper'): 2, ('Milk', 'Diaper', 'Beer'): 2}
431
+ {('Bread', 'Milk'): 3, ('Bread', 'Diaper'): 3, ('Bread', 'Beer'): 2, ('Milk', 'Diaper'): 3, ('Milk', 'Beer'): 2, ('Milk', 'Coke'): 2, ('Diaper', 'Beer'): 3, ('Diaper', 'Coke'): 2},
432
+ {('Bread', 'Milk', 'Diaper'): 2, ('Bread', 'Diaper', 'Beer'): 2, ('Milk', 'Diaper', 'Beer'): 2, ('Milk', 'Diaper', 'Coke'): 2}
432
433
  ]
433
434
  ```
434
435
 
435
436
  - The **first dictionary** contains single-item sequences with their frequencies (e.g., `('Bread',): 4` means "Bread"
436
437
  appears in 4 transactions).
437
438
  - The **second dictionary** contains 2-item sequential patterns (e.g., `('Bread', 'Milk'): 3` means the sequence "
438
- Bread → Milk" appears in 3 transactions).
439
+ Bread → Milk" appears in 3 transactions). Note that patterns like `('Bread', 'Beer')` are detected even when their items are not adjacent in a transaction - the items only need to appear in order.
439
440
  - The **third dictionary** contains 3-item sequential patterns (e.g., `('Bread', 'Milk', 'Diaper'): 2` means the
440
441
  sequence "Bread → Milk → Diaper" appears in 2 transactions).
441
442
 
442
443
  > [!NOTE]
443
- > The **support** of a sequence is calculated as the fraction of transactions containing the sequence, e.g.,
444
- `[Bread, Milk]` appears in 3 out of 5 transactions → Support = `3 / 5 = 0.6` (60%).
444
+ > The **support** of a sequence is calculated as the fraction of transactions containing the sequence **in order** (not necessarily contiguously), e.g.,
445
+ > `('Bread', 'Milk')` appears in 3 out of 5 transactions → Support = `3 / 5 = 0.6` (60%).
445
446
  > This insight helps identify frequently occurring sequential patterns in datasets, such as shopping trends or user
446
447
  > behavior.
447
448
 
449
+ > [!IMPORTANT]
450
+ > **Non-contiguous (ordered) matching**: GSP-Py detects patterns where items appear in the specified order but not necessarily adjacent. For example, the pattern `('Bread', 'Beer')` matches the transaction `['Bread', 'Milk', 'Diaper', 'Beer']` because Bread appears before Beer, even though they are not adjacent. This follows the standard GSP algorithm semantics for sequential pattern mining.
451
+
452
+ ### Understanding Non-Contiguous Pattern Matching
453
+
454
+ GSP-Py follows the standard GSP algorithm semantics by detecting **ordered (non-contiguous)** subsequences. This means:
455
+
456
+ - ✅ **Order matters**: Items must appear in the specified sequence order
457
+ - ✅ **Gaps allowed**: Items don't need to be adjacent
458
+ - ❌ **Wrong order rejected**: Items appearing in a different order won't match
459
+
460
+ **Example:**
461
+
462
+ ```python
463
+ from gsppy.gsp import GSP
464
+
465
+ sequences = [
466
+ ['a', 'b', 'c'], # Contains: (a,b), (a,c), (b,c), (a,b,c)
467
+ ['a', 'c'], # Contains: (a,c)
468
+ ['b', 'c', 'a'], # Contains: (b,c), (b,a), (c,a)
469
+ ['a', 'b', 'c', 'd'], # Contains: (a,b), (a,c), (a,d), (b,c), (b,d), (c,d), etc.
470
+ ]
471
+
472
+ gsp = GSP(sequences)
473
+ result = gsp.search(min_support=0.5) # Need at least 2/4 sequences
474
+
475
+ # Pattern ('a', 'c') is found with support=3 because:
476
+ # - It appears in ['a', 'b', 'c'] (with 'b' in between)
477
+ # - It appears in ['a', 'c'] (adjacent)
478
+ # - It appears in ['a', 'b', 'c', 'd'] (with 'b' in between)
479
+ # Total: 3 out of 4 sequences = 75% support ✅
480
+ ```
481
+
448
482
 
449
483
  > [!TIP]
450
484
  > For more complex examples, find example scripts in the [`gsppy/tests`](gsppy/tests) folder.
@@ -0,0 +1,10 @@
1
+ gsppy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ gsppy/accelerate.py,sha256=YO3YQFzo2VAC6IXOTnQnOajkZO7SabkieGb1IPgWdSI,10407
3
+ gsppy/cli.py,sha256=wsGoc_utxpRfgCF9vPOAyLDTOJZ8NaiwiUny5VyIYvQ,6567
4
+ gsppy/gsp.py,sha256=GCHFhOu-DyHEPsse_OXzf9IaZoigF8ouRqgn_OsZBvA,14855
5
+ gsppy/utils.py,sha256=KtjfDgsTwvwxIyA2KCQmgu8cFkBqQvMZN8Ct5NB60Tc,3952
6
+ gsppy-3.1.1.dist-info/METADATA,sha256=uN-rN-CzsrwW_uh4s60DUevIKjm5CuiYyHRh5cgyKqQ,19819
7
+ gsppy-3.1.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
8
+ gsppy-3.1.1.dist-info/entry_points.txt,sha256=smvmcIWk424ARIGKOC_BM42hpT_SptKPcIeqs-8u8lM,41
9
+ gsppy-3.1.1.dist-info/licenses/LICENSE,sha256=AlXanKSqFzo_o-87gp3Qw3XzbmnfxYy7O0xJOcQGWJo,1086
10
+ gsppy-3.1.1.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: hatchling 1.27.0
2
+ Generator: hatchling 1.28.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
@@ -1,10 +0,0 @@
1
- gsppy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- gsppy/accelerate.py,sha256=YO3YQFzo2VAC6IXOTnQnOajkZO7SabkieGb1IPgWdSI,10407
3
- gsppy/cli.py,sha256=wsGoc_utxpRfgCF9vPOAyLDTOJZ8NaiwiUny5VyIYvQ,6567
4
- gsppy/gsp.py,sha256=GCHFhOu-DyHEPsse_OXzf9IaZoigF8ouRqgn_OsZBvA,14855
5
- gsppy/utils.py,sha256=YlV0F64lnd2Xymf6XnYr6mMLYWV2f2yjaHkZbAS1Qs0,3362
6
- gsppy-3.0.1.dist-info/METADATA,sha256=vt35btl69hnEM4R1Kz-U5m-2MHEN5hdkHBk-uJ2eKAw,17670
7
- gsppy-3.0.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
8
- gsppy-3.0.1.dist-info/entry_points.txt,sha256=smvmcIWk424ARIGKOC_BM42hpT_SptKPcIeqs-8u8lM,41
9
- gsppy-3.0.1.dist-info/licenses/LICENSE,sha256=AlXanKSqFzo_o-87gp3Qw3XzbmnfxYy7O0xJOcQGWJo,1086
10
- gsppy-3.0.1.dist-info/RECORD,,