gsppy-4.0.0-py3-none-any.whl → gsppy-4.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gsppy/__init__.py +10 -0
- gsppy/cli.py +2 -2
- gsppy/gsp.py +71 -7
- gsppy/sequence.py +371 -0
- {gsppy-4.0.0.dist-info → gsppy-4.1.0.dist-info}/METADATA +77 -1
- {gsppy-4.0.0.dist-info → gsppy-4.1.0.dist-info}/RECORD +9 -8
- {gsppy-4.0.0.dist-info → gsppy-4.1.0.dist-info}/WHEEL +0 -0
- {gsppy-4.0.0.dist-info → gsppy-4.1.0.dist-info}/entry_points.txt +0 -0
- {gsppy-4.0.0.dist-info → gsppy-4.1.0.dist-info}/licenses/LICENSE +0 -0
gsppy/__init__.py
CHANGED
|
@@ -24,6 +24,12 @@ from gsppy.pruning import (
|
|
|
24
24
|
FrequencyBasedPruning,
|
|
25
25
|
create_default_pruning_strategy,
|
|
26
26
|
)
|
|
27
|
+
from gsppy.sequence import (
|
|
28
|
+
Sequence,
|
|
29
|
+
sequences_to_dict,
|
|
30
|
+
dict_to_sequences,
|
|
31
|
+
to_sequence,
|
|
32
|
+
)
|
|
27
33
|
from gsppy.token_mapper import TokenMapper
|
|
28
34
|
|
|
29
35
|
# DataFrame adapters are optional - import only if dependencies are available
|
|
@@ -63,6 +69,10 @@ __all__ = [
|
|
|
63
69
|
"TemporalAwarePruning",
|
|
64
70
|
"CombinedPruning",
|
|
65
71
|
"create_default_pruning_strategy",
|
|
72
|
+
"Sequence",
|
|
73
|
+
"sequences_to_dict",
|
|
74
|
+
"dict_to_sequences",
|
|
75
|
+
"to_sequence",
|
|
66
76
|
"TokenMapper",
|
|
67
77
|
]
|
|
68
78
|
|
gsppy/cli.py
CHANGED
|
@@ -35,7 +35,7 @@ import csv
|
|
|
35
35
|
import sys
|
|
36
36
|
import json
|
|
37
37
|
import logging
|
|
38
|
-
from typing import Any,
|
|
38
|
+
from typing import Any, List, Tuple, Union, Optional, cast
|
|
39
39
|
|
|
40
40
|
import click
|
|
41
41
|
|
|
@@ -608,7 +608,7 @@ def main(
|
|
|
608
608
|
# Initialize and run GSP algorithm
|
|
609
609
|
try:
|
|
610
610
|
gsp = GSP(transactions, mingap=mingap, maxgap=maxgap, maxspan=maxspan, verbose=verbose)
|
|
611
|
-
patterns
|
|
611
|
+
patterns = gsp.search(min_support=min_support, return_sequences=False)
|
|
612
612
|
logger.info("Frequent Patterns Found:")
|
|
613
613
|
for i, level in enumerate(patterns, start=1):
|
|
614
614
|
logger.info(f"\n{i}-Sequence Patterns:")
|
gsppy/gsp.py
CHANGED
|
@@ -90,7 +90,7 @@ from __future__ import annotations
|
|
|
90
90
|
import math
|
|
91
91
|
import logging
|
|
92
92
|
import multiprocessing as mp
|
|
93
|
-
from typing import TYPE_CHECKING, Dict, List, Tuple, Union, Optional, cast
|
|
93
|
+
from typing import TYPE_CHECKING, Dict, List, Tuple, Union, Literal, Optional, cast, overload
|
|
94
94
|
from itertools import chain
|
|
95
95
|
from collections import Counter
|
|
96
96
|
|
|
@@ -102,6 +102,7 @@ from gsppy.utils import (
|
|
|
102
102
|
is_subsequence_in_list_with_time_constraints,
|
|
103
103
|
)
|
|
104
104
|
from gsppy.pruning import PruningStrategy, create_default_pruning_strategy
|
|
105
|
+
from gsppy.sequence import Sequence, dict_to_sequences
|
|
105
106
|
from gsppy.accelerate import support_counts as support_counts_accel
|
|
106
107
|
|
|
107
108
|
if TYPE_CHECKING:
|
|
@@ -590,13 +591,37 @@ class GSP:
|
|
|
590
591
|
"""
|
|
591
592
|
logger.info("Run %d: %d candidates filtered to %d.", run, len(candidates), len(self.freq_patterns[run - 1]))
|
|
592
593
|
|
|
594
|
+
@overload
|
|
593
595
|
def search(
|
|
594
596
|
self,
|
|
595
597
|
min_support: float = 0.2,
|
|
596
598
|
max_k: Optional[int] = None,
|
|
597
599
|
backend: Optional[str] = None,
|
|
598
600
|
verbose: Optional[bool] = None,
|
|
599
|
-
|
|
601
|
+
*,
|
|
602
|
+
return_sequences: Literal[False] = False,
|
|
603
|
+
) -> List[Dict[Tuple[str, ...], int]]: ...
|
|
604
|
+
|
|
605
|
+
@overload
|
|
606
|
+
def search(
|
|
607
|
+
self,
|
|
608
|
+
min_support: float = 0.2,
|
|
609
|
+
max_k: Optional[int] = None,
|
|
610
|
+
backend: Optional[str] = None,
|
|
611
|
+
verbose: Optional[bool] = None,
|
|
612
|
+
*,
|
|
613
|
+
return_sequences: Literal[True],
|
|
614
|
+
) -> List[List[Sequence]]: ...
|
|
615
|
+
|
|
616
|
+
def search(
|
|
617
|
+
self,
|
|
618
|
+
min_support: float = 0.2,
|
|
619
|
+
max_k: Optional[int] = None,
|
|
620
|
+
backend: Optional[str] = None,
|
|
621
|
+
verbose: Optional[bool] = None,
|
|
622
|
+
*,
|
|
623
|
+
return_sequences: bool = False,
|
|
624
|
+
) -> Union[List[Dict[Tuple[str, ...], int]], List[List[Sequence]]]:
|
|
600
625
|
"""
|
|
601
626
|
Execute the Generalized Sequential Pattern (GSP) mining algorithm.
|
|
602
627
|
|
|
@@ -617,11 +642,20 @@ class GSP:
|
|
|
617
642
|
Note: temporal constraints always use Python backend.
|
|
618
643
|
verbose (Optional[bool]): Override instance verbosity setting for this search.
|
|
619
644
|
If None, uses the instance's verbose setting.
|
|
645
|
+
return_sequences (bool): If True, returns patterns as Sequence objects instead of
|
|
646
|
+
Dict[Tuple[str, ...], int]. Defaults to False for backward
|
|
647
|
+
compatibility. When True, returns List[List[Sequence]] where
|
|
648
|
+
each Sequence contains items, support count, and can be extended
|
|
649
|
+
with additional metadata.
|
|
620
650
|
|
|
621
651
|
Returns:
|
|
622
|
-
List[Dict[Tuple[str, ...], int]]:
|
|
623
|
-
|
|
624
|
-
|
|
652
|
+
Union[List[Dict[Tuple[str, ...], int]], List[List[Sequence]]]:
|
|
653
|
+
If return_sequences is False (default):
|
|
654
|
+
A list of dictionaries containing frequent patterns at each k-sequence level,
|
|
655
|
+
with patterns as keys and their support counts as values.
|
|
656
|
+
If return_sequences is True:
|
|
657
|
+
A list of lists containing Sequence objects at each k-sequence level,
|
|
658
|
+
where each Sequence encapsulates the pattern items and support count.
|
|
625
659
|
|
|
626
660
|
Raises:
|
|
627
661
|
ValueError: If the minimum support threshold is not in the range `(0.0, 1.0]`.
|
|
@@ -632,7 +666,7 @@ class GSP:
|
|
|
632
666
|
- Status updates for each iteration until the algorithm terminates.
|
|
633
667
|
|
|
634
668
|
Examples:
|
|
635
|
-
Basic usage without temporal constraints:
|
|
669
|
+
Basic usage without temporal constraints (default tuple-based):
|
|
636
670
|
|
|
637
671
|
```python
|
|
638
672
|
from gsppy.gsp import GSP
|
|
@@ -645,6 +679,28 @@ class GSP:
|
|
|
645
679
|
|
|
646
680
|
gsp = GSP(transactions)
|
|
647
681
|
patterns = gsp.search(min_support=0.3)
|
|
682
|
+
# Returns: [{('Bread',): 4, ('Milk',): 4, ...}, {('Bread', 'Milk'): 3, ...}, ...]
|
|
683
|
+
```
|
|
684
|
+
|
|
685
|
+
Using Sequence objects for richer pattern representation:
|
|
686
|
+
|
|
687
|
+
```python
|
|
688
|
+
from gsppy.gsp import GSP
|
|
689
|
+
|
|
690
|
+
transactions = [
|
|
691
|
+
["Bread", "Milk"],
|
|
692
|
+
["Bread", "Diaper", "Beer", "Eggs"],
|
|
693
|
+
["Milk", "Diaper", "Beer", "Coke"],
|
|
694
|
+
]
|
|
695
|
+
|
|
696
|
+
gsp = GSP(transactions)
|
|
697
|
+
patterns = gsp.search(min_support=0.3, return_sequences=True)
|
|
698
|
+
# Returns: [[Sequence(('Bread',), support=4), Sequence(('Milk',), support=4), ...], ...]
|
|
699
|
+
|
|
700
|
+
# Access pattern details
|
|
701
|
+
for level_patterns in patterns:
|
|
702
|
+
for seq in level_patterns:
|
|
703
|
+
print(f"Pattern: {seq.items}, Support: {seq.support}")
|
|
648
704
|
```
|
|
649
705
|
|
|
650
706
|
Usage with temporal constraints (requires timestamped transactions):
|
|
@@ -682,6 +738,9 @@ class GSP:
|
|
|
682
738
|
f"Using temporal constraints: mingap={self.mingap}, maxgap={self.maxgap}, maxspan={self.maxspan}"
|
|
683
739
|
)
|
|
684
740
|
|
|
741
|
+
# Clear freq_patterns for this search (allow reusing the GSP instance)
|
|
742
|
+
self.freq_patterns = []
|
|
743
|
+
|
|
685
744
|
# Convert fractional support to absolute count (ceil to preserve threshold semantics)
|
|
686
745
|
abs_min_support = int(math.ceil(len(self.transactions) * float(min_support)))
|
|
687
746
|
|
|
@@ -729,4 +788,9 @@ class GSP:
|
|
|
729
788
|
self.verbose = original_verbose
|
|
730
789
|
self._configure_logging()
|
|
731
790
|
|
|
732
|
-
|
|
791
|
+
# Return results in the requested format
|
|
792
|
+
result = self.freq_patterns[:-1]
|
|
793
|
+
if return_sequences:
|
|
794
|
+
# Convert Dict[Tuple[str, ...], int] to List[Sequence] for each level
|
|
795
|
+
return [dict_to_sequences(level_patterns) for level_patterns in result]
|
|
796
|
+
return result
|
gsppy/sequence.py
ADDED
|
@@ -0,0 +1,371 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Sequence abstraction for GSP-Py algorithms.
|
|
3
|
+
|
|
4
|
+
This module provides a Sequence class that encapsulates a pattern's elements,
|
|
5
|
+
associated transaction indices or counts, and any extra metadata (such as pattern
|
|
6
|
+
support, provenance, or timestamps). This abstraction enhances maintainability,
|
|
7
|
+
clarity, and future extensibility of GSP-Py's core logic.
|
|
8
|
+
|
|
9
|
+
The Sequence class is designed to be:
|
|
10
|
+
- Immutable and hashable (can be used as dictionary keys)
|
|
11
|
+
- Backward compatible with tuple representations
|
|
12
|
+
- Efficient for multiprocessing (pickleable)
|
|
13
|
+
- Extensible for future metadata additions
|
|
14
|
+
|
|
15
|
+
Author: Jackson Antonio do Prado Lima
|
|
16
|
+
Email: jacksonpradolima@gmail.com
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
from typing import Any, Dict, List, Tuple, Union, Iterator, Optional, cast
|
|
22
|
+
from dataclasses import field, dataclass
|
|
23
|
+
from typing_extensions import override
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass(frozen=True, slots=True)
|
|
27
|
+
class Sequence:
|
|
28
|
+
"""
|
|
29
|
+
Represents a sequential pattern with associated metadata.
|
|
30
|
+
|
|
31
|
+
This class encapsulates a pattern (sequence of items) along with its
|
|
32
|
+
support count, transaction indices, and optional temporal metadata.
|
|
33
|
+
The class is immutable and hashable, allowing it to be used as dictionary
|
|
34
|
+
keys while providing a richer interface than bare tuples.
|
|
35
|
+
|
|
36
|
+
Attributes:
|
|
37
|
+
items (Tuple[str, ...]): The pattern elements as an immutable tuple.
|
|
38
|
+
support (int): The support count (number of transactions containing this pattern).
|
|
39
|
+
Defaults to 0 for candidate sequences not yet evaluated.
|
|
40
|
+
transaction_indices (Optional[Tuple[int, ...]]): Indices of transactions that
|
|
41
|
+
contain this pattern. Optional
|
|
42
|
+
as it may not always be tracked
|
|
43
|
+
to save memory.
|
|
44
|
+
metadata (Optional[dict]): Additional metadata such as timestamps, confidence,
|
|
45
|
+
lift, or other pattern-specific information.
|
|
46
|
+
|
|
47
|
+
Examples:
|
|
48
|
+
Create a simple sequence:
|
|
49
|
+
>>> seq = Sequence(items=("A", "B", "C"), support=5)
|
|
50
|
+
>>> seq.length
|
|
51
|
+
3
|
|
52
|
+
>>> seq.items
|
|
53
|
+
('A', 'B', 'C')
|
|
54
|
+
|
|
55
|
+
Create from tuple for backward compatibility:
|
|
56
|
+
>>> seq = Sequence.from_tuple(("A", "B"))
|
|
57
|
+
>>> seq.items
|
|
58
|
+
('A', 'B')
|
|
59
|
+
|
|
60
|
+
Use as dictionary key:
|
|
61
|
+
>>> patterns = {seq: 10}
|
|
62
|
+
>>> seq in patterns
|
|
63
|
+
True
|
|
64
|
+
"""
|
|
65
|
+
|
|
66
|
+
items: Tuple[str, ...]
|
|
67
|
+
support: int = 0
|
|
68
|
+
transaction_indices: Optional[Tuple[int, ...]] = None
|
|
69
|
+
metadata: Optional[Dict[str, Any]] = field(default=None, compare=False, hash=False)
|
|
70
|
+
|
|
71
|
+
def __post_init__(self) -> None:
|
|
72
|
+
"""Validate the sequence after initialization."""
|
|
73
|
+
# Normalize items to tuple for immutability
|
|
74
|
+
object.__setattr__(self, "items", tuple(self.items))
|
|
75
|
+
|
|
76
|
+
if not self.items:
|
|
77
|
+
raise ValueError("Sequence items cannot be empty")
|
|
78
|
+
|
|
79
|
+
if self.support < 0:
|
|
80
|
+
raise ValueError("Support count cannot be negative")
|
|
81
|
+
|
|
82
|
+
if self.transaction_indices is not None:
|
|
83
|
+
object.__setattr__(self, "transaction_indices", tuple(self.transaction_indices))
|
|
84
|
+
|
|
85
|
+
@property
|
|
86
|
+
def length(self) -> int:
|
|
87
|
+
"""Return the length of the sequence (number of items)."""
|
|
88
|
+
return len(self.items)
|
|
89
|
+
|
|
90
|
+
@property
|
|
91
|
+
def first_item(self) -> str:
|
|
92
|
+
"""Return the first item in the sequence."""
|
|
93
|
+
return self.items[0]
|
|
94
|
+
|
|
95
|
+
@property
|
|
96
|
+
def last_item(self) -> str:
|
|
97
|
+
"""Return the last item in the sequence."""
|
|
98
|
+
return self.items[-1]
|
|
99
|
+
|
|
100
|
+
def as_tuple(self) -> Tuple[str, ...]:
|
|
101
|
+
"""
|
|
102
|
+
Return the pattern as a plain tuple for backward compatibility.
|
|
103
|
+
|
|
104
|
+
Returns:
|
|
105
|
+
Tuple[str, ...]: The sequence items as a tuple.
|
|
106
|
+
"""
|
|
107
|
+
return self.items
|
|
108
|
+
|
|
109
|
+
@classmethod
|
|
110
|
+
def from_tuple(
|
|
111
|
+
cls,
|
|
112
|
+
items: Tuple[str, ...],
|
|
113
|
+
support: int = 0,
|
|
114
|
+
transaction_indices: Optional[Tuple[int, ...]] = None,
|
|
115
|
+
metadata: Optional[Dict[str, Any]] = None,
|
|
116
|
+
) -> Sequence:
|
|
117
|
+
"""
|
|
118
|
+
Create a Sequence from a tuple of items.
|
|
119
|
+
|
|
120
|
+
This is a convenience method for backward compatibility with code
|
|
121
|
+
that uses plain tuples to represent patterns.
|
|
122
|
+
|
|
123
|
+
Parameters:
|
|
124
|
+
items (Tuple[str, ...]): The pattern elements.
|
|
125
|
+
support (int): The support count. Defaults to 0.
|
|
126
|
+
transaction_indices (Optional[Tuple[int, ...]]): Transaction indices.
|
|
127
|
+
metadata (Optional[dict]): Additional metadata.
|
|
128
|
+
|
|
129
|
+
Returns:
|
|
130
|
+
Sequence: A new Sequence instance.
|
|
131
|
+
|
|
132
|
+
Examples:
|
|
133
|
+
>>> seq = Sequence.from_tuple(("A", "B", "C"), support=5)
|
|
134
|
+
>>> seq.items
|
|
135
|
+
('A', 'B', 'C')
|
|
136
|
+
>>> seq.support
|
|
137
|
+
5
|
|
138
|
+
"""
|
|
139
|
+
return cls(
|
|
140
|
+
items=items,
|
|
141
|
+
support=support,
|
|
142
|
+
transaction_indices=transaction_indices,
|
|
143
|
+
metadata=metadata,
|
|
144
|
+
)
|
|
145
|
+
|
|
146
|
+
@classmethod
|
|
147
|
+
def from_item(cls, item: str, support: int = 0) -> Sequence:
|
|
148
|
+
"""
|
|
149
|
+
Create a singleton Sequence from a single item.
|
|
150
|
+
|
|
151
|
+
Parameters:
|
|
152
|
+
item (str): The single item.
|
|
153
|
+
support (int): The support count. Defaults to 0.
|
|
154
|
+
|
|
155
|
+
Returns:
|
|
156
|
+
Sequence: A new Sequence instance containing only the item.
|
|
157
|
+
|
|
158
|
+
Examples:
|
|
159
|
+
>>> seq = Sequence.from_item("A", support=10)
|
|
160
|
+
>>> seq.items
|
|
161
|
+
('A',)
|
|
162
|
+
>>> seq.length
|
|
163
|
+
1
|
|
164
|
+
"""
|
|
165
|
+
return cls(items=(item,), support=support)
|
|
166
|
+
|
|
167
|
+
def extend(self, item: str, support: int = 0) -> Sequence:
|
|
168
|
+
"""
|
|
169
|
+
Create a new Sequence by extending this one with an additional item.
|
|
170
|
+
|
|
171
|
+
This is used during candidate generation to create k+1 sequences
|
|
172
|
+
from k sequences.
|
|
173
|
+
|
|
174
|
+
Parameters:
|
|
175
|
+
item (str): The item to append.
|
|
176
|
+
support (int): The support count for the new sequence. Defaults to 0.
|
|
177
|
+
|
|
178
|
+
Returns:
|
|
179
|
+
Sequence: A new Sequence with the item appended.
|
|
180
|
+
|
|
181
|
+
Examples:
|
|
182
|
+
>>> seq = Sequence.from_tuple(("A", "B"))
|
|
183
|
+
>>> new_seq = seq.extend("C")
|
|
184
|
+
>>> new_seq.items
|
|
185
|
+
('A', 'B', 'C')
|
|
186
|
+
"""
|
|
187
|
+
return Sequence(
|
|
188
|
+
items=self.items + (item,),
|
|
189
|
+
support=support,
|
|
190
|
+
transaction_indices=None, # New sequence, no indices yet
|
|
191
|
+
metadata=self.metadata.copy() if self.metadata else None,
|
|
192
|
+
)
|
|
193
|
+
|
|
194
|
+
def with_support(self, support: int, transaction_indices: Optional[Tuple[int, ...]] = None) -> Sequence:
|
|
195
|
+
"""
|
|
196
|
+
Create a new Sequence with updated support information.
|
|
197
|
+
|
|
198
|
+
This is used after calculating support to update the sequence
|
|
199
|
+
with its actual support count and optionally transaction indices.
|
|
200
|
+
|
|
201
|
+
Parameters:
|
|
202
|
+
support (int): The new support count.
|
|
203
|
+
transaction_indices (Optional[Tuple[int, ...]]): Transaction indices.
|
|
204
|
+
|
|
205
|
+
Returns:
|
|
206
|
+
Sequence: A new Sequence with updated support information.
|
|
207
|
+
|
|
208
|
+
Examples:
|
|
209
|
+
>>> seq = Sequence.from_tuple(("A", "B"))
|
|
210
|
+
>>> supported_seq = seq.with_support(5, (0, 2, 4))
|
|
211
|
+
>>> supported_seq.support
|
|
212
|
+
5
|
|
213
|
+
"""
|
|
214
|
+
return Sequence(
|
|
215
|
+
items=self.items,
|
|
216
|
+
support=support,
|
|
217
|
+
transaction_indices=transaction_indices,
|
|
218
|
+
metadata=self.metadata,
|
|
219
|
+
)
|
|
220
|
+
|
|
221
|
+
def with_metadata(self, **kwargs: Any) -> Sequence:
|
|
222
|
+
"""
|
|
223
|
+
Create a new Sequence with additional or updated metadata.
|
|
224
|
+
|
|
225
|
+
Parameters:
|
|
226
|
+
**kwargs: Metadata key-value pairs to add or update.
|
|
227
|
+
|
|
228
|
+
Returns:
|
|
229
|
+
Sequence: A new Sequence with updated metadata.
|
|
230
|
+
|
|
231
|
+
Examples:
|
|
232
|
+
>>> seq = Sequence.from_tuple(("A", "B"), support=5)
|
|
233
|
+
>>> seq_with_meta = seq.with_metadata(confidence=0.75, lift=1.2)
|
|
234
|
+
>>> seq_with_meta.metadata
|
|
235
|
+
{'confidence': 0.75, 'lift': 1.2}
|
|
236
|
+
"""
|
|
237
|
+
new_metadata: Dict[str, Any] = (self.metadata or {}).copy()
|
|
238
|
+
new_metadata.update(kwargs)
|
|
239
|
+
return Sequence(
|
|
240
|
+
items=self.items,
|
|
241
|
+
support=self.support,
|
|
242
|
+
transaction_indices=self.transaction_indices,
|
|
243
|
+
metadata=new_metadata,
|
|
244
|
+
)
|
|
245
|
+
|
|
246
|
+
@override
|
|
247
|
+
def __repr__(self) -> str:
|
|
248
|
+
"""Return a string representation of the Sequence."""
|
|
249
|
+
parts = [f"items={self.items}", f"support={self.support}"]
|
|
250
|
+
if self.transaction_indices is not None:
|
|
251
|
+
parts.append(f"transaction_indices={self.transaction_indices}")
|
|
252
|
+
if self.metadata:
|
|
253
|
+
parts.append(f"metadata={self.metadata}")
|
|
254
|
+
return f"Sequence({', '.join(parts)})"
|
|
255
|
+
|
|
256
|
+
@override
|
|
257
|
+
def __str__(self) -> str:
|
|
258
|
+
"""Return a human-readable string representation."""
|
|
259
|
+
return f"{self.items} (support={self.support})"
|
|
260
|
+
|
|
261
|
+
def __len__(self) -> int:
|
|
262
|
+
"""Return the length of the sequence."""
|
|
263
|
+
return len(self.items)
|
|
264
|
+
|
|
265
|
+
def __getitem__(self, index: Union[int, slice]) -> Union[str, Tuple[str, ...]]:
|
|
266
|
+
"""
|
|
267
|
+
Access items by index or slice.
|
|
268
|
+
|
|
269
|
+
Parameters:
|
|
270
|
+
index: Integer index or slice object.
|
|
271
|
+
|
|
272
|
+
Returns:
|
|
273
|
+
str or Tuple[str, ...]: Single item or tuple of items.
|
|
274
|
+
"""
|
|
275
|
+
return self.items[index]
|
|
276
|
+
|
|
277
|
+
def __iter__(self) -> Iterator[str]:
|
|
278
|
+
"""Iterate over the items in the sequence."""
|
|
279
|
+
return iter(self.items)
|
|
280
|
+
|
|
281
|
+
def __contains__(self, item: str) -> bool:
|
|
282
|
+
"""Check if an item is in the sequence."""
|
|
283
|
+
return item in self.items
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
# Utility functions for working with Sequences and tuples
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def sequences_to_dict(sequences: List[Sequence]) -> dict[Tuple[str, ...], int]:
|
|
290
|
+
"""
|
|
291
|
+
Convert a list of Sequence objects to a dictionary mapping tuples to support counts.
|
|
292
|
+
|
|
293
|
+
This function provides backward compatibility with code expecting the
|
|
294
|
+
traditional Dict[Tuple[str, ...], int] format.
|
|
295
|
+
|
|
296
|
+
Parameters:
|
|
297
|
+
sequences (List[Sequence]): List of Sequence objects.
|
|
298
|
+
|
|
299
|
+
Returns:
|
|
300
|
+
dict[Tuple[str, ...], int]: Dictionary mapping pattern tuples to support counts.
|
|
301
|
+
|
|
302
|
+
Examples:
|
|
303
|
+
>>> seqs = [Sequence(("A",), 5), Sequence(("B",), 3)]
|
|
304
|
+
>>> sequences_to_dict(seqs)
|
|
305
|
+
{('A',): 5, ('B',): 3}
|
|
306
|
+
"""
|
|
307
|
+
return {seq.items: seq.support for seq in sequences}
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
def dict_to_sequences(pattern_dict: dict[Tuple[str, ...], int]) -> List[Sequence]:
|
|
311
|
+
"""
|
|
312
|
+
Convert a dictionary of patterns to a list of Sequence objects.
|
|
313
|
+
|
|
314
|
+
This function converts the traditional Dict[Tuple[str, ...], int] format
|
|
315
|
+
to Sequence objects.
|
|
316
|
+
|
|
317
|
+
Parameters:
|
|
318
|
+
pattern_dict (dict[Tuple[str, ...], int]): Dictionary mapping tuples to support.
|
|
319
|
+
|
|
320
|
+
Returns:
|
|
321
|
+
List[Sequence]: List of Sequence objects.
|
|
322
|
+
|
|
323
|
+
Examples:
|
|
324
|
+
>>> patterns = {("A",): 5, ("B",): 3}
|
|
325
|
+
>>> seqs = dict_to_sequences(patterns)
|
|
326
|
+
>>> len(seqs)
|
|
327
|
+
2
|
|
328
|
+
"""
|
|
329
|
+
return [Sequence.from_tuple(items, support=support) for items, support in pattern_dict.items()]
|
|
330
|
+
|
|
331
|
+
|
|
332
|
+
def is_sequence_or_tuple(obj: Any) -> bool:
|
|
333
|
+
"""
|
|
334
|
+
Check if an object is a Sequence instance or a tuple.
|
|
335
|
+
|
|
336
|
+
Parameters:
|
|
337
|
+
obj: Object to check.
|
|
338
|
+
|
|
339
|
+
Returns:
|
|
340
|
+
bool: True if obj is a Sequence or tuple, False otherwise.
|
|
341
|
+
"""
|
|
342
|
+
return isinstance(obj, (Sequence, tuple))
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
def to_sequence(obj: object, support: int = 0) -> Sequence:
|
|
346
|
+
"""
|
|
347
|
+
Convert various input types to a Sequence object.
|
|
348
|
+
|
|
349
|
+
Parameters:
|
|
350
|
+
obj: Input object (Sequence, tuple, or string).
|
|
351
|
+
support: Support count to use if creating a new Sequence.
|
|
352
|
+
|
|
353
|
+
Returns:
|
|
354
|
+
Sequence: A Sequence object.
|
|
355
|
+
|
|
356
|
+
Examples:
|
|
357
|
+
>>> to_sequence(("A", "B"), support=5)
|
|
358
|
+
Sequence(items=('A', 'B'), support=5)
|
|
359
|
+
>>> seq = Sequence(("X",), 3)
|
|
360
|
+
>>> to_sequence(seq)
|
|
361
|
+
Sequence(items=('X',), support=3)
|
|
362
|
+
"""
|
|
363
|
+
if isinstance(obj, Sequence):
|
|
364
|
+
return obj
|
|
365
|
+
if isinstance(obj, tuple):
|
|
366
|
+
if not all(isinstance(item, str) for item in obj): # pyright: ignore[reportUnknownVariableType]
|
|
367
|
+
raise TypeError("Tuple items must be strings")
|
|
368
|
+
return Sequence.from_tuple(cast(Tuple[str, ...], obj), support=support)
|
|
369
|
+
if isinstance(obj, str):
|
|
370
|
+
return Sequence.from_item(obj, support=support)
|
|
371
|
+
raise TypeError(f"Cannot convert {type(obj)} to Sequence")
|
|
{gsppy-4.0.0.dist-info → gsppy-4.1.0.dist-info}/METADATA
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: gsppy
|
|
3
|
-
Version: 4.0.0
|
|
3
|
+
Version: 4.1.0
|
|
4
4
|
Summary: GSP (Generalized Sequence Pattern) algorithm in Python
|
|
5
5
|
Project-URL: Homepage, https://github.com/jacksonpradolima/gsp-py
|
|
6
6
|
Author-email: Jackson Antonio do Prado Lima <jacksonpradolima@gmail.com>
|
|
@@ -559,6 +559,82 @@ Verbose mode provides:
|
|
|
559
559
|
|
|
560
560
|
For complete documentation on logging, see [docs/logging.md](docs/logging.md).
|
|
561
561
|
|
|
562
|
+
### Using Sequence Objects for Rich Pattern Representation
|
|
563
|
+
|
|
564
|
+
GSP-Py 4.0+ introduces a **Sequence abstraction class** that provides a richer, more maintainable way to work with sequential patterns. The Sequence class encapsulates pattern items, support counts, and optional metadata in an immutable, hashable object.
|
|
565
|
+
|
|
566
|
+
#### Traditional Dict-based Output (Default)
|
|
567
|
+
|
|
568
|
+
```python
|
|
569
|
+
from gsppy import GSP
|
|
570
|
+
|
|
571
|
+
transactions = [
|
|
572
|
+
['Bread', 'Milk'],
|
|
573
|
+
['Bread', 'Diaper', 'Beer', 'Eggs'],
|
|
574
|
+
['Milk', 'Diaper', 'Beer', 'Coke']
|
|
575
|
+
]
|
|
576
|
+
|
|
577
|
+
gsp = GSP(transactions)
|
|
578
|
+
result = gsp.search(min_support=0.3)
|
|
579
|
+
|
|
580
|
+
# Returns: [{('Bread',): 4, ('Milk',): 4, ...}, {('Bread', 'Milk'): 3, ...}, ...]
|
|
581
|
+
for level_patterns in result:
|
|
582
|
+
for pattern, support in level_patterns.items():
|
|
583
|
+
print(f"Pattern: {pattern}, Support: {support}")
|
|
584
|
+
```
|
|
585
|
+
|
|
586
|
+
#### Sequence Objects (New Feature)
|
|
587
|
+
|
|
588
|
+
```python
|
|
589
|
+
from gsppy import GSP
|
|
590
|
+
|
|
591
|
+
transactions = [
|
|
592
|
+
['Bread', 'Milk'],
|
|
593
|
+
['Bread', 'Diaper', 'Beer', 'Eggs'],
|
|
594
|
+
['Milk', 'Diaper', 'Beer', 'Coke']
|
|
595
|
+
]
|
|
596
|
+
|
|
597
|
+
gsp = GSP(transactions)
|
|
598
|
+
result = gsp.search(min_support=0.3, return_sequences=True)
|
|
599
|
+
|
|
600
|
+
# Returns: [[Sequence(('Bread',), support=4), ...], [Sequence(('Bread', 'Milk'), support=3), ...], ...]
|
|
601
|
+
for level_patterns in result:
|
|
602
|
+
for seq in level_patterns:
|
|
603
|
+
print(f"Pattern: {seq.items}, Support: {seq.support}, Length: {seq.length}")
|
|
604
|
+
# Access sequence properties
|
|
605
|
+
print(f" First item: {seq.first_item}, Last item: {seq.last_item}")
|
|
606
|
+
# Check if item is in sequence
|
|
607
|
+
if "Milk" in seq:
|
|
608
|
+
print(f" Contains Milk!")
|
|
609
|
+
```
|
|
610
|
+
|
|
611
|
+
#### Key Benefits of Sequence Objects
|
|
612
|
+
|
|
613
|
+
1. **Rich API**: Access pattern properties like `length`, `first_item`, `last_item`
|
|
614
|
+
2. **Type Safety**: IDE autocomplete and better type hints
|
|
615
|
+
3. **Immutable & Hashable**: Can be used as dictionary keys
|
|
616
|
+
4. **Extensible**: Add metadata for confidence, lift, or custom properties
|
|
617
|
+
5. **Backward Compatible**: Convert to/from dict format as needed
|
|
618
|
+
|
|
619
|
+
```python
|
|
620
|
+
from gsppy import Sequence, sequences_to_dict, dict_to_sequences
|
|
621
|
+
|
|
622
|
+
# Create custom sequences
|
|
623
|
+
seq = Sequence.from_tuple(("A", "B", "C"), support=5)
|
|
624
|
+
|
|
625
|
+
# Extend sequences
|
|
626
|
+
extended = seq.extend("D") # Creates Sequence(("A", "B", "C", "D"))
|
|
627
|
+
|
|
628
|
+
# Add metadata
|
|
629
|
+
seq_with_meta = seq.with_metadata(confidence=0.85, lift=1.5)
|
|
630
|
+
|
|
631
|
+
# Convert between formats for compatibility
|
|
632
|
+
seq_result = gsp.search(min_support=0.3, return_sequences=True)
|
|
633
|
+
dict_format = sequences_to_dict(seq_result[0]) # Convert to dict
|
|
634
|
+
```
|
|
635
|
+
|
|
636
|
+
For a complete example, see [examples/sequence_example.py](examples/sequence_example.py).
|
|
637
|
+
|
|
562
638
|
### Loading SPM/GSP Format Files
|
|
563
639
|
|
|
564
640
|
GSP-Py supports loading datasets in the classical SPM/GSP delimiter format, which is widely used in sequential pattern mining research. This format uses:
|
|
{gsppy-4.0.0.dist-info → gsppy-4.1.0.dist-info}/RECORD
@@ -1,15 +1,16 @@
|
|
|
1
|
-
gsppy/__init__.py,sha256=
|
|
1
|
+
gsppy/__init__.py,sha256=2RKQOByljT55UYgCJrSLSMNlMyu0AjsQtvvqUaWahyA,2444
|
|
2
2
|
gsppy/accelerate.py,sha256=rDho3ysADETpuhT2SF9voBjd3XRaQUzuA5k_baNACF8,11020
|
|
3
|
-
gsppy/cli.py,sha256=
|
|
3
|
+
gsppy/cli.py,sha256=dn5V5mcun0Hu3A5IgtBbJp3nZs4t21zYOenTsqktJY8,22424
|
|
4
4
|
gsppy/dataframe_adapters.py,sha256=urAu32a4YsMRnm0yGvxT_XrRHfB_EYWClHH2f4OHH8w,15773
|
|
5
5
|
gsppy/enums.py,sha256=2LxMWGJNWMgjhCWv_nzKWXi4iHU1S12qns3DpBUraAw,1265
|
|
6
|
-
gsppy/gsp.py,sha256=
|
|
6
|
+
gsppy/gsp.py,sha256=uFnTNrEkWinBUXYqv0eM2M0bpB2CibKo_yIfh4RAYaM,35429
|
|
7
7
|
gsppy/pruning.py,sha256=hOoQoH1k_gzACBy6qr_cvwth9WDmKuLmJyVRDbHjFFM,14779
|
|
8
8
|
gsppy/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
9
|
+
gsppy/sequence.py,sha256=BBtaThzlEgtrL5eMXNn4fge-VkniLUT1TeGOL6hbpSg,12353
|
|
9
10
|
gsppy/token_mapper.py,sha256=JhPe_IZMnbM3GzdQwzleYIkE2aVw01QoYdG1TXWoCqw,2818
|
|
10
11
|
gsppy/utils.py,sha256=Ys5B9aJxJBCEXe51HK00nq3-Yf7fIGntoOzSvxSFlro,17592
|
|
11
|
-
gsppy-4.
|
|
12
|
-
gsppy-4.
|
|
13
|
-
gsppy-4.
|
|
14
|
-
gsppy-4.
|
|
15
|
-
gsppy-4.
|
|
12
|
+
gsppy-4.1.0.dist-info/METADATA,sha256=6KTEiLxg67lhyVzdCm_VnBL9h8pYuqYLOQZhzu2kPXc,45593
|
|
13
|
+
gsppy-4.1.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
14
|
+
gsppy-4.1.0.dist-info/entry_points.txt,sha256=smvmcIWk424ARIGKOC_BM42hpT_SptKPcIeqs-8u8lM,41
|
|
15
|
+
gsppy-4.1.0.dist-info/licenses/LICENSE,sha256=AlXanKSqFzo_o-87gp3Qw3XzbmnfxYy7O0xJOcQGWJo,1086
|
|
16
|
+
gsppy-4.1.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|