gsppy 3.5.0__py3-none-any.whl → 3.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gsppy/__init__.py +14 -0
- gsppy/gsp.py +55 -2
- gsppy/pruning.py +412 -0
- {gsppy-3.5.0.dist-info → gsppy-3.6.0.dist-info}/METADATA +137 -5
- gsppy-3.6.0.dist-info/RECORD +12 -0
- gsppy-3.5.0.dist-info/RECORD +0 -11
- {gsppy-3.5.0.dist-info → gsppy-3.6.0.dist-info}/WHEEL +0 -0
- {gsppy-3.5.0.dist-info → gsppy-3.6.0.dist-info}/entry_points.txt +0 -0
- {gsppy-3.5.0.dist-info → gsppy-3.6.0.dist-info}/licenses/LICENSE +0 -0
gsppy/__init__.py
CHANGED
|
@@ -13,6 +13,14 @@ from gsppy.cli import (
|
|
|
13
13
|
read_transactions_from_json,
|
|
14
14
|
)
|
|
15
15
|
from gsppy.gsp import GSP
|
|
16
|
+
from gsppy.pruning import (
|
|
17
|
+
PruningStrategy,
|
|
18
|
+
SupportBasedPruning,
|
|
19
|
+
FrequencyBasedPruning,
|
|
20
|
+
TemporalAwarePruning,
|
|
21
|
+
CombinedPruning,
|
|
22
|
+
create_default_pruning_strategy,
|
|
23
|
+
)
|
|
16
24
|
|
|
17
25
|
try:
|
|
18
26
|
__version__ = importlib_metadata.version("gsppy")
|
|
@@ -26,4 +34,10 @@ __all__ = [
|
|
|
26
34
|
"read_transactions_from_json",
|
|
27
35
|
"setup_logging",
|
|
28
36
|
"__version__",
|
|
37
|
+
"PruningStrategy",
|
|
38
|
+
"SupportBasedPruning",
|
|
39
|
+
"FrequencyBasedPruning",
|
|
40
|
+
"TemporalAwarePruning",
|
|
41
|
+
"CombinedPruning",
|
|
42
|
+
"create_default_pruning_strategy",
|
|
29
43
|
]
|
gsppy/gsp.py
CHANGED
|
@@ -99,6 +99,7 @@ from gsppy.utils import (
|
|
|
99
99
|
generate_candidates_from_previous,
|
|
100
100
|
is_subsequence_in_list_with_time_constraints,
|
|
101
101
|
)
|
|
102
|
+
from gsppy.pruning import PruningStrategy, create_default_pruning_strategy
|
|
102
103
|
from gsppy.accelerate import support_counts as support_counts_accel
|
|
103
104
|
|
|
104
105
|
logger: logging.Logger = logging.getLogger(__name__)
|
|
@@ -130,6 +131,7 @@ class GSP:
|
|
|
130
131
|
maxgap: Optional[float] = None,
|
|
131
132
|
maxspan: Optional[float] = None,
|
|
132
133
|
verbose: bool = False,
|
|
134
|
+
pruning_strategy: Optional[PruningStrategy] = None,
|
|
133
135
|
):
|
|
134
136
|
"""
|
|
135
137
|
Initialize the GSP algorithm with raw transactional data.
|
|
@@ -144,6 +146,9 @@ class GSP:
|
|
|
144
146
|
maxspan (Optional[float]): Maximum time span from first to last item in patterns.
|
|
145
147
|
verbose (bool): Enable verbose logging output with detailed progress information.
|
|
146
148
|
Default is False (minimal output).
|
|
149
|
+
pruning_strategy (Optional[PruningStrategy]): Custom pruning strategy for candidate filtering.
|
|
150
|
+
If None, a default strategy is created based on
|
|
151
|
+
temporal constraints.
|
|
147
152
|
|
|
148
153
|
Attributes Initialized:
|
|
149
154
|
- Processes the input raw transaction dataset.
|
|
@@ -162,9 +167,18 @@ class GSP:
|
|
|
162
167
|
self.maxgap = maxgap
|
|
163
168
|
self.maxspan = maxspan
|
|
164
169
|
self.verbose = verbose
|
|
170
|
+
self.pruning_strategy: PruningStrategy
|
|
165
171
|
self._configure_logging()
|
|
166
172
|
self._validate_temporal_constraints()
|
|
167
173
|
self._pre_processing(raw_transactions)
|
|
174
|
+
# Initialize default pruning strategy if none provided
|
|
175
|
+
if pruning_strategy is None:
|
|
176
|
+
self.pruning_strategy = create_default_pruning_strategy(
|
|
177
|
+
mingap=self.mingap, maxgap=self.maxgap, maxspan=self.maxspan
|
|
178
|
+
)
|
|
179
|
+
logger.debug("Using default pruning strategy: %s", self.pruning_strategy.get_description())
|
|
180
|
+
else:
|
|
181
|
+
self.pruning_strategy = pruning_strategy
|
|
168
182
|
|
|
169
183
|
def _configure_logging(self) -> None:
|
|
170
184
|
"""
|
|
@@ -389,6 +403,39 @@ class GSP:
|
|
|
389
403
|
# Fallback to Python implementation on any acceleration failure
|
|
390
404
|
return self._support_python(items, min_support, batch_size)
|
|
391
405
|
|
|
406
|
+
def _apply_pruning(
|
|
407
|
+
self, freq_patterns: Dict[Tuple[str, ...], int], min_support_count: int
|
|
408
|
+
) -> Dict[Tuple[str, ...], int]:
|
|
409
|
+
"""
|
|
410
|
+
Apply the configured pruning strategy to filter frequent patterns.
|
|
411
|
+
|
|
412
|
+
This method uses the pruning strategy to post-process patterns that have
|
|
413
|
+
already met the minimum support threshold. Additional pruning can be applied
|
|
414
|
+
based on other criteria such as temporal feasibility or frequency thresholds.
|
|
415
|
+
|
|
416
|
+
Parameters:
|
|
417
|
+
freq_patterns (Dict[Tuple[str, ...], int]): Dictionary of patterns and their support counts.
|
|
418
|
+
min_support_count (int): Absolute minimum support count threshold.
|
|
419
|
+
|
|
420
|
+
Returns:
|
|
421
|
+
Dict[Tuple[str, ...], int]: Filtered patterns after applying pruning strategy.
|
|
422
|
+
"""
|
|
423
|
+
if not freq_patterns:
|
|
424
|
+
return freq_patterns
|
|
425
|
+
|
|
426
|
+
pruned_patterns: Dict[Tuple[str, ...], int] = {}
|
|
427
|
+
context = {"min_support_count": min_support_count}
|
|
428
|
+
|
|
429
|
+
for candidate, support_count in freq_patterns.items():
|
|
430
|
+
if not self.pruning_strategy.should_prune(candidate, support_count, len(self.transactions), context):
|
|
431
|
+
pruned_patterns[candidate] = support_count
|
|
432
|
+
|
|
433
|
+
num_pruned = len(freq_patterns) - len(pruned_patterns)
|
|
434
|
+
if num_pruned > 0:
|
|
435
|
+
logger.debug("Pruning strategy filtered out %d additional candidates", num_pruned)
|
|
436
|
+
|
|
437
|
+
return pruned_patterns
|
|
438
|
+
|
|
392
439
|
def _print_status(self, run: int, candidates: List[Tuple[str, ...]]) -> None:
|
|
393
440
|
"""
|
|
394
441
|
Log progress information for the current GSP iteration.
|
|
@@ -504,7 +551,10 @@ class GSP:
|
|
|
504
551
|
|
|
505
552
|
# scan transactions to collect support count for each candidate
|
|
506
553
|
# sequence & filter
|
|
507
|
-
self.freq_patterns.append(self._support(candidates, abs_min_support, backend=backend))
|
|
554
|
+
freq_1 = self._support(candidates, abs_min_support, backend=backend)
|
|
555
|
+
# Apply pruning strategy for additional filtering
|
|
556
|
+
freq_1 = self._apply_pruning(freq_1, abs_min_support)
|
|
557
|
+
self.freq_patterns.append(freq_1)
|
|
508
558
|
|
|
509
559
|
# (k-itemsets/k-sequence = 1)
|
|
510
560
|
k_items = 1
|
|
@@ -525,7 +575,10 @@ class GSP:
|
|
|
525
575
|
|
|
526
576
|
# candidate pruning - eliminates candidates who are not potentially
|
|
527
577
|
# frequent (using support as threshold)
|
|
528
|
-
self.freq_patterns.append(self._support(candidates, abs_min_support, backend=backend))
|
|
578
|
+
freq_k = self._support(candidates, abs_min_support, backend=backend)
|
|
579
|
+
# Apply pruning strategy for additional filtering
|
|
580
|
+
freq_k = self._apply_pruning(freq_k, abs_min_support)
|
|
581
|
+
self.freq_patterns.append(freq_k)
|
|
529
582
|
|
|
530
583
|
self._print_status(k_items, candidates)
|
|
531
584
|
logger.info("GSP algorithm completed.")
|
gsppy/pruning.py
ADDED
|
@@ -0,0 +1,412 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Flexible candidate pruning strategies for the GSP algorithm.
|
|
3
|
+
|
|
4
|
+
This module provides a pluggable pruning system that allows different strategies
|
|
5
|
+
for filtering candidate sequences during pattern mining. The pruning strategies
|
|
6
|
+
can significantly impact performance and pattern discovery based on dataset
|
|
7
|
+
characteristics and mining requirements.
|
|
8
|
+
|
|
9
|
+
Key Features:
|
|
10
|
+
-------------
|
|
11
|
+
1. **Abstract Pruning Strategy Interface**:
|
|
12
|
+
- Defines a common interface for all pruning strategies.
|
|
13
|
+
- Allows custom pruning logic to be easily integrated.
|
|
14
|
+
|
|
15
|
+
2. **Built-in Pruning Strategies**:
|
|
16
|
+
- **SupportBasedPruning**: Standard GSP pruning based on minimum support threshold.
|
|
17
|
+
- **FrequencyBasedPruning**: Prunes candidates with low absolute frequency.
|
|
18
|
+
- **TemporalAwarePruning**: Prunes candidates that violate temporal constraints.
|
|
19
|
+
- **CombinedPruning**: Combines multiple pruning strategies.
|
|
20
|
+
|
|
21
|
+
3. **Performance Optimization**:
|
|
22
|
+
- Early termination of candidate generation when patterns cannot be extended.
|
|
23
|
+
- Reduces memory footprint by eliminating non-promising candidates early.
|
|
24
|
+
|
|
25
|
+
Example Usage:
|
|
26
|
+
--------------
|
|
27
|
+
```python
|
|
28
|
+
from gsppy.gsp import GSP
|
|
29
|
+
from gsppy.pruning import SupportBasedPruning, FrequencyBasedPruning, CombinedPruning
|
|
30
|
+
|
|
31
|
+
# Use default support-based pruning
|
|
32
|
+
gsp = GSP(transactions)
|
|
33
|
+
patterns = gsp.search(min_support=0.3)
|
|
34
|
+
|
|
35
|
+
# Use frequency-based pruning with a minimum frequency threshold
|
|
36
|
+
pruner = FrequencyBasedPruning(min_frequency=5)
|
|
37
|
+
gsp = GSP(transactions, pruning_strategy=pruner)
|
|
38
|
+
patterns = gsp.search(min_support=0.3)
|
|
39
|
+
|
|
40
|
+
# Combine multiple pruning strategies
|
|
41
|
+
combined = CombinedPruning([SupportBasedPruning(), FrequencyBasedPruning(min_frequency=3)])
|
|
42
|
+
gsp = GSP(transactions, pruning_strategy=combined)
|
|
43
|
+
patterns = gsp.search(min_support=0.3)
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
Author:
|
|
47
|
+
-------
|
|
48
|
+
- **Developed by:** Jackson Antonio do Prado Lima
|
|
49
|
+
- **Email:** jacksonpradolima@gmail.com
|
|
50
|
+
|
|
51
|
+
License:
|
|
52
|
+
--------
|
|
53
|
+
This implementation is distributed under the MIT License.
|
|
54
|
+
"""
|
|
55
|
+
|
|
56
|
+
import math
|
|
57
|
+
from abc import ABC, abstractmethod
|
|
58
|
+
from typing import List, Tuple, Mapping, Optional
|
|
59
|
+
from typing_extensions import override
|
|
60
|
+
|
|
61
|
+
PruningContext = Mapping[str, object]
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class PruningStrategy(ABC):
|
|
65
|
+
"""
|
|
66
|
+
Abstract base class for candidate pruning strategies.
|
|
67
|
+
|
|
68
|
+
A pruning strategy determines which candidate sequences should be
|
|
69
|
+
filtered out during the GSP algorithm's candidate generation phase.
|
|
70
|
+
Custom pruning strategies can be implemented by subclassing this class
|
|
71
|
+
and implementing the `should_prune` method.
|
|
72
|
+
"""
|
|
73
|
+
|
|
74
|
+
@abstractmethod
|
|
75
|
+
def should_prune(
|
|
76
|
+
self,
|
|
77
|
+
candidate: Tuple[str, ...],
|
|
78
|
+
support_count: int,
|
|
79
|
+
total_transactions: int,
|
|
80
|
+
context: Optional[PruningContext] = None,
|
|
81
|
+
) -> bool:
|
|
82
|
+
"""
|
|
83
|
+
Determine whether a candidate sequence should be pruned.
|
|
84
|
+
|
|
85
|
+
Parameters:
|
|
86
|
+
candidate (Tuple[str, ...]): The candidate sequence to evaluate.
|
|
87
|
+
support_count (int): The support count of the candidate in the dataset.
|
|
88
|
+
total_transactions (int): Total number of transactions in the dataset.
|
|
89
|
+
context (Optional[Dict]): Additional context information for pruning decisions.
|
|
90
|
+
May include temporal constraints, pattern length, etc.
|
|
91
|
+
|
|
92
|
+
Returns:
|
|
93
|
+
bool: True if the candidate should be pruned (filtered out), False otherwise.
|
|
94
|
+
"""
|
|
95
|
+
pass
|
|
96
|
+
|
|
97
|
+
def get_description(self) -> str:
|
|
98
|
+
"""
|
|
99
|
+
Get a human-readable description of the pruning strategy.
|
|
100
|
+
|
|
101
|
+
Returns:
|
|
102
|
+
str: Description of the pruning strategy.
|
|
103
|
+
"""
|
|
104
|
+
return self.__class__.__name__
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
class SupportBasedPruning(PruningStrategy):
|
|
108
|
+
"""
|
|
109
|
+
Standard GSP pruning based on minimum support threshold.
|
|
110
|
+
|
|
111
|
+
This is the default pruning strategy used in the classic GSP algorithm.
|
|
112
|
+
Candidates are pruned if their support count is below the minimum support
|
|
113
|
+
threshold.
|
|
114
|
+
|
|
115
|
+
Parameters:
|
|
116
|
+
min_support_fraction (Optional[float]): Minimum support as a fraction (0.0, 1.0].
|
|
117
|
+
If None, uses the value from search parameters.
|
|
118
|
+
"""
|
|
119
|
+
|
|
120
|
+
def __init__(self, min_support_fraction: Optional[float] = None):
|
|
121
|
+
"""
|
|
122
|
+
Initialize support-based pruning strategy.
|
|
123
|
+
|
|
124
|
+
Parameters:
|
|
125
|
+
min_support_fraction (Optional[float]): Minimum support threshold.
|
|
126
|
+
If None, uses the value from search.
|
|
127
|
+
"""
|
|
128
|
+
self.min_support_fraction = min_support_fraction
|
|
129
|
+
|
|
130
|
+
@override
|
|
131
|
+
def should_prune(
|
|
132
|
+
self,
|
|
133
|
+
candidate: Tuple[str, ...],
|
|
134
|
+
support_count: int,
|
|
135
|
+
total_transactions: int,
|
|
136
|
+
context: Optional[PruningContext] = None,
|
|
137
|
+
) -> bool:
|
|
138
|
+
"""
|
|
139
|
+
Prune candidates below the minimum support threshold.
|
|
140
|
+
|
|
141
|
+
Parameters:
|
|
142
|
+
candidate: The candidate sequence.
|
|
143
|
+
support_count: Support count of the candidate.
|
|
144
|
+
total_transactions: Total number of transactions.
|
|
145
|
+
context: Optional context with 'min_support_count' key.
|
|
146
|
+
|
|
147
|
+
Returns:
|
|
148
|
+
bool: True if support_count < min_support_count, False otherwise.
|
|
149
|
+
"""
|
|
150
|
+
# Prioritize user-provided min_support_fraction if set, otherwise use context
|
|
151
|
+
min_support_count: int
|
|
152
|
+
|
|
153
|
+
if self.min_support_fraction is not None:
|
|
154
|
+
min_support_count = int(math.ceil(total_transactions * self.min_support_fraction))
|
|
155
|
+
elif context is not None:
|
|
156
|
+
min_support_value = context.get("min_support_count")
|
|
157
|
+
if isinstance(min_support_value, int):
|
|
158
|
+
min_support_count = min_support_value
|
|
159
|
+
elif isinstance(min_support_value, float):
|
|
160
|
+
min_support_count = int(math.ceil(min_support_value))
|
|
161
|
+
else:
|
|
162
|
+
# Context does not provide a usable threshold
|
|
163
|
+
return False
|
|
164
|
+
else:
|
|
165
|
+
# If no threshold specified, don't prune
|
|
166
|
+
return False
|
|
167
|
+
|
|
168
|
+
return support_count < min_support_count
|
|
169
|
+
|
|
170
|
+
@override
|
|
171
|
+
def get_description(self) -> str:
|
|
172
|
+
"""Get description of this pruning strategy."""
|
|
173
|
+
if self.min_support_fraction is not None:
|
|
174
|
+
return f"SupportBasedPruning(min_support={self.min_support_fraction})"
|
|
175
|
+
return "SupportBasedPruning(dynamic)"
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
class FrequencyBasedPruning(PruningStrategy):
|
|
179
|
+
"""
|
|
180
|
+
Prunes candidates based on absolute frequency threshold.
|
|
181
|
+
|
|
182
|
+
This strategy prunes candidates that appear fewer times than a specified
|
|
183
|
+
minimum frequency, regardless of the dataset size. Useful for datasets
|
|
184
|
+
where you want to ensure patterns appear a minimum number of times.
|
|
185
|
+
|
|
186
|
+
Parameters:
|
|
187
|
+
min_frequency (int): Minimum absolute frequency threshold.
|
|
188
|
+
"""
|
|
189
|
+
|
|
190
|
+
def __init__(self, min_frequency: int):
|
|
191
|
+
"""
|
|
192
|
+
Initialize frequency-based pruning strategy.
|
|
193
|
+
|
|
194
|
+
Parameters:
|
|
195
|
+
min_frequency (int): Minimum number of occurrences required.
|
|
196
|
+
"""
|
|
197
|
+
if min_frequency < 1:
|
|
198
|
+
raise ValueError("min_frequency must be at least 1")
|
|
199
|
+
self.min_frequency = min_frequency
|
|
200
|
+
|
|
201
|
+
@override
|
|
202
|
+
def should_prune(
|
|
203
|
+
self,
|
|
204
|
+
candidate: Tuple[str, ...],
|
|
205
|
+
support_count: int,
|
|
206
|
+
total_transactions: int,
|
|
207
|
+
context: Optional[PruningContext] = None,
|
|
208
|
+
) -> bool:
|
|
209
|
+
"""
|
|
210
|
+
Prune candidates with frequency below the minimum threshold.
|
|
211
|
+
|
|
212
|
+
Parameters:
|
|
213
|
+
candidate: The candidate sequence.
|
|
214
|
+
support_count: Support count (frequency) of the candidate.
|
|
215
|
+
total_transactions: Total number of transactions (unused).
|
|
216
|
+
context: Optional context (unused).
|
|
217
|
+
|
|
218
|
+
Returns:
|
|
219
|
+
bool: True if support_count < min_frequency, False otherwise.
|
|
220
|
+
"""
|
|
221
|
+
return support_count < self.min_frequency
|
|
222
|
+
|
|
223
|
+
@override
|
|
224
|
+
def get_description(self) -> str:
|
|
225
|
+
"""Get description of this pruning strategy."""
|
|
226
|
+
return f"FrequencyBasedPruning(min_frequency={self.min_frequency})"
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
class TemporalAwarePruning(PruningStrategy):
|
|
230
|
+
"""
|
|
231
|
+
Prunes candidates based on temporal constraint feasibility.
|
|
232
|
+
|
|
233
|
+
This strategy can pre-filter candidates that are unlikely to satisfy
|
|
234
|
+
temporal constraints (mingap, maxgap, maxspan) based on pattern structure
|
|
235
|
+
and candidate length.
|
|
236
|
+
|
|
237
|
+
Parameters:
|
|
238
|
+
mingap (Optional[float]): Minimum time gap between consecutive items.
|
|
239
|
+
maxgap (Optional[float]): Maximum time gap between consecutive items.
|
|
240
|
+
maxspan (Optional[float]): Maximum time span from first to last item.
|
|
241
|
+
min_support_fraction (Optional[float]): Additional support threshold.
|
|
242
|
+
"""
|
|
243
|
+
|
|
244
|
+
def __init__(
|
|
245
|
+
self,
|
|
246
|
+
mingap: Optional[float] = None,
|
|
247
|
+
maxgap: Optional[float] = None,
|
|
248
|
+
maxspan: Optional[float] = None,
|
|
249
|
+
min_support_fraction: Optional[float] = None,
|
|
250
|
+
):
|
|
251
|
+
"""
|
|
252
|
+
Initialize temporal-aware pruning strategy.
|
|
253
|
+
|
|
254
|
+
Parameters:
|
|
255
|
+
mingap: Minimum time gap constraint.
|
|
256
|
+
maxgap: Maximum time gap constraint.
|
|
257
|
+
maxspan: Maximum time span constraint.
|
|
258
|
+
min_support_fraction: Additional support threshold.
|
|
259
|
+
"""
|
|
260
|
+
self.mingap = mingap
|
|
261
|
+
self.maxgap = maxgap
|
|
262
|
+
self.maxspan = maxspan
|
|
263
|
+
self.min_support_fraction = min_support_fraction
|
|
264
|
+
|
|
265
|
+
@override
|
|
266
|
+
def should_prune(
|
|
267
|
+
self,
|
|
268
|
+
candidate: Tuple[str, ...],
|
|
269
|
+
support_count: int,
|
|
270
|
+
total_transactions: int,
|
|
271
|
+
context: Optional[PruningContext] = None,
|
|
272
|
+
) -> bool:
|
|
273
|
+
"""
|
|
274
|
+
Prune candidates based on temporal feasibility and support.
|
|
275
|
+
|
|
276
|
+
This method performs two checks:
|
|
277
|
+
1. Support-based pruning (if min_support is specified)
|
|
278
|
+
2. Temporal feasibility check (pattern length vs constraints)
|
|
279
|
+
|
|
280
|
+
Parameters:
|
|
281
|
+
candidate: The candidate sequence.
|
|
282
|
+
support_count: Support count of the candidate.
|
|
283
|
+
total_transactions: Total number of transactions.
|
|
284
|
+
context: Optional context with 'min_support_count' key.
|
|
285
|
+
|
|
286
|
+
Returns:
|
|
287
|
+
bool: True if candidate should be pruned, False otherwise.
|
|
288
|
+
"""
|
|
289
|
+
# First check support threshold if specified
|
|
290
|
+
if self.min_support_fraction is not None:
|
|
291
|
+
min_support_count = int(math.ceil(total_transactions * self.min_support_fraction))
|
|
292
|
+
if support_count < min_support_count:
|
|
293
|
+
return True
|
|
294
|
+
elif context is not None:
|
|
295
|
+
min_support_value = context.get("min_support_count")
|
|
296
|
+
if isinstance(min_support_value, (int, float)):
|
|
297
|
+
if support_count < int(math.ceil(min_support_value)):
|
|
298
|
+
return True
|
|
299
|
+
|
|
300
|
+
# Check temporal feasibility
|
|
301
|
+
# If we have maxspan and mingap, check if pattern length is feasible
|
|
302
|
+
if self.maxspan is not None and self.mingap is not None and len(candidate) > 1:
|
|
303
|
+
# Minimum possible span for this pattern length
|
|
304
|
+
min_possible_span = (len(candidate) - 1) * self.mingap
|
|
305
|
+
if min_possible_span > self.maxspan:
|
|
306
|
+
# Pattern is too long to fit within maxspan given mingap
|
|
307
|
+
return True
|
|
308
|
+
|
|
309
|
+
return False
|
|
310
|
+
|
|
311
|
+
@override
|
|
312
|
+
def get_description(self) -> str:
|
|
313
|
+
"""Get description of this pruning strategy."""
|
|
314
|
+
parts: List[str] = []
|
|
315
|
+
if self.mingap is not None:
|
|
316
|
+
parts.append(f"mingap={self.mingap}")
|
|
317
|
+
if self.maxgap is not None:
|
|
318
|
+
parts.append(f"maxgap={self.maxgap}")
|
|
319
|
+
if self.maxspan is not None:
|
|
320
|
+
parts.append(f"maxspan={self.maxspan}")
|
|
321
|
+
if self.min_support_fraction is not None:
|
|
322
|
+
parts.append(f"min_support={self.min_support_fraction}")
|
|
323
|
+
params = ", ".join(parts) if parts else "no constraints"
|
|
324
|
+
return f"TemporalAwarePruning({params})"
|
|
325
|
+
|
|
326
|
+
|
|
327
|
+
class CombinedPruning(PruningStrategy):
|
|
328
|
+
"""
|
|
329
|
+
Combines multiple pruning strategies using logical OR.
|
|
330
|
+
|
|
331
|
+
A candidate is pruned if ANY of the constituent strategies determines
|
|
332
|
+
it should be pruned. This allows combining different pruning criteria
|
|
333
|
+
for more aggressive filtering.
|
|
334
|
+
|
|
335
|
+
Parameters:
|
|
336
|
+
strategies (List[PruningStrategy]): List of pruning strategies to combine.
|
|
337
|
+
"""
|
|
338
|
+
|
|
339
|
+
def __init__(self, strategies: List[PruningStrategy]):
|
|
340
|
+
"""
|
|
341
|
+
Initialize combined pruning strategy.
|
|
342
|
+
|
|
343
|
+
Parameters:
|
|
344
|
+
strategies: List of pruning strategies to apply.
|
|
345
|
+
"""
|
|
346
|
+
if not strategies:
|
|
347
|
+
raise ValueError("At least one pruning strategy must be provided")
|
|
348
|
+
self.strategies = strategies
|
|
349
|
+
|
|
350
|
+
@override
|
|
351
|
+
def should_prune(
|
|
352
|
+
self,
|
|
353
|
+
candidate: Tuple[str, ...],
|
|
354
|
+
support_count: int,
|
|
355
|
+
total_transactions: int,
|
|
356
|
+
context: Optional[PruningContext] = None,
|
|
357
|
+
) -> bool:
|
|
358
|
+
"""
|
|
359
|
+
Prune candidate if ANY strategy recommends pruning.
|
|
360
|
+
|
|
361
|
+
Parameters:
|
|
362
|
+
candidate: The candidate sequence.
|
|
363
|
+
support_count: Support count of the candidate.
|
|
364
|
+
total_transactions: Total number of transactions.
|
|
365
|
+
context: Optional context for pruning decisions.
|
|
366
|
+
|
|
367
|
+
Returns:
|
|
368
|
+
bool: True if any strategy recommends pruning, False otherwise.
|
|
369
|
+
"""
|
|
370
|
+
for strategy in self.strategies:
|
|
371
|
+
if strategy.should_prune(candidate, support_count, total_transactions, context):
|
|
372
|
+
return True
|
|
373
|
+
return False
|
|
374
|
+
|
|
375
|
+
@override
|
|
376
|
+
def get_description(self) -> str:
|
|
377
|
+
"""Get description of this combined pruning strategy."""
|
|
378
|
+
strategy_descs: List[str] = [s.get_description() for s in self.strategies]
|
|
379
|
+
return f"CombinedPruning([{', '.join(strategy_descs)}])"
|
|
380
|
+
|
|
381
|
+
|
|
382
|
+
def create_default_pruning_strategy(
|
|
383
|
+
min_support_fraction: Optional[float] = None,
|
|
384
|
+
mingap: Optional[float] = None,
|
|
385
|
+
maxgap: Optional[float] = None,
|
|
386
|
+
maxspan: Optional[float] = None,
|
|
387
|
+
) -> PruningStrategy:
|
|
388
|
+
"""
|
|
389
|
+
Create an appropriate default pruning strategy based on parameters.
|
|
390
|
+
|
|
391
|
+
This factory function selects the best pruning strategy based on the
|
|
392
|
+
provided parameters:
|
|
393
|
+
- If temporal constraints are specified, uses TemporalAwarePruning
|
|
394
|
+
- Otherwise, uses standard SupportBasedPruning
|
|
395
|
+
|
|
396
|
+
Parameters:
|
|
397
|
+
min_support_fraction: Minimum support threshold.
|
|
398
|
+
mingap: Minimum time gap constraint.
|
|
399
|
+
maxgap: Maximum time gap constraint.
|
|
400
|
+
maxspan: Maximum time span constraint.
|
|
401
|
+
|
|
402
|
+
Returns:
|
|
403
|
+
PruningStrategy: An appropriate pruning strategy instance.
|
|
404
|
+
"""
|
|
405
|
+
has_temporal = mingap is not None or maxgap is not None or maxspan is not None
|
|
406
|
+
|
|
407
|
+
if has_temporal:
|
|
408
|
+
return TemporalAwarePruning(
|
|
409
|
+
mingap=mingap, maxgap=maxgap, maxspan=maxspan, min_support_fraction=min_support_fraction
|
|
410
|
+
)
|
|
411
|
+
else:
|
|
412
|
+
return SupportBasedPruning(min_support_fraction=min_support_fraction)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: gsppy
|
|
3
|
-
Version: 3.5.0
|
|
3
|
+
Version: 3.6.0
|
|
4
4
|
Summary: GSP (Generalized Sequence Pattern) algorithm in Python
|
|
5
5
|
Project-URL: Homepage, https://github.com/jacksonpradolima/gsp-py
|
|
6
6
|
Author-email: Jackson Antonio do Prado Lima <jacksonpradolima@gmail.com>
|
|
@@ -40,6 +40,7 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
|
40
40
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
41
41
|
Requires-Python: >=3.10
|
|
42
42
|
Requires-Dist: click>=8.0.0
|
|
43
|
+
Requires-Dist: typing-extensions>=4.0.0
|
|
43
44
|
Provides-Extra: dev
|
|
44
45
|
Requires-Dist: cython==3.2.4; extra == 'dev'
|
|
45
46
|
Requires-Dist: hatch==1.16.3; extra == 'dev'
|
|
@@ -705,6 +706,140 @@ result = gsp.search(min_support=0.5)
|
|
|
705
706
|
|
|
706
707
|
---
|
|
707
708
|
|
|
709
|
+
## 🔧 Flexible Candidate Pruning
|
|
710
|
+
|
|
711
|
+
GSP-Py supports **flexible candidate pruning strategies** that allow you to customize how candidate sequences are filtered during pattern mining. This enables optimization for different dataset characteristics and mining requirements.
|
|
712
|
+
|
|
713
|
+
### Built-in Pruning Strategies
|
|
714
|
+
|
|
715
|
+
#### 1. Support-Based Pruning (Default)
|
|
716
|
+
|
|
717
|
+
The standard GSP pruning based on minimum support threshold:
|
|
718
|
+
|
|
719
|
+
```python
|
|
720
|
+
from gsppy.gsp import GSP
|
|
721
|
+
from gsppy.pruning import SupportBasedPruning
|
|
722
|
+
|
|
723
|
+
# Explicit support-based pruning
|
|
724
|
+
pruner = SupportBasedPruning(min_support_fraction=0.3)
|
|
725
|
+
gsp = GSP(transactions, pruning_strategy=pruner)
|
|
726
|
+
result = gsp.search(min_support=0.3)
|
|
727
|
+
```
|
|
728
|
+
|
|
729
|
+
#### 2. Frequency-Based Pruning
|
|
730
|
+
|
|
731
|
+
Prunes candidates based on absolute frequency (minimum number of occurrences):
|
|
732
|
+
|
|
733
|
+
```python
|
|
734
|
+
from gsppy.pruning import FrequencyBasedPruning
|
|
735
|
+
|
|
736
|
+
# Require patterns to appear at least 5 times
|
|
737
|
+
pruner = FrequencyBasedPruning(min_frequency=5)
|
|
738
|
+
gsp = GSP(transactions, pruning_strategy=pruner)
|
|
739
|
+
result = gsp.search(min_support=0.2)
|
|
740
|
+
```
|
|
741
|
+
|
|
742
|
+
**Use case**: When you need patterns to occur a minimum absolute number of times, regardless of dataset size.
|
|
743
|
+
|
|
744
|
+
#### 3. Temporal-Aware Pruning
|
|
745
|
+
|
|
746
|
+
Optimizes pruning for time-constrained pattern mining by pre-filtering infeasible patterns:
|
|
747
|
+
|
|
748
|
+
```python
|
|
749
|
+
from gsppy.pruning import TemporalAwarePruning
|
|
750
|
+
|
|
751
|
+
# Prune patterns that cannot satisfy temporal constraints
|
|
752
|
+
pruner = TemporalAwarePruning(
|
|
753
|
+
mingap=1,
|
|
754
|
+
maxgap=5,
|
|
755
|
+
maxspan=10,
|
|
756
|
+
min_support_fraction=0.3
|
|
757
|
+
)
|
|
758
|
+
gsp = GSP(timestamped_transactions, mingap=1, maxgap=5, maxspan=10, pruning_strategy=pruner)
|
|
759
|
+
result = gsp.search(min_support=0.3)
|
|
760
|
+
```
|
|
761
|
+
|
|
762
|
+
**Use case**: Improves performance for temporal pattern mining by eliminating patterns that cannot satisfy temporal constraints.
|
|
763
|
+
|
|
764
|
+
#### 4. Combined Pruning
|
|
765
|
+
|
|
766
|
+
Combines multiple pruning strategies for aggressive filtering:
|
|
767
|
+
|
|
768
|
+
```python
|
|
769
|
+
from gsppy.pruning import CombinedPruning, SupportBasedPruning, FrequencyBasedPruning
|
|
770
|
+
|
|
771
|
+
# Apply both support and frequency constraints
|
|
772
|
+
strategies = [
|
|
773
|
+
SupportBasedPruning(min_support_fraction=0.3),
|
|
774
|
+
FrequencyBasedPruning(min_frequency=5)
|
|
775
|
+
]
|
|
776
|
+
pruner = CombinedPruning(strategies)
|
|
777
|
+
gsp = GSP(transactions, pruning_strategy=pruner)
|
|
778
|
+
result = gsp.search(min_support=0.3)
|
|
779
|
+
```
|
|
780
|
+
|
|
781
|
+
**Use case**: When you want to combine multiple filtering criteria for more selective pattern discovery.
|
|
782
|
+
|
|
783
|
+
### Custom Pruning Strategies
|
|
784
|
+
|
|
785
|
+
You can create custom pruning strategies by implementing the `PruningStrategy` interface:
|
|
786
|
+
|
|
787
|
+
```python
|
|
788
|
+
from gsppy.pruning import PruningStrategy
|
|
789
|
+
from typing import Dict, Optional, Tuple
|
|
790
|
+
|
|
791
|
+
class MyCustomPruner(PruningStrategy):
|
|
792
|
+
def should_prune(
|
|
793
|
+
self,
|
|
794
|
+
candidate: Tuple[str, ...],
|
|
795
|
+
support_count: int,
|
|
796
|
+
total_transactions: int,
|
|
797
|
+
context: Optional[Dict] = None
|
|
798
|
+
) -> bool:
|
|
799
|
+
# Custom pruning logic
|
|
800
|
+
# Return True to prune (filter out), False to keep
|
|
801
|
+
pattern_length = len(candidate)
|
|
802
|
+
# Example: Prune very long patterns with low support
|
|
803
|
+
if pattern_length > 5 and support_count < 10:
|
|
804
|
+
return True
|
|
805
|
+
return False
|
|
806
|
+
|
|
807
|
+
# Use your custom pruner
|
|
808
|
+
custom_pruner = MyCustomPruner()
|
|
809
|
+
gsp = GSP(transactions, pruning_strategy=custom_pruner)
|
|
810
|
+
result = gsp.search(min_support=0.2)
|
|
811
|
+
```
|
|
812
|
+
|
|
813
|
+
### Performance Characteristics
|
|
814
|
+
|
|
815
|
+
Different pruning strategies have different performance tradeoffs:
|
|
816
|
+
|
|
817
|
+
| Strategy | Pruning Aggressiveness | Use Case | Performance Impact |
|
|
818
|
+
|----------|----------------------|----------|-------------------|
|
|
819
|
+
| **SupportBased** | Moderate | General-purpose mining | Baseline performance |
|
|
820
|
+
| **FrequencyBased** | High (for large datasets) | Require absolute frequency | Faster on large datasets |
|
|
821
|
+
| **TemporalAware** | High (for temporal data) | Time-constrained patterns | Significant speedup for temporal mining |
|
|
822
|
+
| **Combined** | Very High | Selective pattern discovery | Fastest, but may miss edge cases |
|
|
823
|
+
|
|
824
|
+
### Benchmarking Pruning Strategies
|
|
825
|
+
|
|
826
|
+
To compare pruning strategies on your dataset:
|
|
827
|
+
|
|
828
|
+
```bash
|
|
829
|
+
# Compare all strategies
|
|
830
|
+
python benchmarks/bench_pruning.py --n_tx 1000 --vocab 100 --min_support 0.2 --strategy all
|
|
831
|
+
|
|
832
|
+
# Benchmark a specific strategy
|
|
833
|
+
python benchmarks/bench_pruning.py --n_tx 1000 --vocab 100 --min_support 0.2 --strategy frequency
|
|
834
|
+
|
|
835
|
+
# Run multiple rounds for averaging
|
|
836
|
+
python benchmarks/bench_pruning.py --n_tx 1000 --vocab 100 --min_support 0.2 --strategy all --rounds 3
|
|
837
|
+
```
|
|
838
|
+
|
|
839
|
+
See `benchmarks/bench_pruning.py` for the complete benchmarking script.
|
|
840
|
+
|
|
841
|
+
---
|
|
842
|
+
|
|
708
843
|
## ⌨️ Typing
|
|
709
844
|
|
|
710
845
|
`gsppy` ships inline type information (PEP 561) via a bundled `py.typed` marker. The public API is re-exported from
|
|
@@ -718,10 +853,7 @@ larger applications.
|
|
|
718
853
|
|
|
719
854
|
We are actively working to improve GSP-Py. Here are some exciting features planned for future releases:
|
|
720
855
|
|
|
721
|
-
1. **Custom Filters for Candidate Pruning**:
|
|
722
|
-
- Enable users to define their own pruning logic during the mining process.
|
|
723
|
-
|
|
724
|
-
2. **Support for Preprocessing and Postprocessing**:
|
|
856
|
+
1. **Support for Preprocessing and Postprocessing**:
|
|
725
857
|
- Add hooks to allow users to transform datasets before mining and customize the output results.
|
|
726
858
|
|
|
727
859
|
Want to contribute or suggest an
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
gsppy/__init__.py,sha256=CJqssfftIIhjzXijnjLKwvIA4Cfr0CaykQkCWaD-q80,1161
|
|
2
|
+
gsppy/accelerate.py,sha256=rDho3ysADETpuhT2SF9voBjd3XRaQUzuA5k_baNACF8,11020
|
|
3
|
+
gsppy/cli.py,sha256=-viXa8VFIF-QvrHYy1vtDxtMm50sM_tZq5B5DMZ1Jtw,12516
|
|
4
|
+
gsppy/gsp.py,sha256=grDKfnC8rshvDH3xG-HQ2JSWsDZl3qbhyEt6FFlQeeI,27135
|
|
5
|
+
gsppy/pruning.py,sha256=hOoQoH1k_gzACBy6qr_cvwth9WDmKuLmJyVRDbHjFFM,14779
|
|
6
|
+
gsppy/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
7
|
+
gsppy/utils.py,sha256=dAEq1hEZMN0ZjoocKs_ZIgOI9j_Y6rJEAKneul3zNRo,13501
|
|
8
|
+
gsppy-3.6.0.dist-info/METADATA,sha256=8gBox1RTiigMmzTUBldVsOXc2S8ykI-J-sUC0az-RWM,34082
|
|
9
|
+
gsppy-3.6.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
10
|
+
gsppy-3.6.0.dist-info/entry_points.txt,sha256=smvmcIWk424ARIGKOC_BM42hpT_SptKPcIeqs-8u8lM,41
|
|
11
|
+
gsppy-3.6.0.dist-info/licenses/LICENSE,sha256=AlXanKSqFzo_o-87gp3Qw3XzbmnfxYy7O0xJOcQGWJo,1086
|
|
12
|
+
gsppy-3.6.0.dist-info/RECORD,,
|
gsppy-3.5.0.dist-info/RECORD
DELETED
|
@@ -1,11 +0,0 @@
|
|
|
1
|
-
gsppy/__init__.py,sha256=NMVa-ZWT449wuxZMF9Ym7p-DChOxOibaaqlpPxksfuo,805
|
|
2
|
-
gsppy/accelerate.py,sha256=rDho3ysADETpuhT2SF9voBjd3XRaQUzuA5k_baNACF8,11020
|
|
3
|
-
gsppy/cli.py,sha256=-viXa8VFIF-QvrHYy1vtDxtMm50sM_tZq5B5DMZ1Jtw,12516
|
|
4
|
-
gsppy/gsp.py,sha256=k72pvdmD6jU4AId2rrHQrJ4FBUgtkuC0ntEY8QHGi5c,24486
|
|
5
|
-
gsppy/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
|
-
gsppy/utils.py,sha256=dAEq1hEZMN0ZjoocKs_ZIgOI9j_Y6rJEAKneul3zNRo,13501
|
|
7
|
-
gsppy-3.5.0.dist-info/METADATA,sha256=ix2X_VEUTved_DaTsSJMERT-CZ34TUYF0XMC2KeNeuE,29747
|
|
8
|
-
gsppy-3.5.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
9
|
-
gsppy-3.5.0.dist-info/entry_points.txt,sha256=smvmcIWk424ARIGKOC_BM42hpT_SptKPcIeqs-8u8lM,41
|
|
10
|
-
gsppy-3.5.0.dist-info/licenses/LICENSE,sha256=AlXanKSqFzo_o-87gp3Qw3XzbmnfxYy7O0xJOcQGWJo,1086
|
|
11
|
-
gsppy-3.5.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|