gsppy 3.5.0__py3-none-any.whl → 3.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gsppy/__init__.py CHANGED
@@ -13,6 +13,14 @@ from gsppy.cli import (
13
13
  read_transactions_from_json,
14
14
  )
15
15
  from gsppy.gsp import GSP
16
+ from gsppy.pruning import (
17
+ PruningStrategy,
18
+ SupportBasedPruning,
19
+ FrequencyBasedPruning,
20
+ TemporalAwarePruning,
21
+ CombinedPruning,
22
+ create_default_pruning_strategy,
23
+ )
16
24
 
17
25
  try:
18
26
  __version__ = importlib_metadata.version("gsppy")
@@ -26,4 +34,10 @@ __all__ = [
26
34
  "read_transactions_from_json",
27
35
  "setup_logging",
28
36
  "__version__",
37
+ "PruningStrategy",
38
+ "SupportBasedPruning",
39
+ "FrequencyBasedPruning",
40
+ "TemporalAwarePruning",
41
+ "CombinedPruning",
42
+ "create_default_pruning_strategy",
29
43
  ]
gsppy/gsp.py CHANGED
@@ -99,6 +99,7 @@ from gsppy.utils import (
99
99
  generate_candidates_from_previous,
100
100
  is_subsequence_in_list_with_time_constraints,
101
101
  )
102
+ from gsppy.pruning import PruningStrategy, create_default_pruning_strategy
102
103
  from gsppy.accelerate import support_counts as support_counts_accel
103
104
 
104
105
  logger: logging.Logger = logging.getLogger(__name__)
@@ -130,6 +131,7 @@ class GSP:
130
131
  maxgap: Optional[float] = None,
131
132
  maxspan: Optional[float] = None,
132
133
  verbose: bool = False,
134
+ pruning_strategy: Optional[PruningStrategy] = None,
133
135
  ):
134
136
  """
135
137
  Initialize the GSP algorithm with raw transactional data.
@@ -144,6 +146,9 @@ class GSP:
144
146
  maxspan (Optional[float]): Maximum time span from first to last item in patterns.
145
147
  verbose (bool): Enable verbose logging output with detailed progress information.
146
148
  Default is False (minimal output).
149
+ pruning_strategy (Optional[PruningStrategy]): Custom pruning strategy for candidate filtering.
150
+ If None, a default strategy is created based on
151
+ temporal constraints.
147
152
 
148
153
  Attributes Initialized:
149
154
  - Processes the input raw transaction dataset.
@@ -162,9 +167,18 @@ class GSP:
162
167
  self.maxgap = maxgap
163
168
  self.maxspan = maxspan
164
169
  self.verbose = verbose
170
+ self.pruning_strategy: PruningStrategy
165
171
  self._configure_logging()
166
172
  self._validate_temporal_constraints()
167
173
  self._pre_processing(raw_transactions)
174
+ # Initialize default pruning strategy if none provided
175
+ if pruning_strategy is None:
176
+ self.pruning_strategy = create_default_pruning_strategy(
177
+ mingap=self.mingap, maxgap=self.maxgap, maxspan=self.maxspan
178
+ )
179
+ logger.debug("Using default pruning strategy: %s", self.pruning_strategy.get_description())
180
+ else:
181
+ self.pruning_strategy = pruning_strategy
168
182
 
169
183
  def _configure_logging(self) -> None:
170
184
  """
@@ -389,6 +403,39 @@ class GSP:
389
403
  # Fallback to Python implementation on any acceleration failure
390
404
  return self._support_python(items, min_support, batch_size)
391
405
 
406
def _apply_pruning(
    self, freq_patterns: Dict[Tuple[str, ...], int], min_support_count: int
) -> Dict[Tuple[str, ...], int]:
    """
    Filter already-frequent patterns through the configured pruning strategy.

    Every pattern handed in is offered to ``self.pruning_strategy``; the ones
    the strategy does not reject are returned together with their original
    support counts. The strategy receives the number of loaded transactions
    and a context mapping that carries the absolute support threshold under
    the ``"min_support_count"`` key.

    Parameters:
        freq_patterns (Dict[Tuple[str, ...], int]): Patterns mapped to their support counts.
        min_support_count (int): Absolute minimum support count threshold.

    Returns:
        Dict[Tuple[str, ...], int]: The patterns that survived pruning. An empty
                                    input is returned as-is (same object).
    """
    # Nothing to evaluate: hand the empty mapping straight back.
    if not freq_patterns:
        return freq_patterns

    context = {"min_support_count": min_support_count}

    # Keep a pattern only when the strategy does NOT ask for it to be pruned.
    survivors: Dict[Tuple[str, ...], int] = {
        pattern: count
        for pattern, count in freq_patterns.items()
        if not self.pruning_strategy.should_prune(pattern, count, len(self.transactions), context)
    }

    dropped = len(freq_patterns) - len(survivors)
    if dropped > 0:
        logger.debug("Pruning strategy filtered out %d additional candidates", dropped)

    return survivors
438
+
392
439
  def _print_status(self, run: int, candidates: List[Tuple[str, ...]]) -> None:
393
440
  """
394
441
  Log progress information for the current GSP iteration.
@@ -504,7 +551,10 @@ class GSP:
504
551
 
505
552
  # scan transactions to collect support count for each candidate
506
553
  # sequence & filter
507
- self.freq_patterns.append(self._support(candidates, abs_min_support, backend=backend))
554
+ freq_1 = self._support(candidates, abs_min_support, backend=backend)
555
+ # Apply pruning strategy for additional filtering
556
+ freq_1 = self._apply_pruning(freq_1, abs_min_support)
557
+ self.freq_patterns.append(freq_1)
508
558
 
509
559
  # (k-itemsets/k-sequence = 1)
510
560
  k_items = 1
@@ -525,7 +575,10 @@ class GSP:
525
575
 
526
576
  # candidate pruning - eliminates candidates who are not potentially
527
577
  # frequent (using support as threshold)
528
- self.freq_patterns.append(self._support(candidates, abs_min_support, backend=backend))
578
+ freq_k = self._support(candidates, abs_min_support, backend=backend)
579
+ # Apply pruning strategy for additional filtering
580
+ freq_k = self._apply_pruning(freq_k, abs_min_support)
581
+ self.freq_patterns.append(freq_k)
529
582
 
530
583
  self._print_status(k_items, candidates)
531
584
  logger.info("GSP algorithm completed.")
gsppy/pruning.py ADDED
@@ -0,0 +1,412 @@
1
+ """
2
+ Flexible candidate pruning strategies for the GSP algorithm.
3
+
4
+ This module provides a pluggable pruning system that allows different strategies
5
+ for filtering candidate sequences during pattern mining. The pruning strategies
6
+ can significantly impact performance and pattern discovery based on dataset
7
+ characteristics and mining requirements.
8
+
9
+ Key Features:
10
+ -------------
11
+ 1. **Abstract Pruning Strategy Interface**:
12
+ - Defines a common interface for all pruning strategies.
13
+ - Allows custom pruning logic to be easily integrated.
14
+
15
+ 2. **Built-in Pruning Strategies**:
16
+ - **SupportBasedPruning**: Standard GSP pruning based on minimum support threshold.
17
+ - **FrequencyBasedPruning**: Prunes candidates with low absolute frequency.
18
+ - **TemporalAwarePruning**: Prunes candidates that violate temporal constraints.
19
+ - **CombinedPruning**: Combines multiple pruning strategies.
20
+
21
+ 3. **Performance Optimization**:
22
+ - Early termination of candidate generation when patterns cannot be extended.
23
+ - Reduces memory footprint by eliminating non-promising candidates early.
24
+
25
+ Example Usage:
26
+ --------------
27
+ ```python
28
+ from gsppy.gsp import GSP
29
+ from gsppy.pruning import SupportBasedPruning, FrequencyBasedPruning, CombinedPruning
30
+
31
+ # Use default support-based pruning
32
+ gsp = GSP(transactions)
33
+ patterns = gsp.search(min_support=0.3)
34
+
35
+ # Use frequency-based pruning with a minimum frequency threshold
36
+ pruner = FrequencyBasedPruning(min_frequency=5)
37
+ gsp = GSP(transactions, pruning_strategy=pruner)
38
+ patterns = gsp.search(min_support=0.3)
39
+
40
+ # Combine multiple pruning strategies
41
+ combined = CombinedPruning([SupportBasedPruning(), FrequencyBasedPruning(min_frequency=3)])
42
+ gsp = GSP(transactions, pruning_strategy=combined)
43
+ patterns = gsp.search(min_support=0.3)
44
+ ```
45
+
46
+ Author:
47
+ -------
48
+ - **Developed by:** Jackson Antonio do Prado Lima
49
+ - **Email:** jacksonpradolima@gmail.com
50
+
51
+ License:
52
+ --------
53
+ This implementation is distributed under the MIT License.
54
+ """
55
+
56
+ import math
57
+ from abc import ABC, abstractmethod
58
+ from typing import List, Tuple, Mapping, Optional
59
+ from typing_extensions import override
60
+
61
+ PruningContext = Mapping[str, object]
62
+
63
+
64
class PruningStrategy(ABC):
    """
    Common interface for candidate pruning strategies.

    A strategy answers a single question for each candidate sequence:
    should it be filtered out? Concrete strategies subclass this class and
    implement `should_prune`; `get_description` may be overridden to give a
    more informative label than the bare class name.
    """

    @abstractmethod
    def should_prune(
        self,
        candidate: Tuple[str, ...],
        support_count: int,
        total_transactions: int,
        context: Optional[PruningContext] = None,
    ) -> bool:
        """
        Decide whether a candidate sequence must be discarded.

        Parameters:
            candidate (Tuple[str, ...]): The candidate sequence to evaluate.
            support_count (int): The support count of the candidate in the dataset.
            total_transactions (int): Total number of transactions in the dataset.
            context (Optional[Mapping[str, object]]): Extra read-only information for the
                                                      decision, e.g. a 'min_support_count' entry.

        Returns:
            bool: True if the candidate should be pruned (filtered out), False otherwise.
        """

    def get_description(self) -> str:
        """
        Return a human-readable label for this strategy.

        Returns:
            str: The name of the concrete strategy class.
        """
        return type(self).__name__
105
+
106
+
107
class SupportBasedPruning(PruningStrategy):
    """
    Standard GSP pruning based on minimum support threshold.

    Candidates are pruned when their support count is below the minimum
    support threshold. The threshold is resolved in this order:

    1. ``min_support_fraction`` given to the constructor (scaled by the
       number of transactions),
    2. the ``'min_support_count'`` entry of the ``context`` mapping,
    3. otherwise no threshold is known and nothing is pruned.

    Parameters:
        min_support_fraction (Optional[float]): Minimum support as a fraction (0.0, 1.0].
                                                If None, uses the value from search parameters.
    """

    # Relative tolerance used to recognise a product that is an integer up to
    # floating-point round-off.
    _REL_TOL = 1e-12

    def __init__(self, min_support_fraction: Optional[float] = None):
        """
        Initialize support-based pruning strategy.

        Parameters:
            min_support_fraction (Optional[float]): Minimum support threshold.
                                                    If None, uses the value from search.
        """
        self.min_support_fraction = min_support_fraction

    @classmethod
    def _required_count(cls, total_transactions: int, fraction: float) -> int:
        """
        Convert a support fraction into an absolute transaction count.

        A bare ``ceil(total * fraction)`` over-counts when the product is an
        integer only up to binary round-off (100 * 0.07 evaluates to
        7.000000000000001, which would be rounded up to 8). The product is
        therefore snapped to the nearest integer when it lies within float
        noise of it, and rounded up otherwise.

        Parameters:
            total_transactions: Total number of transactions.
            fraction: Minimum support expressed as a fraction.

        Returns:
            int: Smallest support count that satisfies the fraction.
        """
        product = total_transactions * fraction
        nearest = round(product)
        if math.isclose(product, nearest, rel_tol=cls._REL_TOL):
            return int(nearest)
        return int(math.ceil(product))

    @override
    def should_prune(
        self,
        candidate: Tuple[str, ...],
        support_count: int,
        total_transactions: int,
        context: Optional[PruningContext] = None,
    ) -> bool:
        """
        Prune candidates below the minimum support threshold.

        Parameters:
            candidate: The candidate sequence.
            support_count: Support count of the candidate.
            total_transactions: Total number of transactions.
            context: Optional context with 'min_support_count' key.

        Returns:
            bool: True if support_count < min_support_count, False otherwise
                  (including when no usable threshold is available).
        """
        min_support_count: int

        # A fraction configured on the strategy takes priority over the context.
        if self.min_support_fraction is not None:
            min_support_count = self._required_count(total_transactions, self.min_support_fraction)
        elif context is not None:
            min_support_value = context.get("min_support_count")
            if isinstance(min_support_value, int):
                min_support_count = min_support_value
            elif isinstance(min_support_value, float):
                min_support_count = int(math.ceil(min_support_value))
            else:
                # Context does not provide a usable threshold
                return False
        else:
            # If no threshold specified, don't prune
            return False

        return support_count < min_support_count

    @override
    def get_description(self) -> str:
        """Get description of this pruning strategy."""
        if self.min_support_fraction is not None:
            return f"SupportBasedPruning(min_support={self.min_support_fraction})"
        return "SupportBasedPruning(dynamic)"
176
+
177
+
178
class FrequencyBasedPruning(PruningStrategy):
    """
    Pruning driven by an absolute occurrence count.

    A candidate is discarded when its support count is lower than a fixed
    minimum frequency. The dataset size plays no role in the decision, which
    makes the strategy suitable when patterns must occur at least a given
    number of times.

    Parameters:
        min_frequency (int): Minimum absolute frequency threshold.
    """

    def __init__(self, min_frequency: int):
        """
        Store the minimum frequency after validating it.

        Parameters:
            min_frequency (int): Minimum number of occurrences required.

        Raises:
            ValueError: If min_frequency is smaller than 1.
        """
        if min_frequency < 1:
            raise ValueError("min_frequency must be at least 1")
        self.min_frequency = min_frequency

    @override
    def should_prune(
        self,
        candidate: Tuple[str, ...],
        support_count: int,
        total_transactions: int,
        context: Optional[PruningContext] = None,
    ) -> bool:
        """
        Report whether the candidate occurs too rarely.

        Parameters:
            candidate: The candidate sequence (unused).
            support_count: Support count (frequency) of the candidate.
            total_transactions: Total number of transactions (unused).
            context: Optional context (unused).

        Returns:
            bool: True if support_count < min_frequency, False otherwise.
        """
        threshold = self.min_frequency
        return threshold > support_count

    @override
    def get_description(self) -> str:
        """Return the strategy name together with its frequency threshold."""
        return f"FrequencyBasedPruning(min_frequency={self.min_frequency})"
227
+
228
+
229
class TemporalAwarePruning(PruningStrategy):
    """
    Prunes candidates based on support and temporal constraint feasibility.

    Besides the usual support check, a candidate is pruned when its length
    makes it impossible to satisfy both ``mingap`` and ``maxspan``: a pattern
    of ``k`` items needs a span of at least ``(k - 1) * mingap``.

    ``maxgap`` is stored and reported by `get_description`, but it does not
    take part in the pruning decision.

    Parameters:
        mingap (Optional[float]): Minimum time gap between consecutive items.
        maxgap (Optional[float]): Maximum time gap between consecutive items.
        maxspan (Optional[float]): Maximum time span from first to last item.
        min_support_fraction (Optional[float]): Additional support threshold.
    """

    # Relative tolerance used to recognise a product that is an integer up to
    # floating-point round-off.
    _REL_TOL = 1e-12

    def __init__(
        self,
        mingap: Optional[float] = None,
        maxgap: Optional[float] = None,
        maxspan: Optional[float] = None,
        min_support_fraction: Optional[float] = None,
    ):
        """
        Initialize temporal-aware pruning strategy.

        Parameters:
            mingap: Minimum time gap constraint.
            maxgap: Maximum time gap constraint.
            maxspan: Maximum time span constraint.
            min_support_fraction: Additional support threshold.
        """
        self.mingap = mingap
        self.maxgap = maxgap
        self.maxspan = maxspan
        self.min_support_fraction = min_support_fraction

    @classmethod
    def _required_count(cls, total_transactions: int, fraction: float) -> int:
        """
        Convert a support fraction into an absolute transaction count.

        A bare ``ceil(total * fraction)`` over-counts when the product is an
        integer only up to binary round-off (100 * 0.07 evaluates to
        7.000000000000001, which would be rounded up to 8). The product is
        therefore snapped to the nearest integer when it lies within float
        noise of it, and rounded up otherwise.

        Parameters:
            total_transactions: Total number of transactions.
            fraction: Minimum support expressed as a fraction.

        Returns:
            int: Smallest support count that satisfies the fraction.
        """
        product = total_transactions * fraction
        nearest = round(product)
        if math.isclose(product, nearest, rel_tol=cls._REL_TOL):
            return int(nearest)
        return int(math.ceil(product))

    @override
    def should_prune(
        self,
        candidate: Tuple[str, ...],
        support_count: int,
        total_transactions: int,
        context: Optional[PruningContext] = None,
    ) -> bool:
        """
        Prune candidates based on temporal feasibility and support.

        This method performs two checks:
        1. Support-based pruning, using ``min_support_fraction`` when set and
           otherwise a numeric 'min_support_count' entry of ``context``.
        2. Temporal feasibility check (pattern length vs. mingap/maxspan).

        Parameters:
            candidate: The candidate sequence.
            support_count: Support count of the candidate.
            total_transactions: Total number of transactions.
            context: Optional context with 'min_support_count' key.

        Returns:
            bool: True if candidate should be pruned, False otherwise.
        """
        # Support check: the strategy's own fraction wins over the context value.
        if self.min_support_fraction is not None:
            if support_count < self._required_count(total_transactions, self.min_support_fraction):
                return True
        elif context is not None:
            min_support_value = context.get("min_support_count")
            if isinstance(min_support_value, (int, float)):
                if support_count < int(math.ceil(min_support_value)):
                    return True

        # Temporal feasibility: only decidable when both bounds are known and
        # the pattern has at least one gap.
        if self.maxspan is not None and self.mingap is not None and len(candidate) > 1:
            # Minimum possible span for this pattern length
            min_possible_span = (len(candidate) - 1) * self.mingap
            if min_possible_span > self.maxspan:
                # Pattern is too long to fit within maxspan given mingap
                return True

        return False

    @override
    def get_description(self) -> str:
        """Get description of this pruning strategy, listing only the constraints that are set."""
        parts: List[str] = []
        if self.mingap is not None:
            parts.append(f"mingap={self.mingap}")
        if self.maxgap is not None:
            parts.append(f"maxgap={self.maxgap}")
        if self.maxspan is not None:
            parts.append(f"maxspan={self.maxspan}")
        if self.min_support_fraction is not None:
            parts.append(f"min_support={self.min_support_fraction}")
        params = ", ".join(parts) if parts else "no constraints"
        return f"TemporalAwarePruning({params})"
325
+
326
+
327
class CombinedPruning(PruningStrategy):
    """
    Combines multiple pruning strategies using logical OR.

    A candidate is pruned if ANY of the constituent strategies determines
    it should be pruned. This allows combining different pruning criteria
    for more aggressive filtering.

    Parameters:
        strategies (List[PruningStrategy]): List of pruning strategies to combine.
    """

    def __init__(self, strategies: List[PruningStrategy]):
        """
        Initialize combined pruning strategy.

        Parameters:
            strategies: Pruning strategies to apply, evaluated in the given order.

        Raises:
            ValueError: If no strategy is provided.
        """
        # Take a private copy: aliasing the caller's list would let a later
        # mutation (e.g. clearing it) bypass the non-empty validation below
        # and silently turn this strategy into a no-op.
        owned: List[PruningStrategy] = list(strategies) if strategies is not None else []
        if not owned:
            raise ValueError("At least one pruning strategy must be provided")
        self.strategies = owned

    @override
    def should_prune(
        self,
        candidate: Tuple[str, ...],
        support_count: int,
        total_transactions: int,
        context: Optional[PruningContext] = None,
    ) -> bool:
        """
        Prune candidate if ANY strategy recommends pruning.

        Strategies are consulted in order and evaluation stops at the first
        one that recommends pruning.

        Parameters:
            candidate: The candidate sequence.
            support_count: Support count of the candidate.
            total_transactions: Total number of transactions.
            context: Optional context for pruning decisions.

        Returns:
            bool: True if any strategy recommends pruning, False otherwise.
        """
        return any(
            strategy.should_prune(candidate, support_count, total_transactions, context)
            for strategy in self.strategies
        )

    @override
    def get_description(self) -> str:
        """Get description of this combined pruning strategy."""
        strategy_descs: List[str] = [s.get_description() for s in self.strategies]
        return f"CombinedPruning([{', '.join(strategy_descs)}])"
380
+
381
+
382
def create_default_pruning_strategy(
    min_support_fraction: Optional[float] = None,
    mingap: Optional[float] = None,
    maxgap: Optional[float] = None,
    maxspan: Optional[float] = None,
) -> PruningStrategy:
    """
    Build the default pruning strategy for the given parameters.

    Selection rule:
    - When at least one temporal constraint (mingap, maxgap, maxspan) is
      given, a TemporalAwarePruning carrying all of them is returned.
    - When none is given, a plain SupportBasedPruning is returned.

    Parameters:
        min_support_fraction: Minimum support threshold, forwarded to the chosen strategy.
        mingap: Minimum time gap constraint.
        maxgap: Maximum time gap constraint.
        maxspan: Maximum time span constraint.

    Returns:
        PruningStrategy: An appropriate pruning strategy instance.
    """
    temporal_constraints = (mingap, maxgap, maxspan)

    # No temporal constraint at all: classic support-only pruning.
    if all(constraint is None for constraint in temporal_constraints):
        return SupportBasedPruning(min_support_fraction=min_support_fraction)

    return TemporalAwarePruning(
        mingap=mingap,
        maxgap=maxgap,
        maxspan=maxspan,
        min_support_fraction=min_support_fraction,
    )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gsppy
3
- Version: 3.5.0
3
+ Version: 3.6.0
4
4
  Summary: GSP (Generalized Sequence Pattern) algorithm in Python
5
5
  Project-URL: Homepage, https://github.com/jacksonpradolima/gsp-py
6
6
  Author-email: Jackson Antonio do Prado Lima <jacksonpradolima@gmail.com>
@@ -40,6 +40,7 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
40
40
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
41
41
  Requires-Python: >=3.10
42
42
  Requires-Dist: click>=8.0.0
43
+ Requires-Dist: typing-extensions>=4.0.0
43
44
  Provides-Extra: dev
44
45
  Requires-Dist: cython==3.2.4; extra == 'dev'
45
46
  Requires-Dist: hatch==1.16.3; extra == 'dev'
@@ -705,6 +706,140 @@ result = gsp.search(min_support=0.5)
705
706
 
706
707
  ---
707
708
 
709
+ ## 🔧 Flexible Candidate Pruning
710
+
711
+ GSP-Py supports **flexible candidate pruning strategies** that allow you to customize how candidate sequences are filtered during pattern mining. This enables optimization for different dataset characteristics and mining requirements.
712
+
713
+ ### Built-in Pruning Strategies
714
+
715
+ #### 1. Support-Based Pruning (Default)
716
+
717
+ The standard GSP pruning based on minimum support threshold:
718
+
719
+ ```python
720
+ from gsppy.gsp import GSP
721
+ from gsppy.pruning import SupportBasedPruning
722
+
723
+ # Explicit support-based pruning
724
+ pruner = SupportBasedPruning(min_support_fraction=0.3)
725
+ gsp = GSP(transactions, pruning_strategy=pruner)
726
+ result = gsp.search(min_support=0.3)
727
+ ```
728
+
729
+ #### 2. Frequency-Based Pruning
730
+
731
+ Prunes candidates based on absolute frequency (minimum number of occurrences):
732
+
733
+ ```python
734
+ from gsppy.pruning import FrequencyBasedPruning
735
+
736
+ # Require patterns to appear at least 5 times
737
+ pruner = FrequencyBasedPruning(min_frequency=5)
738
+ gsp = GSP(transactions, pruning_strategy=pruner)
739
+ result = gsp.search(min_support=0.2)
740
+ ```
741
+
742
+ **Use case**: When you need patterns to occur a minimum absolute number of times, regardless of dataset size.
743
+
744
+ #### 3. Temporal-Aware Pruning
745
+
746
+ Optimizes pruning for time-constrained pattern mining by pre-filtering infeasible patterns:
747
+
748
+ ```python
749
+ from gsppy.pruning import TemporalAwarePruning
750
+
751
+ # Prune patterns that cannot satisfy temporal constraints
752
+ pruner = TemporalAwarePruning(
753
+ mingap=1,
754
+ maxgap=5,
755
+ maxspan=10,
756
+ min_support_fraction=0.3
757
+ )
758
+ gsp = GSP(timestamped_transactions, mingap=1, maxgap=5, maxspan=10, pruning_strategy=pruner)
759
+ result = gsp.search(min_support=0.3)
760
+ ```
761
+
762
+ **Use case**: Improves performance for temporal pattern mining by eliminating patterns that cannot satisfy temporal constraints.
763
+
764
+ #### 4. Combined Pruning
765
+
766
+ Combines multiple pruning strategies for aggressive filtering:
767
+
768
+ ```python
769
+ from gsppy.pruning import CombinedPruning, SupportBasedPruning, FrequencyBasedPruning
770
+
771
+ # Apply both support and frequency constraints
772
+ strategies = [
773
+ SupportBasedPruning(min_support_fraction=0.3),
774
+ FrequencyBasedPruning(min_frequency=5)
775
+ ]
776
+ pruner = CombinedPruning(strategies)
777
+ gsp = GSP(transactions, pruning_strategy=pruner)
778
+ result = gsp.search(min_support=0.3)
779
+ ```
780
+
781
+ **Use case**: When you want to combine multiple filtering criteria for more selective pattern discovery.
782
+
783
+ ### Custom Pruning Strategies
784
+
785
+ You can create custom pruning strategies by implementing the `PruningStrategy` interface:
786
+
787
+ ```python
788
+ from gsppy.pruning import PruningStrategy
789
+ from typing import Dict, Optional, Tuple
790
+
791
+ class MyCustomPruner(PruningStrategy):
792
+ def should_prune(
793
+ self,
794
+ candidate: Tuple[str, ...],
795
+ support_count: int,
796
+ total_transactions: int,
797
+ context: Optional[Dict] = None
798
+ ) -> bool:
799
+ # Custom pruning logic
800
+ # Return True to prune (filter out), False to keep
801
+ pattern_length = len(candidate)
802
+ # Example: Prune very long patterns with low support
803
+ if pattern_length > 5 and support_count < 10:
804
+ return True
805
+ return False
806
+
807
+ # Use your custom pruner
808
+ custom_pruner = MyCustomPruner()
809
+ gsp = GSP(transactions, pruning_strategy=custom_pruner)
810
+ result = gsp.search(min_support=0.2)
811
+ ```
812
+
813
+ ### Performance Characteristics
814
+
815
+ Different pruning strategies have different performance tradeoffs:
816
+
817
+ | Strategy | Pruning Aggressiveness | Use Case | Performance Impact |
818
+ |----------|----------------------|----------|-------------------|
819
+ | **SupportBased** | Moderate | General-purpose mining | Baseline performance |
820
+ | **FrequencyBased** | High (for large datasets) | Require absolute frequency | Faster on large datasets |
821
+ | **TemporalAware** | High (for temporal data) | Time-constrained patterns | Significant speedup for temporal mining |
822
+ | **Combined** | Very High | Selective pattern discovery | Fastest, but may miss edge cases |
823
+
824
+ ### Benchmarking Pruning Strategies
825
+
826
+ To compare pruning strategies on your dataset:
827
+
828
+ ```bash
829
+ # Compare all strategies
830
+ python benchmarks/bench_pruning.py --n_tx 1000 --vocab 100 --min_support 0.2 --strategy all
831
+
832
+ # Benchmark a specific strategy
833
+ python benchmarks/bench_pruning.py --n_tx 1000 --vocab 100 --min_support 0.2 --strategy frequency
834
+
835
+ # Run multiple rounds for averaging
836
+ python benchmarks/bench_pruning.py --n_tx 1000 --vocab 100 --min_support 0.2 --strategy all --rounds 3
837
+ ```
838
+
839
+ See `benchmarks/bench_pruning.py` for the complete benchmarking script.
840
+
841
+ ---
842
+
708
843
  ## ⌨️ Typing
709
844
 
710
845
  `gsppy` ships inline type information (PEP 561) via a bundled `py.typed` marker. The public API is re-exported from
@@ -718,10 +853,7 @@ larger applications.
718
853
 
719
854
  We are actively working to improve GSP-Py. Here are some exciting features planned for future releases:
720
855
 
721
- 1. **Custom Filters for Candidate Pruning**:
722
- - Enable users to define their own pruning logic during the mining process.
723
-
724
- 2. **Support for Preprocessing and Postprocessing**:
856
+ 1. **Support for Preprocessing and Postprocessing**:
725
857
  - Add hooks to allow users to transform datasets before mining and customize the output results.
726
858
 
727
859
  Want to contribute or suggest an
@@ -0,0 +1,12 @@
1
+ gsppy/__init__.py,sha256=CJqssfftIIhjzXijnjLKwvIA4Cfr0CaykQkCWaD-q80,1161
2
+ gsppy/accelerate.py,sha256=rDho3ysADETpuhT2SF9voBjd3XRaQUzuA5k_baNACF8,11020
3
+ gsppy/cli.py,sha256=-viXa8VFIF-QvrHYy1vtDxtMm50sM_tZq5B5DMZ1Jtw,12516
4
+ gsppy/gsp.py,sha256=grDKfnC8rshvDH3xG-HQ2JSWsDZl3qbhyEt6FFlQeeI,27135
5
+ gsppy/pruning.py,sha256=hOoQoH1k_gzACBy6qr_cvwth9WDmKuLmJyVRDbHjFFM,14779
6
+ gsppy/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
+ gsppy/utils.py,sha256=dAEq1hEZMN0ZjoocKs_ZIgOI9j_Y6rJEAKneul3zNRo,13501
8
+ gsppy-3.6.0.dist-info/METADATA,sha256=8gBox1RTiigMmzTUBldVsOXc2S8ykI-J-sUC0az-RWM,34082
9
+ gsppy-3.6.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
10
+ gsppy-3.6.0.dist-info/entry_points.txt,sha256=smvmcIWk424ARIGKOC_BM42hpT_SptKPcIeqs-8u8lM,41
11
+ gsppy-3.6.0.dist-info/licenses/LICENSE,sha256=AlXanKSqFzo_o-87gp3Qw3XzbmnfxYy7O0xJOcQGWJo,1086
12
+ gsppy-3.6.0.dist-info/RECORD,,
@@ -1,11 +0,0 @@
1
- gsppy/__init__.py,sha256=NMVa-ZWT449wuxZMF9Ym7p-DChOxOibaaqlpPxksfuo,805
2
- gsppy/accelerate.py,sha256=rDho3ysADETpuhT2SF9voBjd3XRaQUzuA5k_baNACF8,11020
3
- gsppy/cli.py,sha256=-viXa8VFIF-QvrHYy1vtDxtMm50sM_tZq5B5DMZ1Jtw,12516
4
- gsppy/gsp.py,sha256=k72pvdmD6jU4AId2rrHQrJ4FBUgtkuC0ntEY8QHGi5c,24486
5
- gsppy/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
- gsppy/utils.py,sha256=dAEq1hEZMN0ZjoocKs_ZIgOI9j_Y6rJEAKneul3zNRo,13501
7
- gsppy-3.5.0.dist-info/METADATA,sha256=ix2X_VEUTved_DaTsSJMERT-CZ34TUYF0XMC2KeNeuE,29747
8
- gsppy-3.5.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
9
- gsppy-3.5.0.dist-info/entry_points.txt,sha256=smvmcIWk424ARIGKOC_BM42hpT_SptKPcIeqs-8u8lM,41
10
- gsppy-3.5.0.dist-info/licenses/LICENSE,sha256=AlXanKSqFzo_o-87gp3Qw3XzbmnfxYy7O0xJOcQGWJo,1086
11
- gsppy-3.5.0.dist-info/RECORD,,
File without changes