gsppy 3.5.0__tar.gz → 3.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,29 @@
1
1
  # CHANGELOG
2
2
 
3
3
 
4
+ ## v3.6.0 (2026-01-26)
5
+
6
+ ### Chores
7
+
8
+ - Update uv.lock for version 3.5.0
9
+ ([`e2c1be0`](https://github.com/jacksonpradolima/gsp-py/commit/e2c1be0945b0b124d8afa8981877513449b29ff0))
10
+
11
+ ### Features
12
+
13
+ - Add flexible pruning strategy system to GSP algorithm
14
+ ([`94089cc`](https://github.com/jacksonpradolima/gsp-py/commit/94089cc5716ec6d7c7a6e0720843162db116fca2))
15
+
16
+ feat: add flexible pruning strategy system to GSP algorithm
17
+
18
+ - Add typing-extensions as a dependency
19
+ ([`6222945`](https://github.com/jacksonpradolima/gsp-py/commit/62229455ef3976c405d96e5ea9d5cafaf5eee6e3))
20
+
21
+ ### Refactoring
22
+
23
+ - Refactor pruning strategy initialization and enhance type hints; add typing_extensions dependency
24
+ ([`ddc0abd`](https://github.com/jacksonpradolima/gsp-py/commit/ddc0abd9352797dd19988f60d6287da421ef60cf))
25
+
26
+
4
27
  ## v3.5.0 (2026-01-26)
5
28
 
6
29
  ### Bug Fixes
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gsppy
3
- Version: 3.5.0
3
+ Version: 3.6.0
4
4
  Summary: GSP (Generalized Sequence Pattern) algorithm in Python
5
5
  Project-URL: Homepage, https://github.com/jacksonpradolima/gsp-py
6
6
  Author-email: Jackson Antonio do Prado Lima <jacksonpradolima@gmail.com>
@@ -40,6 +40,7 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
40
40
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
41
41
  Requires-Python: >=3.10
42
42
  Requires-Dist: click>=8.0.0
43
+ Requires-Dist: typing-extensions>=4.0.0
43
44
  Provides-Extra: dev
44
45
  Requires-Dist: cython==3.2.4; extra == 'dev'
45
46
  Requires-Dist: hatch==1.16.3; extra == 'dev'
@@ -705,6 +706,140 @@ result = gsp.search(min_support=0.5)
705
706
 
706
707
  ---
707
708
 
709
+ ## 🔧 Flexible Candidate Pruning
710
+
711
+ GSP-Py supports **flexible candidate pruning strategies** that allow you to customize how candidate sequences are filtered during pattern mining. This enables optimization for different dataset characteristics and mining requirements.
712
+
713
+ ### Built-in Pruning Strategies
714
+
715
+ #### 1. Support-Based Pruning (Default)
716
+
717
+ The standard GSP pruning based on minimum support threshold:
718
+
719
+ ```python
720
+ from gsppy.gsp import GSP
721
+ from gsppy.pruning import SupportBasedPruning
722
+
723
+ # Explicit support-based pruning
724
+ pruner = SupportBasedPruning(min_support_fraction=0.3)
725
+ gsp = GSP(transactions, pruning_strategy=pruner)
726
+ result = gsp.search(min_support=0.3)
727
+ ```
728
+
729
+ #### 2. Frequency-Based Pruning
730
+
731
+ Prunes candidates based on absolute frequency (minimum number of occurrences):
732
+
733
+ ```python
734
+ from gsppy.pruning import FrequencyBasedPruning
735
+
736
+ # Require patterns to appear at least 5 times
737
+ pruner = FrequencyBasedPruning(min_frequency=5)
738
+ gsp = GSP(transactions, pruning_strategy=pruner)
739
+ result = gsp.search(min_support=0.2)
740
+ ```
741
+
742
+ **Use case**: When you need patterns to occur a minimum absolute number of times, regardless of dataset size.
743
+
744
+ #### 3. Temporal-Aware Pruning
745
+
746
+ Optimizes pruning for time-constrained pattern mining by pre-filtering infeasible patterns:
747
+
748
+ ```python
749
+ from gsppy.pruning import TemporalAwarePruning
750
+
751
+ # Prune patterns that cannot satisfy temporal constraints
752
+ pruner = TemporalAwarePruning(
753
+ mingap=1,
754
+ maxgap=5,
755
+ maxspan=10,
756
+ min_support_fraction=0.3
757
+ )
758
+ gsp = GSP(timestamped_transactions, mingap=1, maxgap=5, maxspan=10, pruning_strategy=pruner)
759
+ result = gsp.search(min_support=0.3)
760
+ ```
761
+
762
+ **Use case**: Improves performance for temporal pattern mining by eliminating patterns that cannot satisfy temporal constraints.
763
+
764
+ #### 4. Combined Pruning
765
+
766
+ Combines multiple pruning strategies for aggressive filtering:
767
+
768
+ ```python
769
+ from gsppy.pruning import CombinedPruning, SupportBasedPruning, FrequencyBasedPruning
770
+
771
+ # Apply both support and frequency constraints
772
+ strategies = [
773
+ SupportBasedPruning(min_support_fraction=0.3),
774
+ FrequencyBasedPruning(min_frequency=5)
775
+ ]
776
+ pruner = CombinedPruning(strategies)
777
+ gsp = GSP(transactions, pruning_strategy=pruner)
778
+ result = gsp.search(min_support=0.3)
779
+ ```
780
+
781
+ **Use case**: When you want to combine multiple filtering criteria for more selective pattern discovery.
782
+
783
+ ### Custom Pruning Strategies
784
+
785
+ You can create custom pruning strategies by implementing the `PruningStrategy` interface:
786
+
787
+ ```python
788
+ from gsppy.pruning import PruningStrategy
789
+ from typing import Dict, Optional, Tuple
790
+
791
+ class MyCustomPruner(PruningStrategy):
792
+ def should_prune(
793
+ self,
794
+ candidate: Tuple[str, ...],
795
+ support_count: int,
796
+ total_transactions: int,
797
+ context: Optional[Dict] = None
798
+ ) -> bool:
799
+ # Custom pruning logic
800
+ # Return True to prune (filter out), False to keep
801
+ pattern_length = len(candidate)
802
+ # Example: Prune very long patterns with low support
803
+ if pattern_length > 5 and support_count < 10:
804
+ return True
805
+ return False
806
+
807
+ # Use your custom pruner
808
+ custom_pruner = MyCustomPruner()
809
+ gsp = GSP(transactions, pruning_strategy=custom_pruner)
810
+ result = gsp.search(min_support=0.2)
811
+ ```
812
+
813
+ ### Performance Characteristics
814
+
815
+ Different pruning strategies have different performance tradeoffs:
816
+
817
+ | Strategy | Pruning Aggressiveness | Use Case | Performance Impact |
818
+ |----------|----------------------|----------|-------------------|
819
+ | **SupportBased** | Moderate | General-purpose mining | Baseline performance |
820
+ | **FrequencyBased** | High (for large datasets) | Absolute frequency requirements | Faster on large datasets |
821
+ | **TemporalAware** | High (for temporal data) | Time-constrained patterns | Significant speedup for temporal mining |
822
+ | **Combined** | Very High | Selective pattern discovery | Fastest, but may miss edge cases |
823
+
824
+ ### Benchmarking Pruning Strategies
825
+
826
+ To compare pruning strategies on your dataset:
827
+
828
+ ```bash
829
+ # Compare all strategies
830
+ python benchmarks/bench_pruning.py --n_tx 1000 --vocab 100 --min_support 0.2 --strategy all
831
+
832
+ # Benchmark a specific strategy
833
+ python benchmarks/bench_pruning.py --n_tx 1000 --vocab 100 --min_support 0.2 --strategy frequency
834
+
835
+ # Run multiple rounds for averaging
836
+ python benchmarks/bench_pruning.py --n_tx 1000 --vocab 100 --min_support 0.2 --strategy all --rounds 3
837
+ ```
838
+
839
+ See `benchmarks/bench_pruning.py` for the complete benchmarking script.
840
+
841
+ ---
842
+
708
843
  ## ⌨️ Typing
709
844
 
710
845
  `gsppy` ships inline type information (PEP 561) via a bundled `py.typed` marker. The public API is re-exported from
@@ -718,10 +853,7 @@ larger applications.
718
853
 
719
854
  We are actively working to improve GSP-Py. Here are some exciting features planned for future releases:
720
855
 
721
- 1. **Custom Filters for Candidate Pruning**:
722
- - Enable users to define their own pruning logic during the mining process.
723
-
724
- 2. **Support for Preprocessing and Postprocessing**:
856
+ 1. **Support for Preprocessing and Postprocessing**:
725
857
  - Add hooks to allow users to transform datasets before mining and customize the output results.
726
858
 
727
859
  Want to contribute or suggest an
@@ -638,6 +638,140 @@ result = gsp.search(min_support=0.5)
638
638
 
639
639
  ---
640
640
 
641
+ ## 🔧 Flexible Candidate Pruning
642
+
643
+ GSP-Py supports **flexible candidate pruning strategies** that allow you to customize how candidate sequences are filtered during pattern mining. This enables optimization for different dataset characteristics and mining requirements.
644
+
645
+ ### Built-in Pruning Strategies
646
+
647
+ #### 1. Support-Based Pruning (Default)
648
+
649
+ The standard GSP pruning based on minimum support threshold:
650
+
651
+ ```python
652
+ from gsppy.gsp import GSP
653
+ from gsppy.pruning import SupportBasedPruning
654
+
655
+ # Explicit support-based pruning
656
+ pruner = SupportBasedPruning(min_support_fraction=0.3)
657
+ gsp = GSP(transactions, pruning_strategy=pruner)
658
+ result = gsp.search(min_support=0.3)
659
+ ```
660
+
661
+ #### 2. Frequency-Based Pruning
662
+
663
+ Prunes candidates based on absolute frequency (minimum number of occurrences):
664
+
665
+ ```python
666
+ from gsppy.pruning import FrequencyBasedPruning
667
+
668
+ # Require patterns to appear at least 5 times
669
+ pruner = FrequencyBasedPruning(min_frequency=5)
670
+ gsp = GSP(transactions, pruning_strategy=pruner)
671
+ result = gsp.search(min_support=0.2)
672
+ ```
673
+
674
+ **Use case**: When you need patterns to occur a minimum absolute number of times, regardless of dataset size.
675
+
676
+ #### 3. Temporal-Aware Pruning
677
+
678
+ Optimizes pruning for time-constrained pattern mining by pre-filtering infeasible patterns:
679
+
680
+ ```python
681
+ from gsppy.pruning import TemporalAwarePruning
682
+
683
+ # Prune patterns that cannot satisfy temporal constraints
684
+ pruner = TemporalAwarePruning(
685
+ mingap=1,
686
+ maxgap=5,
687
+ maxspan=10,
688
+ min_support_fraction=0.3
689
+ )
690
+ gsp = GSP(timestamped_transactions, mingap=1, maxgap=5, maxspan=10, pruning_strategy=pruner)
691
+ result = gsp.search(min_support=0.3)
692
+ ```
693
+
694
+ **Use case**: Improves performance for temporal pattern mining by eliminating patterns that cannot satisfy temporal constraints.
695
+
696
+ #### 4. Combined Pruning
697
+
698
+ Combines multiple pruning strategies for aggressive filtering:
699
+
700
+ ```python
701
+ from gsppy.pruning import CombinedPruning, SupportBasedPruning, FrequencyBasedPruning
702
+
703
+ # Apply both support and frequency constraints
704
+ strategies = [
705
+ SupportBasedPruning(min_support_fraction=0.3),
706
+ FrequencyBasedPruning(min_frequency=5)
707
+ ]
708
+ pruner = CombinedPruning(strategies)
709
+ gsp = GSP(transactions, pruning_strategy=pruner)
710
+ result = gsp.search(min_support=0.3)
711
+ ```
712
+
713
+ **Use case**: When you want to combine multiple filtering criteria for more selective pattern discovery.
714
+
715
+ ### Custom Pruning Strategies
716
+
717
+ You can create custom pruning strategies by implementing the `PruningStrategy` interface:
718
+
719
+ ```python
720
+ from gsppy.pruning import PruningStrategy
721
+ from typing import Dict, Optional, Tuple
722
+
723
+ class MyCustomPruner(PruningStrategy):
724
+ def should_prune(
725
+ self,
726
+ candidate: Tuple[str, ...],
727
+ support_count: int,
728
+ total_transactions: int,
729
+ context: Optional[Dict] = None
730
+ ) -> bool:
731
+ # Custom pruning logic
732
+ # Return True to prune (filter out), False to keep
733
+ pattern_length = len(candidate)
734
+ # Example: Prune very long patterns with low support
735
+ if pattern_length > 5 and support_count < 10:
736
+ return True
737
+ return False
738
+
739
+ # Use your custom pruner
740
+ custom_pruner = MyCustomPruner()
741
+ gsp = GSP(transactions, pruning_strategy=custom_pruner)
742
+ result = gsp.search(min_support=0.2)
743
+ ```
744
+
745
+ ### Performance Characteristics
746
+
747
+ Different pruning strategies have different performance tradeoffs:
748
+
749
+ | Strategy | Pruning Aggressiveness | Use Case | Performance Impact |
750
+ |----------|----------------------|----------|-------------------|
751
+ | **SupportBased** | Moderate | General-purpose mining | Baseline performance |
752
+ | **FrequencyBased** | High (for large datasets) | Absolute frequency requirements | Faster on large datasets |
753
+ | **TemporalAware** | High (for temporal data) | Time-constrained patterns | Significant speedup for temporal mining |
754
+ | **Combined** | Very High | Selective pattern discovery | Fastest, but may miss edge cases |
755
+
756
+ ### Benchmarking Pruning Strategies
757
+
758
+ To compare pruning strategies on your dataset:
759
+
760
+ ```bash
761
+ # Compare all strategies
762
+ python benchmarks/bench_pruning.py --n_tx 1000 --vocab 100 --min_support 0.2 --strategy all
763
+
764
+ # Benchmark a specific strategy
765
+ python benchmarks/bench_pruning.py --n_tx 1000 --vocab 100 --min_support 0.2 --strategy frequency
766
+
767
+ # Run multiple rounds for averaging
768
+ python benchmarks/bench_pruning.py --n_tx 1000 --vocab 100 --min_support 0.2 --strategy all --rounds 3
769
+ ```
770
+
771
+ See `benchmarks/bench_pruning.py` for the complete benchmarking script.
772
+
773
+ ---
774
+
641
775
  ## ⌨️ Typing
642
776
 
643
777
  `gsppy` ships inline type information (PEP 561) via a bundled `py.typed` marker. The public API is re-exported from
@@ -651,10 +785,7 @@ larger applications.
651
785
 
652
786
  We are actively working to improve GSP-Py. Here are some exciting features planned for future releases:
653
787
 
654
- 1. **Custom Filters for Candidate Pruning**:
655
- - Enable users to define their own pruning logic during the mining process.
656
-
657
- 2. **Support for Preprocessing and Postprocessing**:
788
+ 1. **Support for Preprocessing and Postprocessing**:
658
789
  - Add hooks to allow users to transform datasets before mining and customize the output results.
659
790
 
660
791
  Want to contribute or suggest an
@@ -13,6 +13,14 @@ from gsppy.cli import (
13
13
  read_transactions_from_json,
14
14
  )
15
15
  from gsppy.gsp import GSP
16
+ from gsppy.pruning import (
17
+ PruningStrategy,
18
+ SupportBasedPruning,
19
+ FrequencyBasedPruning,
20
+ TemporalAwarePruning,
21
+ CombinedPruning,
22
+ create_default_pruning_strategy,
23
+ )
16
24
 
17
25
  try:
18
26
  __version__ = importlib_metadata.version("gsppy")
@@ -26,4 +34,10 @@ __all__ = [
26
34
  "read_transactions_from_json",
27
35
  "setup_logging",
28
36
  "__version__",
37
+ "PruningStrategy",
38
+ "SupportBasedPruning",
39
+ "FrequencyBasedPruning",
40
+ "TemporalAwarePruning",
41
+ "CombinedPruning",
42
+ "create_default_pruning_strategy",
29
43
  ]
@@ -99,6 +99,7 @@ from gsppy.utils import (
99
99
  generate_candidates_from_previous,
100
100
  is_subsequence_in_list_with_time_constraints,
101
101
  )
102
+ from gsppy.pruning import PruningStrategy, create_default_pruning_strategy
102
103
  from gsppy.accelerate import support_counts as support_counts_accel
103
104
 
104
105
  logger: logging.Logger = logging.getLogger(__name__)
@@ -130,6 +131,7 @@ class GSP:
130
131
  maxgap: Optional[float] = None,
131
132
  maxspan: Optional[float] = None,
132
133
  verbose: bool = False,
134
+ pruning_strategy: Optional[PruningStrategy] = None,
133
135
  ):
134
136
  """
135
137
  Initialize the GSP algorithm with raw transactional data.
@@ -144,6 +146,9 @@ class GSP:
144
146
  maxspan (Optional[float]): Maximum time span from first to last item in patterns.
145
147
  verbose (bool): Enable verbose logging output with detailed progress information.
146
148
  Default is False (minimal output).
149
+ pruning_strategy (Optional[PruningStrategy]): Custom pruning strategy for candidate filtering.
150
+ If None, a default strategy is created based on
151
+ temporal constraints.
147
152
 
148
153
  Attributes Initialized:
149
154
  - Processes the input raw transaction dataset.
@@ -162,9 +167,18 @@ class GSP:
162
167
  self.maxgap = maxgap
163
168
  self.maxspan = maxspan
164
169
  self.verbose = verbose
170
+ self.pruning_strategy: PruningStrategy
165
171
  self._configure_logging()
166
172
  self._validate_temporal_constraints()
167
173
  self._pre_processing(raw_transactions)
174
+ # Initialize default pruning strategy if none provided
175
+ if pruning_strategy is None:
176
+ self.pruning_strategy = create_default_pruning_strategy(
177
+ mingap=self.mingap, maxgap=self.maxgap, maxspan=self.maxspan
178
+ )
179
+ logger.debug("Using default pruning strategy: %s", self.pruning_strategy.get_description())
180
+ else:
181
+ self.pruning_strategy = pruning_strategy
168
182
 
169
183
  def _configure_logging(self) -> None:
170
184
  """
@@ -389,6 +403,39 @@ class GSP:
389
403
  # Fallback to Python implementation on any acceleration failure
390
404
  return self._support_python(items, min_support, batch_size)
391
405
 
406
+ def _apply_pruning(
407
+ self, freq_patterns: Dict[Tuple[str, ...], int], min_support_count: int
408
+ ) -> Dict[Tuple[str, ...], int]:
409
+ """
410
+ Apply the configured pruning strategy to filter frequent patterns.
411
+
412
+ This method uses the pruning strategy to post-process patterns that have
413
+ already met the minimum support threshold. Additional pruning can be applied
414
+ based on other criteria such as temporal feasibility or frequency thresholds.
415
+
416
+ Parameters:
417
+ freq_patterns (Dict[Tuple[str, ...], int]): Dictionary of patterns and their support counts.
418
+ min_support_count (int): Absolute minimum support count threshold.
419
+
420
+ Returns:
421
+ Dict[Tuple[str, ...], int]: Filtered patterns after applying pruning strategy.
422
+ """
423
+ if not freq_patterns:
424
+ return freq_patterns
425
+
426
+ pruned_patterns: Dict[Tuple[str, ...], int] = {}
427
+ context = {"min_support_count": min_support_count}
428
+
429
+ for candidate, support_count in freq_patterns.items():
430
+ if not self.pruning_strategy.should_prune(candidate, support_count, len(self.transactions), context):
431
+ pruned_patterns[candidate] = support_count
432
+
433
+ num_pruned = len(freq_patterns) - len(pruned_patterns)
434
+ if num_pruned > 0:
435
+ logger.debug("Pruning strategy filtered out %d additional candidates", num_pruned)
436
+
437
+ return pruned_patterns
438
+
392
439
  def _print_status(self, run: int, candidates: List[Tuple[str, ...]]) -> None:
393
440
  """
394
441
  Log progress information for the current GSP iteration.
@@ -504,7 +551,10 @@ class GSP:
504
551
 
505
552
  # scan transactions to collect support count for each candidate
506
553
  # sequence & filter
507
- self.freq_patterns.append(self._support(candidates, abs_min_support, backend=backend))
554
+ freq_1 = self._support(candidates, abs_min_support, backend=backend)
555
+ # Apply pruning strategy for additional filtering
556
+ freq_1 = self._apply_pruning(freq_1, abs_min_support)
557
+ self.freq_patterns.append(freq_1)
508
558
 
509
559
  # (k-itemsets/k-sequence = 1)
510
560
  k_items = 1
@@ -525,7 +575,10 @@ class GSP:
525
575
 
526
576
  # candidate pruning - eliminates candidates that are not potentially
527
577
  # frequent (using support as threshold)
528
- self.freq_patterns.append(self._support(candidates, abs_min_support, backend=backend))
578
+ freq_k = self._support(candidates, abs_min_support, backend=backend)
579
+ # Apply pruning strategy for additional filtering
580
+ freq_k = self._apply_pruning(freq_k, abs_min_support)
581
+ self.freq_patterns.append(freq_k)
529
582
 
530
583
  self._print_status(k_items, candidates)
531
584
  logger.info("GSP algorithm completed.")