gsppy 3.5.0__tar.gz → 3.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gsppy-3.5.0 → gsppy-3.6.0}/CHANGELOG.md +23 -0
- {gsppy-3.5.0 → gsppy-3.6.0}/PKG-INFO +137 -5
- {gsppy-3.5.0 → gsppy-3.6.0}/README.md +135 -4
- {gsppy-3.5.0 → gsppy-3.6.0}/gsppy/__init__.py +14 -0
- {gsppy-3.5.0 → gsppy-3.6.0}/gsppy/gsp.py +55 -2
- gsppy-3.6.0/gsppy/pruning.py +412 -0
- {gsppy-3.5.0 → gsppy-3.6.0}/pyproject.toml +3 -2
- gsppy-3.6.0/tests/test_pruning.py +400 -0
- {gsppy-3.5.0 → gsppy-3.6.0}/.gitignore +0 -0
- {gsppy-3.5.0 → gsppy-3.6.0}/CONTRIBUTING.md +0 -0
- {gsppy-3.5.0 → gsppy-3.6.0}/LICENSE +0 -0
- {gsppy-3.5.0 → gsppy-3.6.0}/SECURITY.md +0 -0
- {gsppy-3.5.0 → gsppy-3.6.0}/gsppy/accelerate.py +0 -0
- {gsppy-3.5.0 → gsppy-3.6.0}/gsppy/cli.py +0 -0
- {gsppy-3.5.0 → gsppy-3.6.0}/gsppy/py.typed +0 -0
- {gsppy-3.5.0 → gsppy-3.6.0}/gsppy/utils.py +0 -0
- {gsppy-3.5.0 → gsppy-3.6.0}/rust/Cargo.lock +0 -0
- {gsppy-3.5.0 → gsppy-3.6.0}/rust/Cargo.toml +0 -0
- {gsppy-3.5.0 → gsppy-3.6.0}/rust/src/lib.rs +0 -0
- {gsppy-3.5.0 → gsppy-3.6.0}/tests/__init__.py +0 -0
- {gsppy-3.5.0 → gsppy-3.6.0}/tests/test_cli.py +0 -0
- {gsppy-3.5.0 → gsppy-3.6.0}/tests/test_gsp.py +0 -0
- {gsppy-3.5.0 → gsppy-3.6.0}/tests/test_gsp_fuzzing.py +0 -0
- {gsppy-3.5.0 → gsppy-3.6.0}/tests/test_temporal_constraints.py +0 -0
- {gsppy-3.5.0 → gsppy-3.6.0}/tests/test_utils.py +0 -0
- {gsppy-3.5.0 → gsppy-3.6.0}/tox.ini +0 -0
|
@@ -1,6 +1,29 @@
|
|
|
1
1
|
# CHANGELOG
|
|
2
2
|
|
|
3
3
|
|
|
4
|
+
## v3.6.0 (2026-01-26)
|
|
5
|
+
|
|
6
|
+
### Chores
|
|
7
|
+
|
|
8
|
+
- Update uv.lock for version 3.5.0
|
|
9
|
+
([`e2c1be0`](https://github.com/jacksonpradolima/gsp-py/commit/e2c1be0945b0b124d8afa8981877513449b29ff0))
|
|
10
|
+
|
|
11
|
+
### Features
|
|
12
|
+
|
|
13
|
+
- Add flexible pruning strategy system to GSP algorithm
|
|
14
|
+
([`94089cc`](https://github.com/jacksonpradolima/gsp-py/commit/94089cc5716ec6d7c7a6e0720843162db116fca2))
|
|
15
|
+
|
|
16
|
+
feat: add flexible pruning strategy system to GSP algorithm
|
|
17
|
+
|
|
18
|
+
- Add typing-extensions as a dependency
|
|
19
|
+
([`6222945`](https://github.com/jacksonpradolima/gsp-py/commit/62229455ef3976c405d96e5ea9d5cafaf5eee6e3))
|
|
20
|
+
|
|
21
|
+
### Refactoring
|
|
22
|
+
|
|
23
|
+
- Pruning strategy initialization and enhance type hints; add typing_extensions dependency
|
|
24
|
+
([`ddc0abd`](https://github.com/jacksonpradolima/gsp-py/commit/ddc0abd9352797dd19988f60d6287da421ef60cf))
|
|
25
|
+
|
|
26
|
+
|
|
4
27
|
## v3.5.0 (2026-01-26)
|
|
5
28
|
|
|
6
29
|
### Bug Fixes
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: gsppy
|
|
3
|
-
Version: 3.5.0
|
|
3
|
+
Version: 3.6.0
|
|
4
4
|
Summary: GSP (Generalized Sequence Pattern) algorithm in Python
|
|
5
5
|
Project-URL: Homepage, https://github.com/jacksonpradolima/gsp-py
|
|
6
6
|
Author-email: Jackson Antonio do Prado Lima <jacksonpradolima@gmail.com>
|
|
@@ -40,6 +40,7 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
|
40
40
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
41
41
|
Requires-Python: >=3.10
|
|
42
42
|
Requires-Dist: click>=8.0.0
|
|
43
|
+
Requires-Dist: typing-extensions>=4.0.0
|
|
43
44
|
Provides-Extra: dev
|
|
44
45
|
Requires-Dist: cython==3.2.4; extra == 'dev'
|
|
45
46
|
Requires-Dist: hatch==1.16.3; extra == 'dev'
|
|
@@ -705,6 +706,140 @@ result = gsp.search(min_support=0.5)
|
|
|
705
706
|
|
|
706
707
|
---
|
|
707
708
|
|
|
709
|
+
## 🔧 Flexible Candidate Pruning
|
|
710
|
+
|
|
711
|
+
GSP-Py supports **flexible candidate pruning strategies** that allow you to customize how candidate sequences are filtered during pattern mining. This enables optimization for different dataset characteristics and mining requirements.
|
|
712
|
+
|
|
713
|
+
### Built-in Pruning Strategies
|
|
714
|
+
|
|
715
|
+
#### 1. Support-Based Pruning (Default)
|
|
716
|
+
|
|
717
|
+
The standard GSP pruning based on minimum support threshold:
|
|
718
|
+
|
|
719
|
+
```python
|
|
720
|
+
from gsppy.gsp import GSP
|
|
721
|
+
from gsppy.pruning import SupportBasedPruning
|
|
722
|
+
|
|
723
|
+
# Explicit support-based pruning
|
|
724
|
+
pruner = SupportBasedPruning(min_support_fraction=0.3)
|
|
725
|
+
gsp = GSP(transactions, pruning_strategy=pruner)
|
|
726
|
+
result = gsp.search(min_support=0.3)
|
|
727
|
+
```
|
|
728
|
+
|
|
729
|
+
#### 2. Frequency-Based Pruning
|
|
730
|
+
|
|
731
|
+
Prunes candidates based on absolute frequency (minimum number of occurrences):
|
|
732
|
+
|
|
733
|
+
```python
|
|
734
|
+
from gsppy.pruning import FrequencyBasedPruning
|
|
735
|
+
|
|
736
|
+
# Require patterns to appear at least 5 times
|
|
737
|
+
pruner = FrequencyBasedPruning(min_frequency=5)
|
|
738
|
+
gsp = GSP(transactions, pruning_strategy=pruner)
|
|
739
|
+
result = gsp.search(min_support=0.2)
|
|
740
|
+
```
|
|
741
|
+
|
|
742
|
+
**Use case**: When you need patterns to occur a minimum absolute number of times, regardless of dataset size.
|
|
743
|
+
|
|
744
|
+
#### 3. Temporal-Aware Pruning
|
|
745
|
+
|
|
746
|
+
Optimizes pruning for time-constrained pattern mining by pre-filtering infeasible patterns:
|
|
747
|
+
|
|
748
|
+
```python
|
|
749
|
+
from gsppy.pruning import TemporalAwarePruning
|
|
750
|
+
|
|
751
|
+
# Prune patterns that cannot satisfy temporal constraints
|
|
752
|
+
pruner = TemporalAwarePruning(
|
|
753
|
+
mingap=1,
|
|
754
|
+
maxgap=5,
|
|
755
|
+
maxspan=10,
|
|
756
|
+
min_support_fraction=0.3
|
|
757
|
+
)
|
|
758
|
+
gsp = GSP(timestamped_transactions, mingap=1, maxgap=5, maxspan=10, pruning_strategy=pruner)
|
|
759
|
+
result = gsp.search(min_support=0.3)
|
|
760
|
+
```
|
|
761
|
+
|
|
762
|
+
**Use case**: Improves performance for temporal pattern mining by eliminating patterns that cannot satisfy temporal constraints.
|
|
763
|
+
|
|
764
|
+
#### 4. Combined Pruning
|
|
765
|
+
|
|
766
|
+
Combines multiple pruning strategies for aggressive filtering:
|
|
767
|
+
|
|
768
|
+
```python
|
|
769
|
+
from gsppy.pruning import CombinedPruning, SupportBasedPruning, FrequencyBasedPruning
|
|
770
|
+
|
|
771
|
+
# Apply both support and frequency constraints
|
|
772
|
+
strategies = [
|
|
773
|
+
SupportBasedPruning(min_support_fraction=0.3),
|
|
774
|
+
FrequencyBasedPruning(min_frequency=5)
|
|
775
|
+
]
|
|
776
|
+
pruner = CombinedPruning(strategies)
|
|
777
|
+
gsp = GSP(transactions, pruning_strategy=pruner)
|
|
778
|
+
result = gsp.search(min_support=0.3)
|
|
779
|
+
```
|
|
780
|
+
|
|
781
|
+
**Use case**: When you want to combine multiple filtering criteria for more selective pattern discovery.
|
|
782
|
+
|
|
783
|
+
### Custom Pruning Strategies
|
|
784
|
+
|
|
785
|
+
You can create custom pruning strategies by implementing the `PruningStrategy` interface:
|
|
786
|
+
|
|
787
|
+
```python
|
|
788
|
+
from gsppy.pruning import PruningStrategy
|
|
789
|
+
from typing import Dict, Optional, Tuple
|
|
790
|
+
|
|
791
|
+
class MyCustomPruner(PruningStrategy):
|
|
792
|
+
def should_prune(
|
|
793
|
+
self,
|
|
794
|
+
candidate: Tuple[str, ...],
|
|
795
|
+
support_count: int,
|
|
796
|
+
total_transactions: int,
|
|
797
|
+
context: Optional[Dict] = None
|
|
798
|
+
) -> bool:
|
|
799
|
+
# Custom pruning logic
|
|
800
|
+
# Return True to prune (filter out), False to keep
|
|
801
|
+
pattern_length = len(candidate)
|
|
802
|
+
# Example: Prune very long patterns with low support
|
|
803
|
+
if pattern_length > 5 and support_count < 10:
|
|
804
|
+
return True
|
|
805
|
+
return False
|
|
806
|
+
|
|
807
|
+
# Use your custom pruner
|
|
808
|
+
custom_pruner = MyCustomPruner()
|
|
809
|
+
gsp = GSP(transactions, pruning_strategy=custom_pruner)
|
|
810
|
+
result = gsp.search(min_support=0.2)
|
|
811
|
+
```
|
|
812
|
+
|
|
813
|
+
### Performance Characteristics
|
|
814
|
+
|
|
815
|
+
Different pruning strategies have different performance tradeoffs:
|
|
816
|
+
|
|
817
|
+
| Strategy | Pruning Aggressiveness | Use Case | Performance Impact |
|
|
818
|
+
|----------|----------------------|----------|-------------------|
|
|
819
|
+
| **SupportBased** | Moderate | General-purpose mining | Baseline performance |
|
|
820
|
+
| **FrequencyBased** | High (for large datasets) | Require absolute frequency | Faster on large datasets |
|
|
821
|
+
| **TemporalAware** | High (for temporal data) | Time-constrained patterns | Significant speedup for temporal mining |
|
|
822
|
+
| **Combined** | Very High | Selective pattern discovery | Fastest, but may miss edge cases |
|
|
823
|
+
|
|
824
|
+
### Benchmarking Pruning Strategies
|
|
825
|
+
|
|
826
|
+
To compare pruning strategies on your dataset:
|
|
827
|
+
|
|
828
|
+
```bash
|
|
829
|
+
# Compare all strategies
|
|
830
|
+
python benchmarks/bench_pruning.py --n_tx 1000 --vocab 100 --min_support 0.2 --strategy all
|
|
831
|
+
|
|
832
|
+
# Benchmark a specific strategy
|
|
833
|
+
python benchmarks/bench_pruning.py --n_tx 1000 --vocab 100 --min_support 0.2 --strategy frequency
|
|
834
|
+
|
|
835
|
+
# Run multiple rounds for averaging
|
|
836
|
+
python benchmarks/bench_pruning.py --n_tx 1000 --vocab 100 --min_support 0.2 --strategy all --rounds 3
|
|
837
|
+
```
|
|
838
|
+
|
|
839
|
+
See `benchmarks/bench_pruning.py` for the complete benchmarking script.
|
|
840
|
+
|
|
841
|
+
---
|
|
842
|
+
|
|
708
843
|
## ⌨️ Typing
|
|
709
844
|
|
|
710
845
|
`gsppy` ships inline type information (PEP 561) via a bundled `py.typed` marker. The public API is re-exported from
|
|
@@ -718,10 +853,7 @@ larger applications.
|
|
|
718
853
|
|
|
719
854
|
We are actively working to improve GSP-Py. Here are some exciting features planned for future releases:
|
|
720
855
|
|
|
721
|
-
1. **
|
|
722
|
-
- Enable users to define their own pruning logic during the mining process.
|
|
723
|
-
|
|
724
|
-
2. **Support for Preprocessing and Postprocessing**:
|
|
856
|
+
1. **Support for Preprocessing and Postprocessing**:
|
|
725
857
|
- Add hooks to allow users to transform datasets before mining and customize the output results.
|
|
726
858
|
|
|
727
859
|
Want to contribute or suggest an
|
|
@@ -638,6 +638,140 @@ result = gsp.search(min_support=0.5)
|
|
|
638
638
|
|
|
639
639
|
---
|
|
640
640
|
|
|
641
|
+
## 🔧 Flexible Candidate Pruning
|
|
642
|
+
|
|
643
|
+
GSP-Py supports **flexible candidate pruning strategies** that allow you to customize how candidate sequences are filtered during pattern mining. This enables optimization for different dataset characteristics and mining requirements.
|
|
644
|
+
|
|
645
|
+
### Built-in Pruning Strategies
|
|
646
|
+
|
|
647
|
+
#### 1. Support-Based Pruning (Default)
|
|
648
|
+
|
|
649
|
+
The standard GSP pruning based on minimum support threshold:
|
|
650
|
+
|
|
651
|
+
```python
|
|
652
|
+
from gsppy.gsp import GSP
|
|
653
|
+
from gsppy.pruning import SupportBasedPruning
|
|
654
|
+
|
|
655
|
+
# Explicit support-based pruning
|
|
656
|
+
pruner = SupportBasedPruning(min_support_fraction=0.3)
|
|
657
|
+
gsp = GSP(transactions, pruning_strategy=pruner)
|
|
658
|
+
result = gsp.search(min_support=0.3)
|
|
659
|
+
```
|
|
660
|
+
|
|
661
|
+
#### 2. Frequency-Based Pruning
|
|
662
|
+
|
|
663
|
+
Prunes candidates based on absolute frequency (minimum number of occurrences):
|
|
664
|
+
|
|
665
|
+
```python
|
|
666
|
+
from gsppy.pruning import FrequencyBasedPruning
|
|
667
|
+
|
|
668
|
+
# Require patterns to appear at least 5 times
|
|
669
|
+
pruner = FrequencyBasedPruning(min_frequency=5)
|
|
670
|
+
gsp = GSP(transactions, pruning_strategy=pruner)
|
|
671
|
+
result = gsp.search(min_support=0.2)
|
|
672
|
+
```
|
|
673
|
+
|
|
674
|
+
**Use case**: When you need patterns to occur a minimum absolute number of times, regardless of dataset size.
|
|
675
|
+
|
|
676
|
+
#### 3. Temporal-Aware Pruning
|
|
677
|
+
|
|
678
|
+
Optimizes pruning for time-constrained pattern mining by pre-filtering infeasible patterns:
|
|
679
|
+
|
|
680
|
+
```python
|
|
681
|
+
from gsppy.pruning import TemporalAwarePruning
|
|
682
|
+
|
|
683
|
+
# Prune patterns that cannot satisfy temporal constraints
|
|
684
|
+
pruner = TemporalAwarePruning(
|
|
685
|
+
mingap=1,
|
|
686
|
+
maxgap=5,
|
|
687
|
+
maxspan=10,
|
|
688
|
+
min_support_fraction=0.3
|
|
689
|
+
)
|
|
690
|
+
gsp = GSP(timestamped_transactions, mingap=1, maxgap=5, maxspan=10, pruning_strategy=pruner)
|
|
691
|
+
result = gsp.search(min_support=0.3)
|
|
692
|
+
```
|
|
693
|
+
|
|
694
|
+
**Use case**: Improves performance for temporal pattern mining by eliminating patterns that cannot satisfy temporal constraints.
|
|
695
|
+
|
|
696
|
+
#### 4. Combined Pruning
|
|
697
|
+
|
|
698
|
+
Combines multiple pruning strategies for aggressive filtering:
|
|
699
|
+
|
|
700
|
+
```python
|
|
701
|
+
from gsppy.pruning import CombinedPruning, SupportBasedPruning, FrequencyBasedPruning
|
|
702
|
+
|
|
703
|
+
# Apply both support and frequency constraints
|
|
704
|
+
strategies = [
|
|
705
|
+
SupportBasedPruning(min_support_fraction=0.3),
|
|
706
|
+
FrequencyBasedPruning(min_frequency=5)
|
|
707
|
+
]
|
|
708
|
+
pruner = CombinedPruning(strategies)
|
|
709
|
+
gsp = GSP(transactions, pruning_strategy=pruner)
|
|
710
|
+
result = gsp.search(min_support=0.3)
|
|
711
|
+
```
|
|
712
|
+
|
|
713
|
+
**Use case**: When you want to combine multiple filtering criteria for more selective pattern discovery.
|
|
714
|
+
|
|
715
|
+
### Custom Pruning Strategies
|
|
716
|
+
|
|
717
|
+
You can create custom pruning strategies by implementing the `PruningStrategy` interface:
|
|
718
|
+
|
|
719
|
+
```python
|
|
720
|
+
from gsppy.pruning import PruningStrategy
|
|
721
|
+
from typing import Dict, Optional, Tuple
|
|
722
|
+
|
|
723
|
+
class MyCustomPruner(PruningStrategy):
|
|
724
|
+
def should_prune(
|
|
725
|
+
self,
|
|
726
|
+
candidate: Tuple[str, ...],
|
|
727
|
+
support_count: int,
|
|
728
|
+
total_transactions: int,
|
|
729
|
+
context: Optional[Dict] = None
|
|
730
|
+
) -> bool:
|
|
731
|
+
# Custom pruning logic
|
|
732
|
+
# Return True to prune (filter out), False to keep
|
|
733
|
+
pattern_length = len(candidate)
|
|
734
|
+
# Example: Prune very long patterns with low support
|
|
735
|
+
if pattern_length > 5 and support_count < 10:
|
|
736
|
+
return True
|
|
737
|
+
return False
|
|
738
|
+
|
|
739
|
+
# Use your custom pruner
|
|
740
|
+
custom_pruner = MyCustomPruner()
|
|
741
|
+
gsp = GSP(transactions, pruning_strategy=custom_pruner)
|
|
742
|
+
result = gsp.search(min_support=0.2)
|
|
743
|
+
```
|
|
744
|
+
|
|
745
|
+
### Performance Characteristics
|
|
746
|
+
|
|
747
|
+
Different pruning strategies have different performance tradeoffs:
|
|
748
|
+
|
|
749
|
+
| Strategy | Pruning Aggressiveness | Use Case | Performance Impact |
|
|
750
|
+
|----------|----------------------|----------|-------------------|
|
|
751
|
+
| **SupportBased** | Moderate | General-purpose mining | Baseline performance |
|
|
752
|
+
| **FrequencyBased** | High (for large datasets) | Require absolute frequency | Faster on large datasets |
|
|
753
|
+
| **TemporalAware** | High (for temporal data) | Time-constrained patterns | Significant speedup for temporal mining |
|
|
754
|
+
| **Combined** | Very High | Selective pattern discovery | Fastest, but may miss edge cases |
|
|
755
|
+
|
|
756
|
+
### Benchmarking Pruning Strategies
|
|
757
|
+
|
|
758
|
+
To compare pruning strategies on your dataset:
|
|
759
|
+
|
|
760
|
+
```bash
|
|
761
|
+
# Compare all strategies
|
|
762
|
+
python benchmarks/bench_pruning.py --n_tx 1000 --vocab 100 --min_support 0.2 --strategy all
|
|
763
|
+
|
|
764
|
+
# Benchmark a specific strategy
|
|
765
|
+
python benchmarks/bench_pruning.py --n_tx 1000 --vocab 100 --min_support 0.2 --strategy frequency
|
|
766
|
+
|
|
767
|
+
# Run multiple rounds for averaging
|
|
768
|
+
python benchmarks/bench_pruning.py --n_tx 1000 --vocab 100 --min_support 0.2 --strategy all --rounds 3
|
|
769
|
+
```
|
|
770
|
+
|
|
771
|
+
See `benchmarks/bench_pruning.py` for the complete benchmarking script.
|
|
772
|
+
|
|
773
|
+
---
|
|
774
|
+
|
|
641
775
|
## ⌨️ Typing
|
|
642
776
|
|
|
643
777
|
`gsppy` ships inline type information (PEP 561) via a bundled `py.typed` marker. The public API is re-exported from
|
|
@@ -651,10 +785,7 @@ larger applications.
|
|
|
651
785
|
|
|
652
786
|
We are actively working to improve GSP-Py. Here are some exciting features planned for future releases:
|
|
653
787
|
|
|
654
|
-
1. **
|
|
655
|
-
- Enable users to define their own pruning logic during the mining process.
|
|
656
|
-
|
|
657
|
-
2. **Support for Preprocessing and Postprocessing**:
|
|
788
|
+
1. **Support for Preprocessing and Postprocessing**:
|
|
658
789
|
- Add hooks to allow users to transform datasets before mining and customize the output results.
|
|
659
790
|
|
|
660
791
|
Want to contribute or suggest an
|
|
@@ -13,6 +13,14 @@ from gsppy.cli import (
|
|
|
13
13
|
read_transactions_from_json,
|
|
14
14
|
)
|
|
15
15
|
from gsppy.gsp import GSP
|
|
16
|
+
from gsppy.pruning import (
|
|
17
|
+
PruningStrategy,
|
|
18
|
+
SupportBasedPruning,
|
|
19
|
+
FrequencyBasedPruning,
|
|
20
|
+
TemporalAwarePruning,
|
|
21
|
+
CombinedPruning,
|
|
22
|
+
create_default_pruning_strategy,
|
|
23
|
+
)
|
|
16
24
|
|
|
17
25
|
try:
|
|
18
26
|
__version__ = importlib_metadata.version("gsppy")
|
|
@@ -26,4 +34,10 @@ __all__ = [
|
|
|
26
34
|
"read_transactions_from_json",
|
|
27
35
|
"setup_logging",
|
|
28
36
|
"__version__",
|
|
37
|
+
"PruningStrategy",
|
|
38
|
+
"SupportBasedPruning",
|
|
39
|
+
"FrequencyBasedPruning",
|
|
40
|
+
"TemporalAwarePruning",
|
|
41
|
+
"CombinedPruning",
|
|
42
|
+
"create_default_pruning_strategy",
|
|
29
43
|
]
|
|
@@ -99,6 +99,7 @@ from gsppy.utils import (
|
|
|
99
99
|
generate_candidates_from_previous,
|
|
100
100
|
is_subsequence_in_list_with_time_constraints,
|
|
101
101
|
)
|
|
102
|
+
from gsppy.pruning import PruningStrategy, create_default_pruning_strategy
|
|
102
103
|
from gsppy.accelerate import support_counts as support_counts_accel
|
|
103
104
|
|
|
104
105
|
logger: logging.Logger = logging.getLogger(__name__)
|
|
@@ -130,6 +131,7 @@ class GSP:
|
|
|
130
131
|
maxgap: Optional[float] = None,
|
|
131
132
|
maxspan: Optional[float] = None,
|
|
132
133
|
verbose: bool = False,
|
|
134
|
+
pruning_strategy: Optional[PruningStrategy] = None,
|
|
133
135
|
):
|
|
134
136
|
"""
|
|
135
137
|
Initialize the GSP algorithm with raw transactional data.
|
|
@@ -144,6 +146,9 @@ class GSP:
|
|
|
144
146
|
maxspan (Optional[float]): Maximum time span from first to last item in patterns.
|
|
145
147
|
verbose (bool): Enable verbose logging output with detailed progress information.
|
|
146
148
|
Default is False (minimal output).
|
|
149
|
+
pruning_strategy (Optional[PruningStrategy]): Custom pruning strategy for candidate filtering.
|
|
150
|
+
If None, a default strategy is created based on
|
|
151
|
+
temporal constraints.
|
|
147
152
|
|
|
148
153
|
Attributes Initialized:
|
|
149
154
|
- Processes the input raw transaction dataset.
|
|
@@ -162,9 +167,18 @@ class GSP:
|
|
|
162
167
|
self.maxgap = maxgap
|
|
163
168
|
self.maxspan = maxspan
|
|
164
169
|
self.verbose = verbose
|
|
170
|
+
self.pruning_strategy: PruningStrategy
|
|
165
171
|
self._configure_logging()
|
|
166
172
|
self._validate_temporal_constraints()
|
|
167
173
|
self._pre_processing(raw_transactions)
|
|
174
|
+
# Initialize default pruning strategy if none provided
|
|
175
|
+
if pruning_strategy is None:
|
|
176
|
+
self.pruning_strategy = create_default_pruning_strategy(
|
|
177
|
+
mingap=self.mingap, maxgap=self.maxgap, maxspan=self.maxspan
|
|
178
|
+
)
|
|
179
|
+
logger.debug("Using default pruning strategy: %s", self.pruning_strategy.get_description())
|
|
180
|
+
else:
|
|
181
|
+
self.pruning_strategy = pruning_strategy
|
|
168
182
|
|
|
169
183
|
def _configure_logging(self) -> None:
|
|
170
184
|
"""
|
|
@@ -389,6 +403,39 @@ class GSP:
|
|
|
389
403
|
# Fallback to Python implementation on any acceleration failure
|
|
390
404
|
return self._support_python(items, min_support, batch_size)
|
|
391
405
|
|
|
406
|
+
def _apply_pruning(
|
|
407
|
+
self, freq_patterns: Dict[Tuple[str, ...], int], min_support_count: int
|
|
408
|
+
) -> Dict[Tuple[str, ...], int]:
|
|
409
|
+
"""
|
|
410
|
+
Apply the configured pruning strategy to filter frequent patterns.
|
|
411
|
+
|
|
412
|
+
This method uses the pruning strategy to post-process patterns that have
|
|
413
|
+
already met the minimum support threshold. Additional pruning can be applied
|
|
414
|
+
based on other criteria such as temporal feasibility or frequency thresholds.
|
|
415
|
+
|
|
416
|
+
Parameters:
|
|
417
|
+
freq_patterns (Dict[Tuple[str, ...], int]): Dictionary of patterns and their support counts.
|
|
418
|
+
min_support_count (int): Absolute minimum support count threshold.
|
|
419
|
+
|
|
420
|
+
Returns:
|
|
421
|
+
Dict[Tuple[str, ...], int]: Filtered patterns after applying pruning strategy.
|
|
422
|
+
"""
|
|
423
|
+
if not freq_patterns:
|
|
424
|
+
return freq_patterns
|
|
425
|
+
|
|
426
|
+
pruned_patterns: Dict[Tuple[str, ...], int] = {}
|
|
427
|
+
context = {"min_support_count": min_support_count}
|
|
428
|
+
|
|
429
|
+
for candidate, support_count in freq_patterns.items():
|
|
430
|
+
if not self.pruning_strategy.should_prune(candidate, support_count, len(self.transactions), context):
|
|
431
|
+
pruned_patterns[candidate] = support_count
|
|
432
|
+
|
|
433
|
+
num_pruned = len(freq_patterns) - len(pruned_patterns)
|
|
434
|
+
if num_pruned > 0:
|
|
435
|
+
logger.debug("Pruning strategy filtered out %d additional candidates", num_pruned)
|
|
436
|
+
|
|
437
|
+
return pruned_patterns
|
|
438
|
+
|
|
392
439
|
def _print_status(self, run: int, candidates: List[Tuple[str, ...]]) -> None:
|
|
393
440
|
"""
|
|
394
441
|
Log progress information for the current GSP iteration.
|
|
@@ -504,7 +551,10 @@ class GSP:
|
|
|
504
551
|
|
|
505
552
|
# scan transactions to collect support count for each candidate
|
|
506
553
|
# sequence & filter
|
|
507
|
-
self.freq_patterns.append(self._support(candidates, abs_min_support, backend=backend))
|
|
554
|
+
freq_1 = self._support(candidates, abs_min_support, backend=backend)
|
|
555
|
+
# Apply pruning strategy for additional filtering
|
|
556
|
+
freq_1 = self._apply_pruning(freq_1, abs_min_support)
|
|
557
|
+
self.freq_patterns.append(freq_1)
|
|
508
558
|
|
|
509
559
|
# (k-itemsets/k-sequence = 1)
|
|
510
560
|
k_items = 1
|
|
@@ -525,7 +575,10 @@ class GSP:
|
|
|
525
575
|
|
|
526
576
|
# candidate pruning - eliminates candidates who are not potentially
|
|
527
577
|
# frequent (using support as threshold)
|
|
528
|
-
self.freq_patterns.append(self._support(candidates, abs_min_support, backend=backend))
|
|
578
|
+
freq_k = self._support(candidates, abs_min_support, backend=backend)
|
|
579
|
+
# Apply pruning strategy for additional filtering
|
|
580
|
+
freq_k = self._apply_pruning(freq_k, abs_min_support)
|
|
581
|
+
self.freq_patterns.append(freq_k)
|
|
529
582
|
|
|
530
583
|
self._print_status(k_items, candidates)
|
|
531
584
|
logger.info("GSP algorithm completed.")
|