gsppy 4.2.0__tar.gz → 5.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gsppy-4.2.0 → gsppy-5.0.0}/CHANGELOG.md +18 -0
- {gsppy-4.2.0 → gsppy-5.0.0}/PKG-INFO +1 -1
- {gsppy-4.2.0 → gsppy-5.0.0}/gsppy/cli.py +129 -18
- {gsppy-4.2.0 → gsppy-5.0.0}/gsppy/gsp.py +267 -48
- {gsppy-4.2.0 → gsppy-5.0.0}/pyproject.toml +1 -1
- gsppy-5.0.0/tests/test_cli_hooks.py +197 -0
- gsppy-5.0.0/tests/test_hooks.py +510 -0
- {gsppy-4.2.0 → gsppy-5.0.0}/.gitignore +0 -0
- {gsppy-4.2.0 → gsppy-5.0.0}/CONTRIBUTING.md +0 -0
- {gsppy-4.2.0 → gsppy-5.0.0}/LICENSE +0 -0
- {gsppy-4.2.0 → gsppy-5.0.0}/README.md +0 -0
- {gsppy-4.2.0 → gsppy-5.0.0}/SECURITY.md +0 -0
- {gsppy-4.2.0 → gsppy-5.0.0}/gsppy/__init__.py +0 -0
- {gsppy-4.2.0 → gsppy-5.0.0}/gsppy/accelerate.py +0 -0
- {gsppy-4.2.0 → gsppy-5.0.0}/gsppy/dataframe_adapters.py +0 -0
- {gsppy-4.2.0 → gsppy-5.0.0}/gsppy/enums.py +0 -0
- {gsppy-4.2.0 → gsppy-5.0.0}/gsppy/pruning.py +0 -0
- {gsppy-4.2.0 → gsppy-5.0.0}/gsppy/py.typed +0 -0
- {gsppy-4.2.0 → gsppy-5.0.0}/gsppy/sequence.py +0 -0
- {gsppy-4.2.0 → gsppy-5.0.0}/gsppy/token_mapper.py +0 -0
- {gsppy-4.2.0 → gsppy-5.0.0}/gsppy/utils.py +0 -0
- {gsppy-4.2.0 → gsppy-5.0.0}/rust/Cargo.lock +0 -0
- {gsppy-4.2.0 → gsppy-5.0.0}/rust/Cargo.toml +0 -0
- {gsppy-4.2.0 → gsppy-5.0.0}/rust/src/lib.rs +0 -0
- {gsppy-4.2.0 → gsppy-5.0.0}/tests/__init__.py +0 -0
- {gsppy-4.2.0 → gsppy-5.0.0}/tests/test_cli.py +0 -0
- {gsppy-4.2.0 → gsppy-5.0.0}/tests/test_dataframe.py +0 -0
- {gsppy-4.2.0 → gsppy-5.0.0}/tests/test_gsp.py +0 -0
- {gsppy-4.2.0 → gsppy-5.0.0}/tests/test_gsp_fuzzing.py +0 -0
- {gsppy-4.2.0 → gsppy-5.0.0}/tests/test_gsp_sequence_integration.py +0 -0
- {gsppy-4.2.0 → gsppy-5.0.0}/tests/test_itemsets.py +0 -0
- {gsppy-4.2.0 → gsppy-5.0.0}/tests/test_pruning.py +0 -0
- {gsppy-4.2.0 → gsppy-5.0.0}/tests/test_sequence.py +0 -0
- {gsppy-4.2.0 → gsppy-5.0.0}/tests/test_spm_format.py +0 -0
- {gsppy-4.2.0 → gsppy-5.0.0}/tests/test_temporal_constraints.py +0 -0
- {gsppy-4.2.0 → gsppy-5.0.0}/tests/test_utils.py +0 -0
- {gsppy-4.2.0 → gsppy-5.0.0}/tox.ini +0 -0
{gsppy-4.2.0 → gsppy-5.0.0}/CHANGELOG.md

````diff
@@ -1,6 +1,24 @@
 # CHANGELOG
 
 
+## v5.0.0 (2026-02-06)
+
+### Chores
+
+- Adds support for optional types in item filters and ignores types in metadata printouts.
+  ([`79111b4`](https://github.com/jacksonpradolima/gsp-py/commit/79111b4c781a65b21a17f85b0b507b41ba6e51f9))
+
+- Update uv.lock for version 4.2.0
+  ([`f8f690f`](https://github.com/jacksonpradolima/gsp-py/commit/f8f690f7f0304dc4331c17c68487fe3411436149))
+
+### Features
+
+- Add preprocessing, postprocessing, and candidate filtering hooks to GSP algorithm
+  ([`495d290`](https://github.com/jacksonpradolima/gsp-py/commit/495d29009abe862bf992831bd276181efa40c99d))
+
+  feat!: add preprocessing, postprocessing, and candidate filtering hooks to GSP algorithm
+
+
 ## v4.2.0 (2026-02-01)
 
 ### Chores
````
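The headline (and breaking) change in v5.0.0 is the hook API on `GSP.search`. As orientation before the diffs below, a minimal sketch of how the three hooks plug in; the keyword parameters come from the `gsppy/gsp.py` diff, while the data and lambdas are illustrative only:

```python
from gsppy.gsp import GSP

transactions = [["A", "B"], ["A", "C"], ["A", "B", "C"]]

gsp = GSP(transactions)
patterns = gsp.search(
    min_support=0.5,
    # called once before mining to transform the transaction list
    preprocess_fn=lambda txs: [tx for tx in txs if len(tx) >= 2],
    # called per candidate after support counting; True means keep
    candidate_filter_fn=lambda cand, support, ctx: len(cand) <= 2,
    # called once on the final list of pattern levels
    postprocess_fn=lambda levels: levels,
)
```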
{gsppy-4.2.0 → gsppy-5.0.0}/PKG-INFO

````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: gsppy
-Version: 4.2.0
+Version: 5.0.0
 Summary: GSP (Generalized Sequence Pattern) algorithm in Python
 Project-URL: Homepage, https://github.com/jacksonpradolima/gsp-py
 Author-email: Jackson Antonio do Prado Lima <jacksonpradolima@gmail.com>
````
{gsppy-4.2.0 → gsppy-5.0.0}/gsppy/cli.py

````diff
@@ -35,7 +35,8 @@ import csv
 import sys
 import json
 import logging
-
+import importlib
+from typing import Any, List, Tuple, Union, Callable, Optional, cast
 
 import click
 
@@ -51,6 +52,54 @@ from gsppy.enums import (
 from gsppy.utils import has_timestamps
 
 
+def _load_hook_function(import_path: str, hook_type: str) -> Callable[..., Any]:
+    """
+    Load a hook function from a Python module import path.
+
+    Parameters:
+        import_path (str): Import path in format 'module.submodule.function_name'
+        hook_type (str): Type of hook for error messages ('preprocess', 'postprocess', 'candidate_filter')
+
+    Returns:
+        Callable: The loaded hook function
+
+    Raises:
+        ValueError: If the import path is invalid or function cannot be loaded
+    """
+    try:
+        # Split into module path and function name
+        parts = import_path.rsplit(".", 1)
+        if len(parts) != 2:
+            raise ValueError(f"Invalid import path format. Expected 'module.function', got '{import_path}'")
+
+        module_name, function_name = parts
+
+        # Import the module
+        module = importlib.import_module(module_name)
+
+        # Get the function from the module
+        if not hasattr(module, function_name):
+            raise ValueError(f"Function '{function_name}' not found in module '{module_name}'")
+
+        hook_fn = getattr(module, function_name)
+
+        # Verify it's callable
+        if not callable(hook_fn):
+            raise ValueError(f"'{import_path}' is not a callable function")
+
+        return hook_fn
+
+    except ImportError as e:
+        # Extract module name from import path for error message
+        module_part = import_path.rsplit(".", 1)[0] if "." in import_path else import_path
+        raise ValueError(f"Failed to import {hook_type} hook module '{module_part}': {e}") from e
+    except ValueError:
+        # Re-raise ValueError as-is
+        raise
+    except Exception as e:
+        raise ValueError(f"Failed to load {hook_type} hook function '{import_path}': {e}") from e
+
+
 def setup_logging(verbose: bool) -> None:
     """
     Configure logging with standardized format based on verbosity level.
````
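For illustration, the loader's behavior on dotted paths, using stdlib callables in place of a user hook module (`_load_hook_function` is private; importing it here is purely for demonstration):

```python
from gsppy.cli import _load_hook_function

# A dotted path resolves to any importable callable.
fn = _load_hook_function("json.loads", "preprocess")
assert fn('{"a": 1}') == {"a": 1}

# All failure modes surface as ValueError mentioning the hook type:
for bad_path in ("loads", "json.nope", "json.encoder"):
    try:
        _load_hook_function(bad_path, "preprocess")
    except ValueError as exc:
        print(exc)  # no dot / missing attribute / not callable
```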
{gsppy-4.2.0 → gsppy-5.0.0}/gsppy/cli.py (continued)

````diff
@@ -515,20 +564,26 @@ def _load_transactions_by_format(
     help="File format to use. 'auto' detects format from file extension.",
 )
 @click.option("--verbose", is_flag=True, help="Enable verbose output for debugging purposes.")
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+@click.option(
+    "--preprocess-hook",
+    type=str,
+    default=None,
+    help="Python import path to preprocessing hook function (e.g., 'mymodule.preprocess_fn').",
+)
+@click.option(
+    "--postprocess-hook",
+    type=str,
+    default=None,
+    help="Python import path to postprocessing hook function (e.g., 'mymodule.postprocess_fn').",
+)
+@click.option(
+    "--candidate-filter-hook",
+    type=str,
+    default=None,
+    help="Python import path to candidate filter hook function (e.g., 'mymodule.filter_fn').",
+)
+@click.pass_context
+def main(ctx: click.Context, **kwargs: Any) -> None:
     """
     Run the GSP algorithm on transactional data from a file.
 
@@ -573,9 +628,59 @@ def main(
     ```bash
     gsppy --file data.txt --format spm --min_support 0.3
     ```
+
+    With custom hooks (requires Python module with hook functions):
+
+    ```bash
+    # Create a hooks module first (hooks.py):
+    # def my_filter(candidate, support, context):
+    #     return len(candidate) <= 2  # Keep only short patterns
+    #
+    # def my_postprocess(patterns):
+    #     return patterns[:2]  # Keep only first 2 levels
+
+    gsppy --file data.json --min_support 0.3 \
+        --candidate-filter-hook hooks.my_filter \
+        --postprocess-hook hooks.my_postprocess
+    ```
     """
+    # Extract parameters from kwargs
+    file_path = kwargs['file_path']
+    min_support = kwargs['min_support']
+    backend = kwargs['backend']
+    mingap = kwargs.get('mingap')
+    maxgap = kwargs.get('maxgap')
+    maxspan = kwargs.get('maxspan')
+    transaction_col = kwargs.get('transaction_col')
+    item_col = kwargs.get('item_col')
+    timestamp_col = kwargs.get('timestamp_col')
+    sequence_col = kwargs.get('sequence_col')
+    file_format = kwargs['format']
+    verbose = kwargs['verbose']
+    preprocess_hook = kwargs.get('preprocess_hook')
+    postprocess_hook = kwargs.get('postprocess_hook')
+    candidate_filter_hook = kwargs.get('candidate_filter_hook')
+
     setup_logging(verbose)
 
+    # Load hook functions if specified
+    try:
+        preprocess_fn = _load_hook_function(preprocess_hook, "preprocess") if preprocess_hook else None
+        postprocess_fn = _load_hook_function(postprocess_hook, "postprocess") if postprocess_hook else None
+        candidate_filter_fn = (
+            _load_hook_function(candidate_filter_hook, "candidate_filter") if candidate_filter_hook else None
+        )
+
+        if preprocess_fn:
+            logger.info(f"Loaded preprocessing hook: {preprocess_hook}")
+        if postprocess_fn:
+            logger.info(f"Loaded postprocessing hook: {postprocess_hook}")
+        if candidate_filter_fn:
+            logger.info(f"Loaded candidate filter hook: {candidate_filter_hook}")
+    except ValueError as e:
+        logger.error(f"Error loading hook function: {e}")
+        sys.exit(1)
+
     # Detect file extension to determine if DataFrame column params are needed
     _, file_extension = os.path.splitext(file_path)
     file_extension = file_extension.lower()
@@ -583,10 +688,10 @@ def main(
 
     # Automatically detect and load transactions
     try:
-
+        file_format_lower = file_format.lower()
         transactions = _load_transactions_by_format(
             file_path,
-
+            file_format_lower,
             file_extension,
             is_dataframe_format,
             transaction_col,
@@ -608,7 +713,13 @@ def main(
     # Initialize and run GSP algorithm
     try:
         gsp = GSP(transactions, mingap=mingap, maxgap=maxgap, maxspan=maxspan, verbose=verbose)
-        patterns = gsp.search(
+        patterns = gsp.search(
+            min_support=min_support,
+            return_sequences=False,
+            preprocess_fn=preprocess_fn,
+            postprocess_fn=postprocess_fn,
+            candidate_filter_fn=candidate_filter_fn,
+        )
         logger.info("Frequent Patterns Found:")
         for i, level in enumerate(patterns, start=1):
             logger.info(f"\n{i}-Sequence Patterns:")
````
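The hooks module referenced in the docstring above is an ordinary Python file importable from `sys.path`. A minimal `hooks.py` matching the commented example (the file and function names are the docstring's illustrative ones, not part of the package):

```python
# hooks.py -- reachable as 'hooks.my_filter' and 'hooks.my_postprocess'


def my_filter(candidate, support, context):
    # Keep only short patterns; candidate is a tuple of items.
    return len(candidate) <= 2


def my_postprocess(patterns):
    # Keep only the first two k-sequence levels.
    return patterns[:2]
```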
{gsppy-4.2.0 → gsppy-5.0.0}/gsppy/gsp.py

````diff
@@ -98,6 +98,7 @@ from typing import (
     Tuple,
     Union,
     Literal,
+    Callable,
     Optional,
     Sequence as TypingSequence,
     cast,
@@ -715,6 +716,56 @@ class GSP:
 
         return pruned_patterns
 
+    def _apply_candidate_filter(
+        self,
+        freq_patterns: Dict[Tuple[str, ...], int],
+        min_support_count: int,
+        k_level: int,
+        candidate_filter_fn: Optional[Callable[[Tuple[str, ...], int, Dict[str, Any]], bool]],
+    ) -> Dict[Tuple[str, ...], int]:
+        """
+        Apply user-provided candidate filter function to patterns.
+
+        This method applies a custom filter function provided by the user to
+        further refine the candidate set. Unlike the pruning strategy which
+        determines what to REMOVE, the filter function determines what to KEEP.
+
+        Parameters:
+            freq_patterns (Dict[Tuple[str, ...], int]): Dictionary of patterns and their support counts.
+            min_support_count (int): Absolute minimum support count threshold.
+            k_level (int): Current k-sequence level.
+            candidate_filter_fn (Optional[Callable]): User-provided filter function.
+
+        Returns:
+            Dict[Tuple[str, ...], int]: Filtered patterns after applying candidate filter.
+        """
+        if candidate_filter_fn is None or not freq_patterns:
+            return freq_patterns
+
+        filtered_patterns: Dict[Tuple[str, ...], int] = {}
+        context = {
+            "min_support_count": min_support_count,
+            "total_transactions": len(self.transactions),
+            "k_level": k_level,
+        }
+
+        for candidate, support_count in freq_patterns.items():
+            try:
+                # Call user-provided filter function
+                # Returns True to KEEP the candidate, False to filter it out
+                if candidate_filter_fn(candidate, support_count, context):
+                    filtered_patterns[candidate] = support_count
+            except Exception as e:
+                error_msg = f"Error in candidate_filter_fn for candidate {candidate}: {e}"
+                logger.error(error_msg)
+                raise RuntimeError(error_msg) from e
+
+        num_filtered = len(freq_patterns) - len(filtered_patterns)
+        if num_filtered > 0:
+            logger.debug("Candidate filter function filtered out %d additional candidates", num_filtered)
+
+        return filtered_patterns
+
     def _print_status(self, run: int, candidates: List[Tuple[str, ...]]) -> None:
         """
         Log progress information for the current GSP iteration.
````
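Two properties of `_apply_candidate_filter` are worth noting: the `context` dict exposes the current mining state to the filter, and any exception a filter raises is re-raised as `RuntimeError` naming the offending candidate. A small sketch with illustrative data:

```python
from gsppy.gsp import GSP

gsp = GSP([["A", "B"], ["A", "C"], ["B", "C"]])

# Consult the context: keep all singletons, otherwise demand support
# strictly above the absolute minimum support count.
def strict(candidate, support, ctx):
    return ctx["k_level"] == 1 or support > ctx["min_support_count"]

patterns = gsp.search(min_support=0.5, candidate_filter_fn=strict)

# A filter that raises is wrapped in RuntimeError by _apply_candidate_filter:
try:
    gsp.search(min_support=0.5, candidate_filter_fn=lambda c, s, ctx: 1 / 0)
except RuntimeError as exc:
    print(exc)  # e.g. "Error in candidate_filter_fn for candidate ('A',): division by zero"
```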
{gsppy-4.2.0 → gsppy-5.0.0}/gsppy/gsp.py (continued)

````diff
@@ -728,6 +779,71 @@ class GSP:
         """
         logger.info("Run %d: %d candidates filtered to %d.", run, len(candidates), len(self.freq_patterns[run - 1]))
 
+    def _apply_preprocess_hook(
+        self, preprocess_fn: Optional[Callable[[Any], Any]]
+    ) -> Tuple[Any, Optional[Tuple[Any, List[Tuple[str, ...]], int]]]:
+        """Apply preprocessing hook and return processed transactions with backup."""
+        transactions_to_use = self.transactions
+        backup_state = None
+
+        if preprocess_fn is not None:
+            try:
+                logger.debug("Applying preprocessing hook...")
+                preprocessed = preprocess_fn(transactions_to_use)
+                if preprocessed is not None:
+                    transactions_to_use = preprocessed
+                logger.debug("Preprocessing hook completed successfully")
+
+                # Backup original state including max_size
+                backup_state = (self.transactions, self.unique_candidates, self.max_size)
+
+                # Update state with preprocessed data
+                self.transactions = transactions_to_use
+                # Recompute unique candidates from preprocessed transactions
+                # Use _extract_items_from_transaction to properly handle timestamped data
+                all_items_list: List[str] = []
+                for transaction in self.transactions:
+                    all_items_list.extend(self._extract_items_from_transaction(transaction))
+
+                # Create singleton candidates from unique items
+                unique_items = set(all_items_list)
+                self.unique_candidates = [(item,) for item in sorted(unique_items)]
+
+                # Recompute max_size based on preprocessed transactions
+                self.max_size = max(len(tx) for tx in self.transactions) if self.transactions else 0
+                logger.debug("Recomputed unique candidates after preprocessing: %d items", len(self.unique_candidates))
+
+            except Exception as e:
+                error_msg = f"Error in preprocess_fn: {e}"
+                logger.error(error_msg)
+                raise RuntimeError(error_msg) from e
+
+        return transactions_to_use, backup_state
+
+    def _apply_postprocess_hook(
+        self, result: Any, postprocess_fn: Optional[Callable[[Any], Any]]
+    ) -> Any:
+        """Apply postprocessing hook to results."""
+        if postprocess_fn is not None:
+            try:
+                logger.debug("Applying postprocessing hook...")
+                postprocessed = postprocess_fn(result)
+                if postprocessed is not None:
+                    result = postprocessed
+                logger.debug("Postprocessing hook completed successfully")
+            except Exception as e:
+                error_msg = f"Error in postprocess_fn: {e}"
+                logger.error(error_msg)
+                raise RuntimeError(error_msg) from e
+        return result
+
+    def _restore_preprocessing_state(
+        self, backup_state: Optional[Tuple[Any, List[Tuple[str, ...]], int]]
+    ) -> None:
+        """Restore original state after preprocessing."""
+        if backup_state is not None:
+            self.transactions, self.unique_candidates, self.max_size = backup_state
+
     @overload
     def search(
         self,
````
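The contract these helpers establish: a hook returning `None` is treated as a no-op (the original data is kept), the instance state (`transactions`, `unique_candidates`, `max_size`) is swapped out for the duration of the search, and the backup tuple restores it afterwards. A sketch with illustrative data:

```python
from gsppy.gsp import GSP

gsp = GSP([["a", "b"], ["a", "b", "c"], ["a", "c"]])

# Uppercase every item before mining; returning a new list leaves the
# caller's data untouched, and instance state is restored afterwards.
normalize = lambda txs: [[item.upper() for item in tx] for tx in txs]

patterns = gsp.search(min_support=0.5, preprocess_fn=normalize)
print(patterns[0])          # singleton counts keyed by ('A',), ('B',), ...
print(gsp.transactions[0])  # ['a', 'b'] -- original state restored
```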
{gsppy-4.2.0 → gsppy-5.0.0}/gsppy/gsp.py (continued)

````diff
@@ -737,6 +853,9 @@ class GSP:
         verbose: Optional[bool] = None,
         *,
         return_sequences: Literal[False] = False,
+        preprocess_fn: Optional[Callable[[Any], Any]] = None,
+        postprocess_fn: Optional[Callable[[Any], Any]] = None,
+        candidate_filter_fn: Optional[Callable[[Tuple[str, ...], int, Dict[str, Any]], bool]] = None,
     ) -> List[Dict[Tuple[str, ...], int]]: ...
 
     @overload
@@ -748,6 +867,9 @@ class GSP:
         verbose: Optional[bool] = None,
         *,
         return_sequences: Literal[True],
+        preprocess_fn: Optional[Callable[[Any], Any]] = None,
+        postprocess_fn: Optional[Callable[[Any], Any]] = None,
+        candidate_filter_fn: Optional[Callable[[Tuple[str, ...], int, Dict[str, Any]], bool]] = None,
     ) -> List[List[Sequence]]: ...
 
     def search(
@@ -758,6 +880,9 @@ class GSP:
         verbose: Optional[bool] = None,
         *,
         return_sequences: bool = False,
+        preprocess_fn: Optional[Callable[[Any], Any]] = None,
+        postprocess_fn: Optional[Callable[[Any], Any]] = None,
+        candidate_filter_fn: Optional[Callable[[Tuple[str, ...], int, Dict[str, Any]], bool]] = None,
     ) -> Union[List[Dict[Tuple[str, ...], int]], List[List[Sequence]]]:
         if backend is not None:
             logger.debug("Backend parameter is currently unused: %s", backend)
@@ -786,6 +911,28 @@ class GSP:
                 compatibility. When True, returns List[List[Sequence]] where
                 each Sequence contains items, support count, and can be extended
                 with additional metadata.
+            preprocess_fn (Optional[Callable[[Any], Any]]): Custom preprocessing hook function.
+                Called once at the beginning of the search to transform transactions.
+                Receives the transaction list and should return a modified transaction list.
+                Useful for data cleaning, filtering, or transformation before mining.
+                Default is None (no preprocessing).
+            postprocess_fn (Optional[Callable[[Any], Any]]): Custom postprocessing hook function.
+                Called once at the end of the search to transform discovered patterns.
+                Receives the pattern list and should return a modified pattern list.
+                Useful for filtering, aggregating, or enriching results.
+                Default is None (no postprocessing).
+            candidate_filter_fn (Optional[Callable[[Tuple[str, ...], int, Dict[str, Any]], bool]]):
+                Custom candidate filtering function for dynamic pruning at runtime.
+                Called for each candidate after support counting to determine if it
+                should be kept. Receives:
+                - candidate: The candidate sequence as a tuple of strings
+                - support_count: The support count of the candidate
+                - context: A dictionary with additional information (e.g., 'min_support_count',
+                  'total_transactions', 'k_level')
+                Should return True to KEEP the candidate, False to filter it out.
+                Complements the pruning_strategy but operates at a different level.
+                Supports lambda expressions and arbitrary callables.
+                Default is None (no additional filtering).
 
         Returns:
             Union[List[Dict[Tuple[str, ...], int]], List[List[Sequence]]]:
@@ -798,6 +945,8 @@ class GSP:
 
         Raises:
             ValueError: If the minimum support threshold is not in the range `(0.0, 1.0]`.
+            Exception: If any user-provided hook function raises an exception, it will be
+                propagated with additional context about where it occurred.
 
         Logs:
             - Information about the algorithm's start, intermediate progress (candidates filtered),
@@ -842,6 +991,60 @@ class GSP:
             print(f"Pattern: {seq.items}, Support: {seq.support}")
         ```
 
+        Using custom hooks with lambda expressions:
+
+        ```python
+        from gsppy.gsp import GSP
+
+        transactions = [
+            ["A", "B", "C"],
+            ["A", "C", "D"],
+            ["B", "C", "E"],
+        ]
+
+        # Preprocessing: Convert all items to uppercase in each itemset
+        preprocess = lambda txs: [
+            [tuple(item.upper() if isinstance(item, str) else item for item in itemset) for itemset in tx]
+            for tx in txs
+        ]
+
+        # Candidate filtering: Only keep patterns with length <= 2
+        filter_fn = lambda candidate, support, ctx: len(candidate) <= 2
+
+        # Postprocessing: Add metadata to each level
+        postprocess = lambda patterns: [
+            {k: v for k, v in level.items() if v > 1} for level in patterns
+        ]
+
+        gsp = GSP(transactions)
+        patterns = gsp.search(
+            min_support=0.3,
+            preprocess_fn=preprocess,
+            candidate_filter_fn=filter_fn,
+            postprocess_fn=postprocess
+        )
+        ```
+
+        Advanced candidate filtering with context:
+
+        ```python
+        from gsppy.gsp import GSP
+
+        transactions = [["A", "B", "C"], ["A", "C", "D"], ["B", "C", "E"]]
+
+        # Filter candidates based on both support and pattern characteristics
+        def custom_filter(candidate, support_count, context):
+            # Keep candidates with high support or specific patterns
+            min_support = context.get('min_support_count', 0)
+            if support_count >= min_support * 1.5:  # 50% above minimum
+                return True
+            # Or keep patterns starting with 'A'
+            return candidate[0] == 'A' if candidate else False
+
+        gsp = GSP(transactions)
+        patterns = gsp.search(min_support=0.3, candidate_filter_fn=custom_filter)
+        ```
+
         Usage with temporal constraints (requires timestamped transactions):
 
         ```python
@@ -877,59 +1080,75 @@ class GSP:
                 f"Using temporal constraints: mingap={self.mingap}, maxgap={self.maxgap}, maxspan={self.maxspan}"
             )
 
-        #
-
-
-        # Convert fractional support to absolute count (ceil to preserve threshold semantics)
-        abs_min_support = int(math.ceil(len(self.transactions) * float(min_support)))
-
-        # the set of frequent 1-sequence: all singleton sequences
-        # (k-itemsets/k-sequence = 1) - Initially, every item in DB is a
-        # candidate
-        candidates = self.unique_candidates
-
-        # scan transactions to collect support count for each candidate
-        # sequence & filter
-        freq_1 = self._support(candidates, abs_min_support)
-        # Apply pruning strategy for additional filtering
-        freq_1 = self._apply_pruning(freq_1, abs_min_support)
-        self.freq_patterns.append(freq_1)
+        # Apply preprocessing hook and backup state
+        transactions_to_use, backup_state = self._apply_preprocess_hook(preprocess_fn)
 
-        #
-
-
-
+        # Ensure state is restored even if errors occur during mining
+        try:
+            # Clear freq_patterns for this search (allow reusing the GSP instance)
+            self.freq_patterns = []
 
-
-
-        while (
-            self.freq_patterns[k_items - 1] and k_items + 1 <= self.max_size and (max_k is None or k_items + 1 <= max_k)
-        ):
-            k_items += 1
+            # Convert fractional support to absolute count (ceil to preserve threshold semantics)
+            abs_min_support = int(math.ceil(len(transactions_to_use) * float(min_support)))
 
-        #
-        #
-        #
-        candidates =
+            # the set of frequent 1-sequence: all singleton sequences
+            # (k-itemsets/k-sequence = 1) - Initially, every item in DB is a
+            # candidate
+            candidates = self.unique_candidates
 
-        #
-        #
-
+            # scan transactions to collect support count for each candidate
+            # sequence & filter
+            freq_1 = self._support(candidates, abs_min_support)
             # Apply pruning strategy for additional filtering
-
-
+            freq_1 = self._apply_pruning(freq_1, abs_min_support)
+            # Apply user-provided candidate filter if specified
+            freq_1 = self._apply_candidate_filter(freq_1, abs_min_support, 1, candidate_filter_fn)
+            self.freq_patterns.append(freq_1)
 
-
-
+            # (k-itemsets/k-sequence = 1)
+            k_items = 1
 
-
-        if verbose is not None:
-            self.verbose = original_verbose
-            self._configure_logging()
+            self._print_status(k_items, candidates)
 
-
-
-
-
-
-
+            # repeat until no frequent sequence or no candidate can be found
+            # If max_k is provided, stop generating candidates beyond that length
+            while (
+                self.freq_patterns[k_items - 1] and k_items + 1 <= self.max_size and (max_k is None or k_items + 1 <= max_k)
+            ):
+                k_items += 1
+
+                # Generate candidate sets Ck (set of candidate k-sequences) -
+                # generate new candidates from the last "best" candidates filtered
+                # by minimum support
+                candidates = generate_candidates_from_previous(self.freq_patterns[k_items - 2])
+
+                # candidate pruning - eliminates candidates who are not potentially
+                # frequent (using support as threshold)
+                freq_k = self._support(candidates, abs_min_support)
+                # Apply pruning strategy for additional filtering
+                freq_k = self._apply_pruning(freq_k, abs_min_support)
+                # Apply user-provided candidate filter if specified
+                freq_k = self._apply_candidate_filter(freq_k, abs_min_support, k_items, candidate_filter_fn)
+                self.freq_patterns.append(freq_k)
+
+                self._print_status(k_items, candidates)
+            logger.info("GSP algorithm completed.")
+
+            # Return results in the requested format
+            result = self.freq_patterns[:-1]
+
+            # Apply postprocessing hook
+            result = self._apply_postprocess_hook(result, postprocess_fn)
+
+            if return_sequences:
+                # Convert Dict[Tuple[str, ...], int] to List[Sequence] for each level
+                return [dict_to_sequences(level_patterns) for level_patterns in result]
+            return result
+        finally:
+            # Always restore original state if preprocessing was applied, even on exceptions
+            self._restore_preprocessing_state(backup_state)
+
+            # Restore original verbosity if it was overridden
+            if verbose is not None:
+                self.verbose = original_verbose
+                self._configure_logging()
````
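One detail in the rewritten body worth calling out: the fractional threshold is converted with `math.ceil`, so a candidate must occur in at least ⌈n · min_support⌉ transactions; plain truncation would quietly lower the bar. A quick check of the arithmetic:

```python
import math

n_transactions = 7
min_support = 0.3

# ceil(7 * 0.3) = ceil(2.1) = 3: a candidate needs support in at least
# 3 of the 7 transactions, not the 2 that int() truncation would allow.
abs_min_support = int(math.ceil(n_transactions * float(min_support)))
print(abs_min_support)  # 3
```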
{gsppy-4.2.0 → gsppy-5.0.0}/pyproject.toml

````diff
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "gsppy"
-version = "4.2.0"
+version = "5.0.0"
 description = "GSP (Generalized Sequence Pattern) algorithm in Python"
 keywords = ["GSP", "sequential patterns", "data analysis", "sequence mining"]
 license = { file = "LICENSE" }
````