gsppy 4.2.0__tar.gz → 5.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. {gsppy-4.2.0 → gsppy-5.0.0}/CHANGELOG.md +18 -0
  2. {gsppy-4.2.0 → gsppy-5.0.0}/PKG-INFO +1 -1
  3. {gsppy-4.2.0 → gsppy-5.0.0}/gsppy/cli.py +129 -18
  4. {gsppy-4.2.0 → gsppy-5.0.0}/gsppy/gsp.py +267 -48
  5. {gsppy-4.2.0 → gsppy-5.0.0}/pyproject.toml +1 -1
  6. gsppy-5.0.0/tests/test_cli_hooks.py +197 -0
  7. gsppy-5.0.0/tests/test_hooks.py +510 -0
  8. {gsppy-4.2.0 → gsppy-5.0.0}/.gitignore +0 -0
  9. {gsppy-4.2.0 → gsppy-5.0.0}/CONTRIBUTING.md +0 -0
  10. {gsppy-4.2.0 → gsppy-5.0.0}/LICENSE +0 -0
  11. {gsppy-4.2.0 → gsppy-5.0.0}/README.md +0 -0
  12. {gsppy-4.2.0 → gsppy-5.0.0}/SECURITY.md +0 -0
  13. {gsppy-4.2.0 → gsppy-5.0.0}/gsppy/__init__.py +0 -0
  14. {gsppy-4.2.0 → gsppy-5.0.0}/gsppy/accelerate.py +0 -0
  15. {gsppy-4.2.0 → gsppy-5.0.0}/gsppy/dataframe_adapters.py +0 -0
  16. {gsppy-4.2.0 → gsppy-5.0.0}/gsppy/enums.py +0 -0
  17. {gsppy-4.2.0 → gsppy-5.0.0}/gsppy/pruning.py +0 -0
  18. {gsppy-4.2.0 → gsppy-5.0.0}/gsppy/py.typed +0 -0
  19. {gsppy-4.2.0 → gsppy-5.0.0}/gsppy/sequence.py +0 -0
  20. {gsppy-4.2.0 → gsppy-5.0.0}/gsppy/token_mapper.py +0 -0
  21. {gsppy-4.2.0 → gsppy-5.0.0}/gsppy/utils.py +0 -0
  22. {gsppy-4.2.0 → gsppy-5.0.0}/rust/Cargo.lock +0 -0
  23. {gsppy-4.2.0 → gsppy-5.0.0}/rust/Cargo.toml +0 -0
  24. {gsppy-4.2.0 → gsppy-5.0.0}/rust/src/lib.rs +0 -0
  25. {gsppy-4.2.0 → gsppy-5.0.0}/tests/__init__.py +0 -0
  26. {gsppy-4.2.0 → gsppy-5.0.0}/tests/test_cli.py +0 -0
  27. {gsppy-4.2.0 → gsppy-5.0.0}/tests/test_dataframe.py +0 -0
  28. {gsppy-4.2.0 → gsppy-5.0.0}/tests/test_gsp.py +0 -0
  29. {gsppy-4.2.0 → gsppy-5.0.0}/tests/test_gsp_fuzzing.py +0 -0
  30. {gsppy-4.2.0 → gsppy-5.0.0}/tests/test_gsp_sequence_integration.py +0 -0
  31. {gsppy-4.2.0 → gsppy-5.0.0}/tests/test_itemsets.py +0 -0
  32. {gsppy-4.2.0 → gsppy-5.0.0}/tests/test_pruning.py +0 -0
  33. {gsppy-4.2.0 → gsppy-5.0.0}/tests/test_sequence.py +0 -0
  34. {gsppy-4.2.0 → gsppy-5.0.0}/tests/test_spm_format.py +0 -0
  35. {gsppy-4.2.0 → gsppy-5.0.0}/tests/test_temporal_constraints.py +0 -0
  36. {gsppy-4.2.0 → gsppy-5.0.0}/tests/test_utils.py +0 -0
  37. {gsppy-4.2.0 → gsppy-5.0.0}/tox.ini +0 -0
@@ -1,6 +1,24 @@
1
1
  # CHANGELOG
2
2
 
3
3
 
4
+ ## v5.0.0 (2026-02-06)
5
+
6
+ ### Chores
7
+
8
+ - Adds support for optional types in item filters and ignores types in metadata printouts.
9
+ ([`79111b4`](https://github.com/jacksonpradolima/gsp-py/commit/79111b4c781a65b21a17f85b0b507b41ba6e51f9))
10
+
11
+ - Update uv.lock for version 4.2.0
12
+ ([`f8f690f`](https://github.com/jacksonpradolima/gsp-py/commit/f8f690f7f0304dc4331c17c68487fe3411436149))
13
+
14
+ ### Features
15
+
16
+ - Add preprocessing, postprocessing, and candidate filtering hooks to GSP algorithm
17
+ ([`495d290`](https://github.com/jacksonpradolima/gsp-py/commit/495d29009abe862bf992831bd276181efa40c99d))
18
+
19
+ feat!: add preprocessing, postprocessing, and candidate filtering hooks to GSP algorithm
20
+
21
+
4
22
  ## v4.2.0 (2026-02-01)
5
23
 
6
24
  ### Chores
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gsppy
3
- Version: 4.2.0
3
+ Version: 5.0.0
4
4
  Summary: GSP (Generalized Sequence Pattern) algorithm in Python
5
5
  Project-URL: Homepage, https://github.com/jacksonpradolima/gsp-py
6
6
  Author-email: Jackson Antonio do Prado Lima <jacksonpradolima@gmail.com>
@@ -35,7 +35,8 @@ import csv
35
35
  import sys
36
36
  import json
37
37
  import logging
38
- from typing import Any, List, Tuple, Union, Optional, cast
38
+ import importlib
39
+ from typing import Any, List, Tuple, Union, Callable, Optional, cast
39
40
 
40
41
  import click
41
42
 
@@ -51,6 +52,54 @@ from gsppy.enums import (
51
52
  from gsppy.utils import has_timestamps
52
53
 
53
54
 
55
+ def _load_hook_function(import_path: str, hook_type: str) -> Callable[..., Any]:
56
+ """
57
+ Load a hook function from a Python module import path.
58
+
59
+ Parameters:
60
+ import_path (str): Import path in format 'module.submodule.function_name'
61
+ hook_type (str): Type of hook for error messages ('preprocess', 'postprocess', 'candidate_filter')
62
+
63
+ Returns:
64
+ Callable: The loaded hook function
65
+
66
+ Raises:
67
+ ValueError: If the import path is invalid or function cannot be loaded
68
+ """
69
+ try:
70
+ # Split into module path and function name
71
+ parts = import_path.rsplit(".", 1)
72
+ if len(parts) != 2:
73
+ raise ValueError(f"Invalid import path format. Expected 'module.function', got '{import_path}'")
74
+
75
+ module_name, function_name = parts
76
+
77
+ # Import the module
78
+ module = importlib.import_module(module_name)
79
+
80
+ # Get the function from the module
81
+ if not hasattr(module, function_name):
82
+ raise ValueError(f"Function '{function_name}' not found in module '{module_name}'")
83
+
84
+ hook_fn = getattr(module, function_name)
85
+
86
+ # Verify it's callable
87
+ if not callable(hook_fn):
88
+ raise ValueError(f"'{import_path}' is not a callable function")
89
+
90
+ return hook_fn
91
+
92
+ except ImportError as e:
93
+ # Extract module name from import path for error message
94
+ module_part = import_path.rsplit(".", 1)[0] if "." in import_path else import_path
95
+ raise ValueError(f"Failed to import {hook_type} hook module '{module_part}': {e}") from e
96
+ except ValueError:
97
+ # Re-raise ValueError as-is
98
+ raise
99
+ except Exception as e:
100
+ raise ValueError(f"Failed to load {hook_type} hook function '{import_path}': {e}") from e
101
+
102
+
54
103
  def setup_logging(verbose: bool) -> None:
55
104
  """
56
105
  Configure logging with standardized format based on verbosity level.
@@ -515,20 +564,26 @@ def _load_transactions_by_format(
515
564
  help="File format to use. 'auto' detects format from file extension.",
516
565
  )
517
566
  @click.option("--verbose", is_flag=True, help="Enable verbose output for debugging purposes.")
518
- def main(
519
- file_path: str,
520
- min_support: float,
521
- backend: str,
522
- mingap: Optional[float],
523
- maxgap: Optional[float],
524
- maxspan: Optional[float],
525
- transaction_col: Optional[str],
526
- item_col: Optional[str],
527
- timestamp_col: Optional[str],
528
- sequence_col: Optional[str],
529
- format: str, # noqa: A002
530
- verbose: bool,
531
- ) -> None:
567
+ @click.option(
568
+ "--preprocess-hook",
569
+ type=str,
570
+ default=None,
571
+ help="Python import path to preprocessing hook function (e.g., 'mymodule.preprocess_fn').",
572
+ )
573
+ @click.option(
574
+ "--postprocess-hook",
575
+ type=str,
576
+ default=None,
577
+ help="Python import path to postprocessing hook function (e.g., 'mymodule.postprocess_fn').",
578
+ )
579
+ @click.option(
580
+ "--candidate-filter-hook",
581
+ type=str,
582
+ default=None,
583
+ help="Python import path to candidate filter hook function (e.g., 'mymodule.filter_fn').",
584
+ )
585
+ @click.pass_context
586
+ def main(ctx: click.Context, **kwargs: Any) -> None:
532
587
  """
533
588
  Run the GSP algorithm on transactional data from a file.
534
589
 
@@ -573,9 +628,59 @@ def main(
573
628
  ```bash
574
629
  gsppy --file data.txt --format spm --min_support 0.3
575
630
  ```
631
+
632
+ With custom hooks (requires Python module with hook functions):
633
+
634
+ ```bash
635
+ # Create a hooks module first (hooks.py):
636
+ # def my_filter(candidate, support, context):
637
+ # return len(candidate) <= 2 # Keep only short patterns
638
+ #
639
+ # def my_postprocess(patterns):
640
+ # return patterns[:2] # Keep only first 2 levels
641
+
642
+ gsppy --file data.json --min_support 0.3 \
643
+ --candidate-filter-hook hooks.my_filter \
644
+ --postprocess-hook hooks.my_postprocess
645
+ ```
576
646
  """
647
+ # Extract parameters from kwargs
648
+ file_path = kwargs['file_path']
649
+ min_support = kwargs['min_support']
650
+ backend = kwargs['backend']
651
+ mingap = kwargs.get('mingap')
652
+ maxgap = kwargs.get('maxgap')
653
+ maxspan = kwargs.get('maxspan')
654
+ transaction_col = kwargs.get('transaction_col')
655
+ item_col = kwargs.get('item_col')
656
+ timestamp_col = kwargs.get('timestamp_col')
657
+ sequence_col = kwargs.get('sequence_col')
658
+ file_format = kwargs['format']
659
+ verbose = kwargs['verbose']
660
+ preprocess_hook = kwargs.get('preprocess_hook')
661
+ postprocess_hook = kwargs.get('postprocess_hook')
662
+ candidate_filter_hook = kwargs.get('candidate_filter_hook')
663
+
577
664
  setup_logging(verbose)
578
665
 
666
+ # Load hook functions if specified
667
+ try:
668
+ preprocess_fn = _load_hook_function(preprocess_hook, "preprocess") if preprocess_hook else None
669
+ postprocess_fn = _load_hook_function(postprocess_hook, "postprocess") if postprocess_hook else None
670
+ candidate_filter_fn = (
671
+ _load_hook_function(candidate_filter_hook, "candidate_filter") if candidate_filter_hook else None
672
+ )
673
+
674
+ if preprocess_fn:
675
+ logger.info(f"Loaded preprocessing hook: {preprocess_hook}")
676
+ if postprocess_fn:
677
+ logger.info(f"Loaded postprocessing hook: {postprocess_hook}")
678
+ if candidate_filter_fn:
679
+ logger.info(f"Loaded candidate filter hook: {candidate_filter_hook}")
680
+ except ValueError as e:
681
+ logger.error(f"Error loading hook function: {e}")
682
+ sys.exit(1)
683
+
579
684
  # Detect file extension to determine if DataFrame column params are needed
580
685
  _, file_extension = os.path.splitext(file_path)
581
686
  file_extension = file_extension.lower()
@@ -583,10 +688,10 @@ def main(
583
688
 
584
689
  # Automatically detect and load transactions
585
690
  try:
586
- file_format = format.lower()
691
+ file_format_lower = file_format.lower()
587
692
  transactions = _load_transactions_by_format(
588
693
  file_path,
589
- file_format,
694
+ file_format_lower,
590
695
  file_extension,
591
696
  is_dataframe_format,
592
697
  transaction_col,
@@ -608,7 +713,13 @@ def main(
608
713
  # Initialize and run GSP algorithm
609
714
  try:
610
715
  gsp = GSP(transactions, mingap=mingap, maxgap=maxgap, maxspan=maxspan, verbose=verbose)
611
- patterns = gsp.search(min_support=min_support, return_sequences=False)
716
+ patterns = gsp.search(
717
+ min_support=min_support,
718
+ return_sequences=False,
719
+ preprocess_fn=preprocess_fn,
720
+ postprocess_fn=postprocess_fn,
721
+ candidate_filter_fn=candidate_filter_fn,
722
+ )
612
723
  logger.info("Frequent Patterns Found:")
613
724
  for i, level in enumerate(patterns, start=1):
614
725
  logger.info(f"\n{i}-Sequence Patterns:")
@@ -98,6 +98,7 @@ from typing import (
98
98
  Tuple,
99
99
  Union,
100
100
  Literal,
101
+ Callable,
101
102
  Optional,
102
103
  Sequence as TypingSequence,
103
104
  cast,
@@ -715,6 +716,56 @@ class GSP:
715
716
 
716
717
  return pruned_patterns
717
718
 
719
+ def _apply_candidate_filter(
720
+ self,
721
+ freq_patterns: Dict[Tuple[str, ...], int],
722
+ min_support_count: int,
723
+ k_level: int,
724
+ candidate_filter_fn: Optional[Callable[[Tuple[str, ...], int, Dict[str, Any]], bool]],
725
+ ) -> Dict[Tuple[str, ...], int]:
726
+ """
727
+ Apply user-provided candidate filter function to patterns.
728
+
729
+ This method applies a custom filter function provided by the user to
730
+ further refine the candidate set. Unlike the pruning strategy which
731
+ determines what to REMOVE, the filter function determines what to KEEP.
732
+
733
+ Parameters:
734
+ freq_patterns (Dict[Tuple[str, ...], int]): Dictionary of patterns and their support counts.
735
+ min_support_count (int): Absolute minimum support count threshold.
736
+ k_level (int): Current k-sequence level.
737
+ candidate_filter_fn (Optional[Callable]): User-provided filter function.
738
+
739
+ Returns:
740
+ Dict[Tuple[str, ...], int]: Filtered patterns after applying candidate filter.
741
+ """
742
+ if candidate_filter_fn is None or not freq_patterns:
743
+ return freq_patterns
744
+
745
+ filtered_patterns: Dict[Tuple[str, ...], int] = {}
746
+ context = {
747
+ "min_support_count": min_support_count,
748
+ "total_transactions": len(self.transactions),
749
+ "k_level": k_level,
750
+ }
751
+
752
+ for candidate, support_count in freq_patterns.items():
753
+ try:
754
+ # Call user-provided filter function
755
+ # Returns True to KEEP the candidate, False to filter it out
756
+ if candidate_filter_fn(candidate, support_count, context):
757
+ filtered_patterns[candidate] = support_count
758
+ except Exception as e:
759
+ error_msg = f"Error in candidate_filter_fn for candidate {candidate}: {e}"
760
+ logger.error(error_msg)
761
+ raise RuntimeError(error_msg) from e
762
+
763
+ num_filtered = len(freq_patterns) - len(filtered_patterns)
764
+ if num_filtered > 0:
765
+ logger.debug("Candidate filter function filtered out %d additional candidates", num_filtered)
766
+
767
+ return filtered_patterns
768
+
718
769
  def _print_status(self, run: int, candidates: List[Tuple[str, ...]]) -> None:
719
770
  """
720
771
  Log progress information for the current GSP iteration.
@@ -728,6 +779,71 @@ class GSP:
728
779
  """
729
780
  logger.info("Run %d: %d candidates filtered to %d.", run, len(candidates), len(self.freq_patterns[run - 1]))
730
781
 
782
+ def _apply_preprocess_hook(
783
+ self, preprocess_fn: Optional[Callable[[Any], Any]]
784
+ ) -> Tuple[Any, Optional[Tuple[Any, List[Tuple[str, ...]], int]]]:
785
+ """Apply preprocessing hook and return processed transactions with backup."""
786
+ transactions_to_use = self.transactions
787
+ backup_state = None
788
+
789
+ if preprocess_fn is not None:
790
+ try:
791
+ logger.debug("Applying preprocessing hook...")
792
+ preprocessed = preprocess_fn(transactions_to_use)
793
+ if preprocessed is not None:
794
+ transactions_to_use = preprocessed
795
+ logger.debug("Preprocessing hook completed successfully")
796
+
797
+ # Backup original state including max_size
798
+ backup_state = (self.transactions, self.unique_candidates, self.max_size)
799
+
800
+ # Update state with preprocessed data
801
+ self.transactions = transactions_to_use
802
+ # Recompute unique candidates from preprocessed transactions
803
+ # Use _extract_items_from_transaction to properly handle timestamped data
804
+ all_items_list: List[str] = []
805
+ for transaction in self.transactions:
806
+ all_items_list.extend(self._extract_items_from_transaction(transaction))
807
+
808
+ # Create singleton candidates from unique items
809
+ unique_items = set(all_items_list)
810
+ self.unique_candidates = [(item,) for item in sorted(unique_items)]
811
+
812
+ # Recompute max_size based on preprocessed transactions
813
+ self.max_size = max(len(tx) for tx in self.transactions) if self.transactions else 0
814
+ logger.debug("Recomputed unique candidates after preprocessing: %d items", len(self.unique_candidates))
815
+
816
+ except Exception as e:
817
+ error_msg = f"Error in preprocess_fn: {e}"
818
+ logger.error(error_msg)
819
+ raise RuntimeError(error_msg) from e
820
+
821
+ return transactions_to_use, backup_state
822
+
823
+ def _apply_postprocess_hook(
824
+ self, result: Any, postprocess_fn: Optional[Callable[[Any], Any]]
825
+ ) -> Any:
826
+ """Apply postprocessing hook to results."""
827
+ if postprocess_fn is not None:
828
+ try:
829
+ logger.debug("Applying postprocessing hook...")
830
+ postprocessed = postprocess_fn(result)
831
+ if postprocessed is not None:
832
+ result = postprocessed
833
+ logger.debug("Postprocessing hook completed successfully")
834
+ except Exception as e:
835
+ error_msg = f"Error in postprocess_fn: {e}"
836
+ logger.error(error_msg)
837
+ raise RuntimeError(error_msg) from e
838
+ return result
839
+
840
+ def _restore_preprocessing_state(
841
+ self, backup_state: Optional[Tuple[Any, List[Tuple[str, ...]], int]]
842
+ ) -> None:
843
+ """Restore original state after preprocessing."""
844
+ if backup_state is not None:
845
+ self.transactions, self.unique_candidates, self.max_size = backup_state
846
+
731
847
  @overload
732
848
  def search(
733
849
  self,
@@ -737,6 +853,9 @@ class GSP:
737
853
  verbose: Optional[bool] = None,
738
854
  *,
739
855
  return_sequences: Literal[False] = False,
856
+ preprocess_fn: Optional[Callable[[Any], Any]] = None,
857
+ postprocess_fn: Optional[Callable[[Any], Any]] = None,
858
+ candidate_filter_fn: Optional[Callable[[Tuple[str, ...], int, Dict[str, Any]], bool]] = None,
740
859
  ) -> List[Dict[Tuple[str, ...], int]]: ...
741
860
 
742
861
  @overload
@@ -748,6 +867,9 @@ class GSP:
748
867
  verbose: Optional[bool] = None,
749
868
  *,
750
869
  return_sequences: Literal[True],
870
+ preprocess_fn: Optional[Callable[[Any], Any]] = None,
871
+ postprocess_fn: Optional[Callable[[Any], Any]] = None,
872
+ candidate_filter_fn: Optional[Callable[[Tuple[str, ...], int, Dict[str, Any]], bool]] = None,
751
873
  ) -> List[List[Sequence]]: ...
752
874
 
753
875
  def search(
@@ -758,6 +880,9 @@ class GSP:
758
880
  verbose: Optional[bool] = None,
759
881
  *,
760
882
  return_sequences: bool = False,
883
+ preprocess_fn: Optional[Callable[[Any], Any]] = None,
884
+ postprocess_fn: Optional[Callable[[Any], Any]] = None,
885
+ candidate_filter_fn: Optional[Callable[[Tuple[str, ...], int, Dict[str, Any]], bool]] = None,
761
886
  ) -> Union[List[Dict[Tuple[str, ...], int]], List[List[Sequence]]]:
762
887
  if backend is not None:
763
888
  logger.debug("Backend parameter is currently unused: %s", backend)
@@ -786,6 +911,28 @@ class GSP:
786
911
  compatibility. When True, returns List[List[Sequence]] where
787
912
  each Sequence contains items, support count, and can be extended
788
913
  with additional metadata.
914
+ preprocess_fn (Optional[Callable[[Any], Any]]): Custom preprocessing hook function.
915
+ Called once at the beginning of the search to transform transactions.
916
+ Receives the transaction list and should return a modified transaction list.
917
+ Useful for data cleaning, filtering, or transformation before mining.
918
+ Default is None (no preprocessing).
919
+ postprocess_fn (Optional[Callable[[Any], Any]]): Custom postprocessing hook function.
920
+ Called once at the end of the search to transform discovered patterns.
921
+ Receives the pattern list and should return a modified pattern list.
922
+ Useful for filtering, aggregating, or enriching results.
923
+ Default is None (no postprocessing).
924
+ candidate_filter_fn (Optional[Callable[[Tuple[str, ...], int, Dict[str, Any]], bool]]):
925
+ Custom candidate filtering function for dynamic pruning at runtime.
926
+ Called for each candidate after support counting to determine if it
927
+ should be kept. Receives:
928
+ - candidate: The candidate sequence as a tuple of strings
929
+ - support_count: The support count of the candidate
930
+ - context: A dictionary with additional information (e.g., 'min_support_count',
931
+ 'total_transactions', 'k_level')
932
+ Should return True to KEEP the candidate, False to filter it out.
933
+ Complements the pruning_strategy but operates at a different level.
934
+ Supports lambda expressions and arbitrary callables.
935
+ Default is None (no additional filtering).
789
936
 
790
937
  Returns:
791
938
  Union[List[Dict[Tuple[str, ...], int]], List[List[Sequence]]]:
@@ -798,6 +945,8 @@ class GSP:
798
945
 
799
946
  Raises:
800
947
  ValueError: If the minimum support threshold is not in the range `(0.0, 1.0]`.
948
+ RuntimeError: If any user-provided hook function raises an exception, it is re-raised
949
+ as a RuntimeError chained to the original error, with context about which hook failed.
801
950
 
802
951
  Logs:
803
952
  - Information about the algorithm's start, intermediate progress (candidates filtered),
@@ -842,6 +991,60 @@ class GSP:
842
991
  print(f"Pattern: {seq.items}, Support: {seq.support}")
843
992
  ```
844
993
 
994
+ Using custom hooks with lambda expressions:
995
+
996
+ ```python
997
+ from gsppy.gsp import GSP
998
+
999
+ transactions = [
1000
+ ["A", "B", "C"],
1001
+ ["A", "C", "D"],
1002
+ ["B", "C", "E"],
1003
+ ]
1004
+
1005
+ # Preprocessing: Convert all items to uppercase in each itemset
1006
+ preprocess = lambda txs: [
1007
+ [tuple(item.upper() if isinstance(item, str) else item for item in itemset) for itemset in tx]
1008
+ for tx in txs
1009
+ ]
1010
+
1011
+ # Candidate filtering: Only keep patterns with length <= 2
1012
+ filter_fn = lambda candidate, support, ctx: len(candidate) <= 2
1013
+
1014
+ # Postprocessing: Keep only patterns with support count > 1 at each level
1015
+ postprocess = lambda patterns: [
1016
+ {k: v for k, v in level.items() if v > 1} for level in patterns
1017
+ ]
1018
+
1019
+ gsp = GSP(transactions)
1020
+ patterns = gsp.search(
1021
+ min_support=0.3,
1022
+ preprocess_fn=preprocess,
1023
+ candidate_filter_fn=filter_fn,
1024
+ postprocess_fn=postprocess
1025
+ )
1026
+ ```
1027
+
1028
+ Advanced candidate filtering with context:
1029
+
1030
+ ```python
1031
+ from gsppy.gsp import GSP
1032
+
1033
+ transactions = [["A", "B", "C"], ["A", "C", "D"], ["B", "C", "E"]]
1034
+
1035
+ # Filter candidates based on both support and pattern characteristics
1036
+ def custom_filter(candidate, support_count, context):
1037
+ # Keep candidates with high support or specific patterns
1038
+ min_support = context.get('min_support_count', 0)
1039
+ if support_count >= min_support * 1.5: # 50% above minimum
1040
+ return True
1041
+ # Or keep patterns starting with 'A'
1042
+ return candidate[0] == 'A' if candidate else False
1043
+
1044
+ gsp = GSP(transactions)
1045
+ patterns = gsp.search(min_support=0.3, candidate_filter_fn=custom_filter)
1046
+ ```
1047
+
845
1048
  Usage with temporal constraints (requires timestamped transactions):
846
1049
 
847
1050
  ```python
@@ -877,59 +1080,75 @@ class GSP:
877
1080
  f"Using temporal constraints: mingap={self.mingap}, maxgap={self.maxgap}, maxspan={self.maxspan}"
878
1081
  )
879
1082
 
880
- # Clear freq_patterns for this search (allow reusing the GSP instance)
881
- self.freq_patterns = []
882
-
883
- # Convert fractional support to absolute count (ceil to preserve threshold semantics)
884
- abs_min_support = int(math.ceil(len(self.transactions) * float(min_support)))
885
-
886
- # the set of frequent 1-sequence: all singleton sequences
887
- # (k-itemsets/k-sequence = 1) - Initially, every item in DB is a
888
- # candidate
889
- candidates = self.unique_candidates
890
-
891
- # scan transactions to collect support count for each candidate
892
- # sequence & filter
893
- freq_1 = self._support(candidates, abs_min_support)
894
- # Apply pruning strategy for additional filtering
895
- freq_1 = self._apply_pruning(freq_1, abs_min_support)
896
- self.freq_patterns.append(freq_1)
1083
+ # Apply preprocessing hook and backup state
1084
+ transactions_to_use, backup_state = self._apply_preprocess_hook(preprocess_fn)
897
1085
 
898
- # (k-itemsets/k-sequence = 1)
899
- k_items = 1
900
-
901
- self._print_status(k_items, candidates)
1086
+ # Ensure state is restored even if errors occur during mining
1087
+ try:
1088
+ # Clear freq_patterns for this search (allow reusing the GSP instance)
1089
+ self.freq_patterns = []
902
1090
 
903
- # repeat until no frequent sequence or no candidate can be found
904
- # If max_k is provided, stop generating candidates beyond that length
905
- while (
906
- self.freq_patterns[k_items - 1] and k_items + 1 <= self.max_size and (max_k is None or k_items + 1 <= max_k)
907
- ):
908
- k_items += 1
1091
+ # Convert fractional support to absolute count (ceil to preserve threshold semantics)
1092
+ abs_min_support = int(math.ceil(len(transactions_to_use) * float(min_support)))
909
1093
 
910
- # Generate candidate sets Ck (set of candidate k-sequences) -
911
- # generate new candidates from the last "best" candidates filtered
912
- # by minimum support
913
- candidates = generate_candidates_from_previous(self.freq_patterns[k_items - 2])
1094
+ # the set of frequent 1-sequence: all singleton sequences
1095
+ # (k-itemsets/k-sequence = 1) - Initially, every item in DB is a
1096
+ # candidate
1097
+ candidates = self.unique_candidates
914
1098
 
915
- # candidate pruning - eliminates candidates who are not potentially
916
- # frequent (using support as threshold)
917
- freq_k = self._support(candidates, abs_min_support)
1099
+ # scan transactions to collect support count for each candidate
1100
+ # sequence & filter
1101
+ freq_1 = self._support(candidates, abs_min_support)
918
1102
  # Apply pruning strategy for additional filtering
919
- freq_k = self._apply_pruning(freq_k, abs_min_support)
920
- self.freq_patterns.append(freq_k)
1103
+ freq_1 = self._apply_pruning(freq_1, abs_min_support)
1104
+ # Apply user-provided candidate filter if specified
1105
+ freq_1 = self._apply_candidate_filter(freq_1, abs_min_support, 1, candidate_filter_fn)
1106
+ self.freq_patterns.append(freq_1)
921
1107
 
922
- self._print_status(k_items, candidates)
923
- logger.info("GSP algorithm completed.")
1108
+ # (k-itemsets/k-sequence = 1)
1109
+ k_items = 1
924
1110
 
925
- # Restore original verbosity if it was overridden
926
- if verbose is not None:
927
- self.verbose = original_verbose
928
- self._configure_logging()
1111
+ self._print_status(k_items, candidates)
929
1112
 
930
- # Return results in the requested format
931
- result = self.freq_patterns[:-1]
932
- if return_sequences:
933
- # Convert Dict[Tuple[str, ...], int] to List[Sequence] for each level
934
- return [dict_to_sequences(level_patterns) for level_patterns in result]
935
- return result
1113
+ # repeat until no frequent sequence or no candidate can be found
1114
+ # If max_k is provided, stop generating candidates beyond that length
1115
+ while (
1116
+ self.freq_patterns[k_items - 1] and k_items + 1 <= self.max_size and (max_k is None or k_items + 1 <= max_k)
1117
+ ):
1118
+ k_items += 1
1119
+
1120
+ # Generate candidate sets Ck (set of candidate k-sequences) -
1121
+ # generate new candidates from the last "best" candidates filtered
1122
+ # by minimum support
1123
+ candidates = generate_candidates_from_previous(self.freq_patterns[k_items - 2])
1124
+
1125
+ # candidate pruning - eliminates candidates who are not potentially
1126
+ # frequent (using support as threshold)
1127
+ freq_k = self._support(candidates, abs_min_support)
1128
+ # Apply pruning strategy for additional filtering
1129
+ freq_k = self._apply_pruning(freq_k, abs_min_support)
1130
+ # Apply user-provided candidate filter if specified
1131
+ freq_k = self._apply_candidate_filter(freq_k, abs_min_support, k_items, candidate_filter_fn)
1132
+ self.freq_patterns.append(freq_k)
1133
+
1134
+ self._print_status(k_items, candidates)
1135
+ logger.info("GSP algorithm completed.")
1136
+
1137
+ # Return results in the requested format
1138
+ result = self.freq_patterns[:-1]
1139
+
1140
+ # Apply postprocessing hook
1141
+ result = self._apply_postprocess_hook(result, postprocess_fn)
1142
+
1143
+ if return_sequences:
1144
+ # Convert Dict[Tuple[str, ...], int] to List[Sequence] for each level
1145
+ return [dict_to_sequences(level_patterns) for level_patterns in result]
1146
+ return result
1147
+ finally:
1148
+ # Always restore original state if preprocessing was applied, even on exceptions
1149
+ self._restore_preprocessing_state(backup_state)
1150
+
1151
+ # Restore original verbosity if it was overridden
1152
+ if verbose is not None:
1153
+ self.verbose = original_verbose
1154
+ self._configure_logging()
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "gsppy"
7
- version = "4.2.0"
7
+ version = "5.0.0"
8
8
  description = "GSP (Generalized Sequence Pattern) algorithm in Python"
9
9
  keywords = ["GSP", "sequential patterns", "data analysis", "sequence mining"]
10
10
  license = { file = "LICENSE" }