gsppy 4.1.0__tar.gz → 5.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. {gsppy-4.1.0 → gsppy-5.0.0}/CHANGELOG.md +31 -0
  2. {gsppy-4.1.0 → gsppy-5.0.0}/PKG-INFO +146 -1
  3. {gsppy-4.1.0 → gsppy-5.0.0}/README.md +145 -0
  4. {gsppy-4.1.0 → gsppy-5.0.0}/gsppy/cli.py +129 -18
  5. {gsppy-4.1.0 → gsppy-5.0.0}/gsppy/gsp.py +485 -127
  6. {gsppy-4.1.0 → gsppy-5.0.0}/gsppy/utils.py +333 -33
  7. {gsppy-4.1.0 → gsppy-5.0.0}/pyproject.toml +1 -1
  8. gsppy-5.0.0/tests/test_cli_hooks.py +197 -0
  9. {gsppy-4.1.0 → gsppy-5.0.0}/tests/test_gsp.py +6 -1
  10. gsppy-5.0.0/tests/test_hooks.py +510 -0
  11. gsppy-5.0.0/tests/test_itemsets.py +300 -0
  12. {gsppy-4.1.0 → gsppy-5.0.0}/tox.ini +1 -0
  13. {gsppy-4.1.0 → gsppy-5.0.0}/.gitignore +0 -0
  14. {gsppy-4.1.0 → gsppy-5.0.0}/CONTRIBUTING.md +0 -0
  15. {gsppy-4.1.0 → gsppy-5.0.0}/LICENSE +0 -0
  16. {gsppy-4.1.0 → gsppy-5.0.0}/SECURITY.md +0 -0
  17. {gsppy-4.1.0 → gsppy-5.0.0}/gsppy/__init__.py +0 -0
  18. {gsppy-4.1.0 → gsppy-5.0.0}/gsppy/accelerate.py +0 -0
  19. {gsppy-4.1.0 → gsppy-5.0.0}/gsppy/dataframe_adapters.py +0 -0
  20. {gsppy-4.1.0 → gsppy-5.0.0}/gsppy/enums.py +0 -0
  21. {gsppy-4.1.0 → gsppy-5.0.0}/gsppy/pruning.py +0 -0
  22. {gsppy-4.1.0 → gsppy-5.0.0}/gsppy/py.typed +0 -0
  23. {gsppy-4.1.0 → gsppy-5.0.0}/gsppy/sequence.py +0 -0
  24. {gsppy-4.1.0 → gsppy-5.0.0}/gsppy/token_mapper.py +0 -0
  25. {gsppy-4.1.0 → gsppy-5.0.0}/rust/Cargo.lock +0 -0
  26. {gsppy-4.1.0 → gsppy-5.0.0}/rust/Cargo.toml +0 -0
  27. {gsppy-4.1.0 → gsppy-5.0.0}/rust/src/lib.rs +0 -0
  28. {gsppy-4.1.0 → gsppy-5.0.0}/tests/__init__.py +0 -0
  29. {gsppy-4.1.0 → gsppy-5.0.0}/tests/test_cli.py +0 -0
  30. {gsppy-4.1.0 → gsppy-5.0.0}/tests/test_dataframe.py +0 -0
  31. {gsppy-4.1.0 → gsppy-5.0.0}/tests/test_gsp_fuzzing.py +0 -0
  32. {gsppy-4.1.0 → gsppy-5.0.0}/tests/test_gsp_sequence_integration.py +0 -0
  33. {gsppy-4.1.0 → gsppy-5.0.0}/tests/test_pruning.py +0 -0
  34. {gsppy-4.1.0 → gsppy-5.0.0}/tests/test_sequence.py +0 -0
  35. {gsppy-4.1.0 → gsppy-5.0.0}/tests/test_spm_format.py +0 -0
  36. {gsppy-4.1.0 → gsppy-5.0.0}/tests/test_temporal_constraints.py +0 -0
  37. {gsppy-4.1.0 → gsppy-5.0.0}/tests/test_utils.py +0 -0
@@ -1,6 +1,37 @@
1
1
  # CHANGELOG
2
2
 
3
3
 
4
+ ## v5.0.0 (2026-02-06)
5
+
6
+ ### Chores
7
+
8
+ - Adds support for optional types in item filters and ignores types in metadata printouts.
9
+ ([`79111b4`](https://github.com/jacksonpradolima/gsp-py/commit/79111b4c781a65b21a17f85b0b507b41ba6e51f9))
10
+
11
+ - Update uv.lock for version 4.2.0
12
+ ([`f8f690f`](https://github.com/jacksonpradolima/gsp-py/commit/f8f690f7f0304dc4331c17c68487fe3411436149))
13
+
14
+ ### Features
15
+
16
+ - Add preprocessing, postprocessing, and candidate filtering hooks to GSP algorithm
17
+ ([`495d290`](https://github.com/jacksonpradolima/gsp-py/commit/495d29009abe862bf992831bd276181efa40c99d))
18
+
19
+ feat!: add preprocessing, postprocessing, and candidate filtering hooks to GSP algorithm
20
+
21
+
22
+ ## v4.2.0 (2026-02-01)
23
+
24
+ ### Chores
25
+
26
+ - Update uv.lock for version 4.1.0
27
+ ([`5ed3d9e`](https://github.com/jacksonpradolima/gsp-py/commit/5ed3d9e46cf158a2261462cb8974b6bbb452f32e))
28
+
29
+ ### Features
30
+
31
+ - Add itemset support for co-occurrence semantics in sequence mining
32
+ ([`90805b1`](https://github.com/jacksonpradolima/gsp-py/commit/90805b190f40ebf34a72da0bbe949cb627140337))
33
+
34
+
4
35
  ## v4.1.0 (2026-02-01)
5
36
 
6
37
  ### Bug Fixes
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gsppy
3
- Version: 4.1.0
3
+ Version: 5.0.0
4
4
  Summary: GSP (Generalized Sequence Pattern) algorithm in Python
5
5
  Project-URL: Homepage, https://github.com/jacksonpradolima/gsp-py
6
6
  Author-email: Jackson Antonio do Prado Lima <jacksonpradolima@gmail.com>
@@ -112,6 +112,7 @@ Sequence Pattern (GSP)** algorithm. Ideal for market basket analysis, temporal m
112
112
  - [✅ Example: Analyzing Sales Data](#example-analyzing-sales-data)
113
113
  - [📊 Explanation: Support and Results](#explanation-support-and-results)
114
114
  - [📊 DataFrame Input Support](#dataframe-input-support)
115
+ - [🔗 Itemset Support](#itemset-support)
115
116
  - [⏱️ Temporal Constraints](#temporal-constraints)
116
117
  7. [⌨️ Typing](#typing)
117
118
  8. [🌟 Planned Features](#planned-features)
@@ -980,6 +981,150 @@ For complete examples and edge cases, see:
980
981
 
981
982
  ---
982
983
 
984
+ ## 🔗 Itemset Support
985
+
986
+ GSP-Py supports **itemsets** within sequence elements, enabling you to capture **co-occurrence** of multiple items at the same time step. This is crucial for applications where items occur together rather than in strict sequential order.
987
+
988
+ ### What are Itemsets?
989
+
990
+ - **Flat sequences**: `['A', 'B', 'C']` - each item occurs at a separate time step
991
+ - **Itemset sequences**: `[['A', 'B'], ['C']]` - items A and B occur together at the first time step, then C occurs later
992
+
993
+ ### Why Use Itemsets?
994
+
995
+ Itemsets are essential when temporal co-occurrence matters in your domain:
996
+
997
+ - **Market basket analysis**: Customers buy multiple items in a single shopping trip, then return for more items later
998
+ - **Web analytics**: Users open multiple pages in parallel tabs before moving to the next set of pages
999
+ - **Event logs**: Multiple events can occur simultaneously in complex systems
1000
+ - **Purchase patterns**: Items bought together vs. items bought in sequence
1001
+
1002
+ ### Using Itemsets
1003
+
1004
+ #### Basic Example
1005
+
1006
+ ```python
1007
+ from gsppy import GSP
1008
+
1009
+ # Itemset format: nested lists where inner lists are items that occur together
1010
+ transactions = [
1011
+ [['Bread', 'Milk'], ['Eggs']], # Bought Bread & Milk together, then Eggs later
1012
+ [['Bread', 'Milk', 'Butter']], # Bought all three items together
1013
+ [['Bread', 'Milk'], ['Eggs']], # Same pattern as customer 1
1014
+ ]
1015
+
1016
+ gsp = GSP(transactions)
1017
+ patterns = gsp.search(min_support=0.5)
1018
+
1019
+ # Pattern ('Bread',) will match any itemset containing Bread
1020
+ # Pattern ('Bread', 'Eggs') will match sequences where Bread appears before Eggs
1021
+ # (even if they're in different itemsets)
1022
+ ```
1023
+
1024
+ #### Backward Compatibility with Flat Sequences
1025
+
1026
+ GSP-Py automatically normalizes flat sequences to itemsets internally, ensuring full backward compatibility:
1027
+
1028
+ ```python
1029
+ from gsppy import GSP
1030
+
1031
+ # These are equivalent after normalization:
1032
+ flat_transactions = [['A', 'B', 'C']] # Flat format
1033
+ itemset_transactions = [[['A'], ['B'], ['C']]] # Equivalent itemset format
1034
+
1035
+ # Both produce the same results
1036
+ gsp1 = GSP(flat_transactions)
1037
+ gsp2 = GSP(itemset_transactions)
1038
+
1039
+ # Patterns are identical
1040
+ patterns1 = gsp1.search(min_support=0.5)
1041
+ patterns2 = gsp2.search(min_support=0.5)
1042
+ ```
1043
+
1044
+ ### Itemset Matching Semantics
1045
+
1046
+ Pattern matching with itemsets uses **subset semantics**:
1047
+
1048
+ - A pattern element matches a sequence element if all items in the pattern element are present in the sequence element
1049
+ - Example: Pattern element `['A', 'B']` matches sequence element `['A', 'B', 'C']` because {A, B} ⊆ {A, B, C}
1050
+ - Pattern elements must appear in order across the sequence
1051
+
1052
+ ```python
1053
+ from gsppy import GSP
1054
+
1055
+ transactions = [
1056
+ [['A', 'B', 'D'], ['E'], ['C', 'F']], # A,B,D together, then E, then C,F together
1057
+ ]
1058
+
1059
+ gsp = GSP(transactions)
1060
+
1061
+ # Pattern ('A', 'C') will match because:
1062
+ # - 'A' is in first itemset ['A', 'B', 'D'] ✓
1063
+ # - 'C' appears later in third itemset ['C', 'F'] ✓
1064
+ # - Order is preserved ✓
1065
+ ```
1066
+
1067
+ ### Reading Itemsets from SPM Format
1068
+
1069
+ The SPM/GSP format supports itemsets using delimiters:
1070
+
1071
+ - `-1`: End of itemset
1072
+ - `-2`: End of sequence
1073
+
1074
+ ```python
1075
+ from gsppy.utils import read_transactions_from_spm
1076
+
1077
+ # SPM file content:
1078
+ # 1 2 -1 3 -1 -2
1079
+ # 1 -1 3 4 -1 -2
1080
+
1081
+ # Read with itemsets preserved
1082
+ transactions = read_transactions_from_spm("data.txt", preserve_itemsets=True)
1083
+ # Result: [[['1', '2'], ['3']], [['1'], ['3', '4']]]
1084
+
1085
+ # Read with itemsets flattened (backward compatible)
1086
+ transactions = read_transactions_from_spm("data.txt", preserve_itemsets=False)
1087
+ # Result: [['1', '2', '3'], ['1', '3', '4']]
1088
+ ```
1089
+
1090
+ ### Itemsets with Timestamps
1091
+
1092
+ Itemsets work seamlessly with temporal constraints:
1093
+
1094
+ ```python
1095
+ from gsppy import GSP
1096
+
1097
+ # Itemsets with timestamps: [(item, timestamp), ...]
1098
+ transactions = [
1099
+ [[('Login', 0), ('Home', 0)], [('Product', 5)], [('Checkout', 10)]],
1100
+ [[('Login', 0)], [('Home', 2), ('Product', 2)], [('Checkout', 15)]],
1101
+ ]
1102
+
1103
+ # Find patterns where events in the same itemset occur together
1104
+ # and subsequent itemsets occur within maxgap time units
1105
+ gsp = GSP(transactions, maxgap=10)
1106
+ patterns = gsp.search(min_support=0.5)
1107
+ ```
1108
+
1109
+ ### Complete Example
1110
+
1111
+ See [examples/itemset_example.py](examples/itemset_example.py) for comprehensive examples including:
1112
+
1113
+ - Market basket analysis with itemsets
1114
+ - Web clickstream with parallel page views
1115
+ - Comparison of flat vs. itemset semantics
1116
+ - Reading and processing SPM format files
1117
+
1118
+ ### Key Takeaways
1119
+
1120
+ - ✓ **Itemsets capture co-occurrence** of items at the same time step
1121
+ - ✓ **Flat sequences are automatically normalized** to itemsets internally
1122
+ - ✓ **Both formats work seamlessly** with GSP-Py
1123
+ - ✓ **Use itemsets when temporal co-occurrence matters** in your domain
1124
+ - ✓ **SPM format supports** both flat and itemset representations
1125
+
1126
+ ---
1127
+
983
1128
  ## ⏱️ Temporal Constraints
984
1129
 
985
1130
  GSP-Py supports **time-constrained sequential pattern mining** with three powerful temporal constraints: `mingap`, `maxgap`, and `maxspan`. These constraints enable domain-specific applications such as medical event mining, retail analytics, and temporal user journey discovery.
@@ -39,6 +39,7 @@ Sequence Pattern (GSP)** algorithm. Ideal for market basket analysis, temporal m
39
39
  - [✅ Example: Analyzing Sales Data](#example-analyzing-sales-data)
40
40
  - [📊 Explanation: Support and Results](#explanation-support-and-results)
41
41
  - [📊 DataFrame Input Support](#dataframe-input-support)
42
+ - [🔗 Itemset Support](#itemset-support)
42
43
  - [⏱️ Temporal Constraints](#temporal-constraints)
43
44
  7. [⌨️ Typing](#typing)
44
45
  8. [🌟 Planned Features](#planned-features)
@@ -907,6 +908,150 @@ For complete examples and edge cases, see:
907
908
 
908
909
  ---
909
910
 
911
+ ## 🔗 Itemset Support
912
+
913
+ GSP-Py supports **itemsets** within sequence elements, enabling you to capture **co-occurrence** of multiple items at the same time step. This is crucial for applications where items occur together rather than in strict sequential order.
914
+
915
+ ### What are Itemsets?
916
+
917
+ - **Flat sequences**: `['A', 'B', 'C']` - each item occurs at a separate time step
918
+ - **Itemset sequences**: `[['A', 'B'], ['C']]` - items A and B occur together at the first time step, then C occurs later
919
+
920
+ ### Why Use Itemsets?
921
+
922
+ Itemsets are essential when temporal co-occurrence matters in your domain:
923
+
924
+ - **Market basket analysis**: Customers buy multiple items in a single shopping trip, then return for more items later
925
+ - **Web analytics**: Users open multiple pages in parallel tabs before moving to the next set of pages
926
+ - **Event logs**: Multiple events can occur simultaneously in complex systems
927
+ - **Purchase patterns**: Items bought together vs. items bought in sequence
928
+
929
+ ### Using Itemsets
930
+
931
+ #### Basic Example
932
+
933
+ ```python
934
+ from gsppy import GSP
935
+
936
+ # Itemset format: nested lists where inner lists are items that occur together
937
+ transactions = [
938
+ [['Bread', 'Milk'], ['Eggs']], # Bought Bread & Milk together, then Eggs later
939
+ [['Bread', 'Milk', 'Butter']], # Bought all three items together
940
+ [['Bread', 'Milk'], ['Eggs']], # Same pattern as customer 1
941
+ ]
942
+
943
+ gsp = GSP(transactions)
944
+ patterns = gsp.search(min_support=0.5)
945
+
946
+ # Pattern ('Bread',) will match any itemset containing Bread
947
+ # Pattern ('Bread', 'Eggs') will match sequences where Bread appears before Eggs
948
+ # (even if they're in different itemsets)
949
+ ```
950
+
951
+ #### Backward Compatibility with Flat Sequences
952
+
953
+ GSP-Py automatically normalizes flat sequences to itemsets internally, ensuring full backward compatibility:
954
+
955
+ ```python
956
+ from gsppy import GSP
957
+
958
+ # These are equivalent after normalization:
959
+ flat_transactions = [['A', 'B', 'C']] # Flat format
960
+ itemset_transactions = [[['A'], ['B'], ['C']]] # Equivalent itemset format
961
+
962
+ # Both produce the same results
963
+ gsp1 = GSP(flat_transactions)
964
+ gsp2 = GSP(itemset_transactions)
965
+
966
+ # Patterns are identical
967
+ patterns1 = gsp1.search(min_support=0.5)
968
+ patterns2 = gsp2.search(min_support=0.5)
969
+ ```
970
+
971
+ ### Itemset Matching Semantics
972
+
973
+ Pattern matching with itemsets uses **subset semantics**:
974
+
975
+ - A pattern element matches a sequence element if all items in the pattern element are present in the sequence element
976
+ - Example: Pattern element `['A', 'B']` matches sequence element `['A', 'B', 'C']` because {A, B} ⊆ {A, B, C}
977
+ - Pattern elements must appear in order across the sequence
978
+
979
+ ```python
980
+ from gsppy import GSP
981
+
982
+ transactions = [
983
+ [['A', 'B', 'D'], ['E'], ['C', 'F']], # A,B,D together, then E, then C,F together
984
+ ]
985
+
986
+ gsp = GSP(transactions)
987
+
988
+ # Pattern ('A', 'C') will match because:
989
+ # - 'A' is in first itemset ['A', 'B', 'D'] ✓
990
+ # - 'C' appears later in third itemset ['C', 'F'] ✓
991
+ # - Order is preserved ✓
992
+ ```
993
+
994
+ ### Reading Itemsets from SPM Format
995
+
996
+ The SPM/GSP format supports itemsets using delimiters:
997
+
998
+ - `-1`: End of itemset
999
+ - `-2`: End of sequence
1000
+
1001
+ ```python
1002
+ from gsppy.utils import read_transactions_from_spm
1003
+
1004
+ # SPM file content:
1005
+ # 1 2 -1 3 -1 -2
1006
+ # 1 -1 3 4 -1 -2
1007
+
1008
+ # Read with itemsets preserved
1009
+ transactions = read_transactions_from_spm("data.txt", preserve_itemsets=True)
1010
+ # Result: [[['1', '2'], ['3']], [['1'], ['3', '4']]]
1011
+
1012
+ # Read with itemsets flattened (backward compatible)
1013
+ transactions = read_transactions_from_spm("data.txt", preserve_itemsets=False)
1014
+ # Result: [['1', '2', '3'], ['1', '3', '4']]
1015
+ ```
1016
+
1017
+ ### Itemsets with Timestamps
1018
+
1019
+ Itemsets work seamlessly with temporal constraints:
1020
+
1021
+ ```python
1022
+ from gsppy import GSP
1023
+
1024
+ # Itemsets with timestamps: [(item, timestamp), ...]
1025
+ transactions = [
1026
+ [[('Login', 0), ('Home', 0)], [('Product', 5)], [('Checkout', 10)]],
1027
+ [[('Login', 0)], [('Home', 2), ('Product', 2)], [('Checkout', 15)]],
1028
+ ]
1029
+
1030
+ # Find patterns where events in the same itemset occur together
1031
+ # and subsequent itemsets occur within maxgap time units
1032
+ gsp = GSP(transactions, maxgap=10)
1033
+ patterns = gsp.search(min_support=0.5)
1034
+ ```
1035
+
1036
+ ### Complete Example
1037
+
1038
+ See [examples/itemset_example.py](examples/itemset_example.py) for comprehensive examples including:
1039
+
1040
+ - Market basket analysis with itemsets
1041
+ - Web clickstream with parallel page views
1042
+ - Comparison of flat vs. itemset semantics
1043
+ - Reading and processing SPM format files
1044
+
1045
+ ### Key Takeaways
1046
+
1047
+ - ✓ **Itemsets capture co-occurrence** of items at the same time step
1048
+ - ✓ **Flat sequences are automatically normalized** to itemsets internally
1049
+ - ✓ **Both formats work seamlessly** with GSP-Py
1050
+ - ✓ **Use itemsets when temporal co-occurrence matters** in your domain
1051
+ - ✓ **SPM format supports** both flat and itemset representations
1052
+
1053
+ ---
1054
+
910
1055
  ## ⏱️ Temporal Constraints
911
1056
 
912
1057
  GSP-Py supports **time-constrained sequential pattern mining** with three powerful temporal constraints: `mingap`, `maxgap`, and `maxspan`. These constraints enable domain-specific applications such as medical event mining, retail analytics, and temporal user journey discovery.
@@ -35,7 +35,8 @@ import csv
35
35
  import sys
36
36
  import json
37
37
  import logging
38
- from typing import Any, List, Tuple, Union, Optional, cast
38
+ import importlib
39
+ from typing import Any, List, Tuple, Union, Callable, Optional, cast
39
40
 
40
41
  import click
41
42
 
@@ -51,6 +52,54 @@ from gsppy.enums import (
51
52
  from gsppy.utils import has_timestamps
52
53
 
53
54
 
55
+ def _load_hook_function(import_path: str, hook_type: str) -> Callable[..., Any]:
56
+ """
57
+ Load a hook function from a Python module import path.
58
+
59
+ Parameters:
60
+ import_path (str): Import path in format 'module.submodule.function_name'
61
+ hook_type (str): Type of hook for error messages ('preprocess', 'postprocess', 'candidate_filter')
62
+
63
+ Returns:
64
+ Callable: The loaded hook function
65
+
66
+ Raises:
67
+ ValueError: If the import path is invalid or function cannot be loaded
68
+ """
69
+ try:
70
+ # Split into module path and function name
71
+ parts = import_path.rsplit(".", 1)
72
+ if len(parts) != 2:
73
+ raise ValueError(f"Invalid import path format. Expected 'module.function', got '{import_path}'")
74
+
75
+ module_name, function_name = parts
76
+
77
+ # Import the module
78
+ module = importlib.import_module(module_name)
79
+
80
+ # Get the function from the module
81
+ if not hasattr(module, function_name):
82
+ raise ValueError(f"Function '{function_name}' not found in module '{module_name}'")
83
+
84
+ hook_fn = getattr(module, function_name)
85
+
86
+ # Verify it's callable
87
+ if not callable(hook_fn):
88
+ raise ValueError(f"'{import_path}' is not a callable function")
89
+
90
+ return hook_fn
91
+
92
+ except ImportError as e:
93
+ # Extract module name from import path for error message
94
+ module_part = import_path.rsplit(".", 1)[0] if "." in import_path else import_path
95
+ raise ValueError(f"Failed to import {hook_type} hook module '{module_part}': {e}") from e
96
+ except ValueError:
97
+ # Re-raise ValueError as-is
98
+ raise
99
+ except Exception as e:
100
+ raise ValueError(f"Failed to load {hook_type} hook function '{import_path}': {e}") from e
101
+
102
+
54
103
  def setup_logging(verbose: bool) -> None:
55
104
  """
56
105
  Configure logging with standardized format based on verbosity level.
@@ -515,20 +564,26 @@ def _load_transactions_by_format(
515
564
  help="File format to use. 'auto' detects format from file extension.",
516
565
  )
517
566
  @click.option("--verbose", is_flag=True, help="Enable verbose output for debugging purposes.")
518
- def main(
519
- file_path: str,
520
- min_support: float,
521
- backend: str,
522
- mingap: Optional[float],
523
- maxgap: Optional[float],
524
- maxspan: Optional[float],
525
- transaction_col: Optional[str],
526
- item_col: Optional[str],
527
- timestamp_col: Optional[str],
528
- sequence_col: Optional[str],
529
- format: str, # noqa: A002
530
- verbose: bool,
531
- ) -> None:
567
+ @click.option(
568
+ "--preprocess-hook",
569
+ type=str,
570
+ default=None,
571
+ help="Python import path to preprocessing hook function (e.g., 'mymodule.preprocess_fn').",
572
+ )
573
+ @click.option(
574
+ "--postprocess-hook",
575
+ type=str,
576
+ default=None,
577
+ help="Python import path to postprocessing hook function (e.g., 'mymodule.postprocess_fn').",
578
+ )
579
+ @click.option(
580
+ "--candidate-filter-hook",
581
+ type=str,
582
+ default=None,
583
+ help="Python import path to candidate filter hook function (e.g., 'mymodule.filter_fn').",
584
+ )
585
+ @click.pass_context
586
+ def main(ctx: click.Context, **kwargs: Any) -> None:
532
587
  """
533
588
  Run the GSP algorithm on transactional data from a file.
534
589
 
@@ -573,9 +628,59 @@ def main(
573
628
  ```bash
574
629
  gsppy --file data.txt --format spm --min_support 0.3
575
630
  ```
631
+
632
+ With custom hooks (requires Python module with hook functions):
633
+
634
+ ```bash
635
+ # Create a hooks module first (hooks.py):
636
+ # def my_filter(candidate, support, context):
637
+ # return len(candidate) <= 2 # Keep only short patterns
638
+ #
639
+ # def my_postprocess(patterns):
640
+ # return patterns[:2] # Keep only first 2 levels
641
+
642
+ gsppy --file data.json --min_support 0.3 \
643
+ --candidate-filter-hook hooks.my_filter \
644
+ --postprocess-hook hooks.my_postprocess
645
+ ```
576
646
  """
647
+ # Extract parameters from kwargs
648
+ file_path = kwargs['file_path']
649
+ min_support = kwargs['min_support']
650
+ backend = kwargs['backend']
651
+ mingap = kwargs.get('mingap')
652
+ maxgap = kwargs.get('maxgap')
653
+ maxspan = kwargs.get('maxspan')
654
+ transaction_col = kwargs.get('transaction_col')
655
+ item_col = kwargs.get('item_col')
656
+ timestamp_col = kwargs.get('timestamp_col')
657
+ sequence_col = kwargs.get('sequence_col')
658
+ file_format = kwargs['format']
659
+ verbose = kwargs['verbose']
660
+ preprocess_hook = kwargs.get('preprocess_hook')
661
+ postprocess_hook = kwargs.get('postprocess_hook')
662
+ candidate_filter_hook = kwargs.get('candidate_filter_hook')
663
+
577
664
  setup_logging(verbose)
578
665
 
666
+ # Load hook functions if specified
667
+ try:
668
+ preprocess_fn = _load_hook_function(preprocess_hook, "preprocess") if preprocess_hook else None
669
+ postprocess_fn = _load_hook_function(postprocess_hook, "postprocess") if postprocess_hook else None
670
+ candidate_filter_fn = (
671
+ _load_hook_function(candidate_filter_hook, "candidate_filter") if candidate_filter_hook else None
672
+ )
673
+
674
+ if preprocess_fn:
675
+ logger.info(f"Loaded preprocessing hook: {preprocess_hook}")
676
+ if postprocess_fn:
677
+ logger.info(f"Loaded postprocessing hook: {postprocess_hook}")
678
+ if candidate_filter_fn:
679
+ logger.info(f"Loaded candidate filter hook: {candidate_filter_hook}")
680
+ except ValueError as e:
681
+ logger.error(f"Error loading hook function: {e}")
682
+ sys.exit(1)
683
+
579
684
  # Detect file extension to determine if DataFrame column params are needed
580
685
  _, file_extension = os.path.splitext(file_path)
581
686
  file_extension = file_extension.lower()
@@ -583,10 +688,10 @@ def main(
583
688
 
584
689
  # Automatically detect and load transactions
585
690
  try:
586
- file_format = format.lower()
691
+ file_format_lower = file_format.lower()
587
692
  transactions = _load_transactions_by_format(
588
693
  file_path,
589
- file_format,
694
+ file_format_lower,
590
695
  file_extension,
591
696
  is_dataframe_format,
592
697
  transaction_col,
@@ -608,7 +713,13 @@ def main(
608
713
  # Initialize and run GSP algorithm
609
714
  try:
610
715
  gsp = GSP(transactions, mingap=mingap, maxgap=maxgap, maxspan=maxspan, verbose=verbose)
611
- patterns = gsp.search(min_support=min_support, return_sequences=False)
716
+ patterns = gsp.search(
717
+ min_support=min_support,
718
+ return_sequences=False,
719
+ preprocess_fn=preprocess_fn,
720
+ postprocess_fn=postprocess_fn,
721
+ candidate_filter_fn=candidate_filter_fn,
722
+ )
612
723
  logger.info("Frequent Patterns Found:")
613
724
  for i, level in enumerate(patterns, start=1):
614
725
  logger.info(f"\n{i}-Sequence Patterns:")