gsppy-2.0.1-py3-none-any.whl → gsppy-2.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gsppy/cli.py +46 -18
- gsppy/gsp.py +23 -20
- gsppy/utils.py +12 -9
- {gsppy-2.0.1.dist-info → gsppy-2.2.0.dist-info}/METADATA +91 -33
- gsppy-2.2.0.dist-info/RECORD +9 -0
- {gsppy-2.0.1.dist-info → gsppy-2.2.0.dist-info}/WHEEL +1 -2
- gsppy/tests/__init__.py +0 -0
- gsppy/tests/test_cli.py +0 -326
- gsppy/tests/test_gsp.py +0 -250
- gsppy/tests/test_utils.py +0 -91
- gsppy-2.0.1.dist-info/RECORD +0 -14
- gsppy-2.0.1.dist-info/top_level.txt +0 -1
- {gsppy-2.0.1.dist-info → gsppy-2.2.0.dist-info}/entry_points.txt +0 -0
- {gsppy-2.0.1.dist-info → gsppy-2.2.0.dist-info/licenses}/LICENSE +0 -0
gsppy/cli.py
CHANGED
@@ -27,17 +27,37 @@ Key Features:
 This CLI empowers users to perform sequential pattern mining on transactional data efficiently through
 a simple command-line interface.
 """
-import
+import os
 import csv
+import sys
 import json
 import logging
-import
-from typing import List
+import argparse
+from typing import Dict, List, Tuple

 from gsppy.gsp import GSP

+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(message)s",  # Simplified to keep CLI output clean
+    handlers=[logging.StreamHandler(sys.stdout)],
+)
+logger = logging.getLogger(__name__)

-def read_transactions_from_json(file_path: str) -> List[List]:
+
+def setup_logging(verbose: bool) -> None:
+    """
+    Set the logging level based on the verbosity of the CLI output.
+    :param verbose: Whether to enable verbose logging.
+    """
+    if verbose:
+        logger.setLevel(logging.DEBUG)
+    else:
+        logger.setLevel(logging.INFO)
+
+
+def read_transactions_from_json(file_path: str) -> List[List[str]]:
     """
     Read transactions from a JSON file.

@@ -52,9 +72,7 @@ def read_transactions_from_json(file_path: str) -> List[List]:
     """
     try:
         with open(file_path, 'r', encoding='utf-8') as f:
-            transactions = json.load(f)
-            if not isinstance(transactions, list) or not all(isinstance(t, list) for t in transactions):
-                raise ValueError("File should contain a JSON array of transaction lists.")
+            transactions: List[List[str]] = json.load(f)
         return transactions
     except Exception as e:
         msg = f"Error reading transaction data from JSON file '{file_path}': {e}"
@@ -62,7 +80,7 @@ def read_transactions_from_json(file_path: str) -> List[List]:
         raise ValueError(msg) from e


-def read_transactions_from_csv(file_path: str) -> List[List]:
+def read_transactions_from_csv(file_path: str) -> List[List[str]]:
     """
     Read transactions from a CSV file.

@@ -76,7 +94,7 @@ def read_transactions_from_csv(file_path: str) -> List[List]:
         ValueError: If the file cannot be read or contains invalid data.
     """
     try:
-        transactions = []
+        transactions: List[List[str]] = []
         with open(file_path, newline='', encoding='utf-8') as csvfile:
             reader = csv.reader(csvfile)
             for row in reader:
@@ -92,7 +110,7 @@ def read_transactions_from_csv(file_path: str) -> List[List]:
         raise ValueError(msg) from e


-def detect_and_read_file(file_path: str) -> List[List]:
+def detect_and_read_file(file_path: str) -> List[List[str]]:
     """
     Detect file format (CSV or JSON) and read transactions.

@@ -120,7 +138,7 @@ def detect_and_read_file(file_path: str) -> List[List]:
     raise ValueError("Unsupported file format. Please provide a JSON or CSV file.")


-def main():
+def main() -> None:
     """
     Main function to handle CLI input and run the GSP algorithm.

@@ -150,32 +168,42 @@ def main():
         help="Minimum support threshold as a fraction of total transactions (default: 0.2)"
     )

+    # Verbose output argument
+    parser.add_argument(
+        '--verbose',
+        action='store_true',
+        help='Enable verbose output for debugging purposes.'
+    )
+
     # Parse arguments
     args = parser.parse_args()

+    # Setup logging verbosity
+    setup_logging(args.verbose)
+
     # Automatically detect and load transactions
     try:
         transactions = detect_and_read_file(args.file)
     except ValueError as e:
-
+        logger.error(f"Error: {e}")
         return

     # Check min_support
     if args.min_support <= 0.0 or args.min_support > 1.0:
-
+        logger.error("Error: min_support must be in the range (0.0, 1.0].")
         return

     # Initialize and run GSP algorithm
     try:
         gsp = GSP(transactions)
-        patterns = gsp.search(min_support=args.min_support)
-
+        patterns: List[Dict[Tuple[str, ...], int]] = gsp.search(min_support=args.min_support)
+        logger.info("Frequent Patterns Found:")
         for i, level in enumerate(patterns, start=1):
-
+            logger.info(f"\n{i}-Sequence Patterns:")
             for pattern, support in level.items():
-
+                logger.info(f"Pattern: {pattern}, Support: {support}")
     except Exception as e:
-
+        logger.error(f"Error executing GSP algorithm: {e}")


 if __name__ == '__main__':
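Taken together, these hunks replace the old print-based output with a module-level logger and add a `--verbose` switch. The sketch below is a minimal end-to-end check of the updated CLI, assuming gsppy 2.2.0 is installed; the sample transactions and flag names are the ones exercised by the package's own (now removed) tests, and the temporary-file path is purely illustrative.

```python
# Hypothetical smoke test of gsppy.cli.main() after this change.
# Writes a small JSON transaction file, then drives main() exactly as
# `python -m gsppy.cli --file <path> --min_support 0.5 --verbose` would.
import json
import os
import sys
import tempfile

from gsppy.cli import main

with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as fh:
    json.dump([["Bread", "Milk"], ["Milk", "Diaper"], ["Bread", "Diaper", "Beer"]], fh)
    path = fh.name

# argparse reads sys.argv[1:], so setting it directly stands in for a shell call
sys.argv = ["gsppy.cli", "--file", path, "--min_support", "0.5", "--verbose"]
main()  # frequent patterns are emitted through the new stdout logger

os.unlink(path)  # clean up the temporary file
```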
gsppy/gsp.py
CHANGED
@@ -86,9 +86,9 @@ Version:
 """
 import logging
 import multiprocessing as mp
-from
+from typing import Any, Dict, List, Tuple
 from itertools import chain
-from
+from collections import Counter

 from gsppy.utils import split_into_batches, is_subsequence_in_list, generate_candidates_from_previous

@@ -114,7 +114,7 @@ class GSP:
     k-sequence for pattern generation.
     """

-    def __init__(self, raw_transactions: List[List]):
+    def __init__(self, raw_transactions: List[List[str]]):
         """
         Initialize the GSP algorithm with raw transactional data.

@@ -132,10 +132,10 @@ class GSP:
             ValueError: If the input transaction dataset is empty, contains
                 fewer than two transactions, or is not properly formatted.
         """
-        self.freq_patterns = []
+        self.freq_patterns: List[Dict[Tuple[str, ...], int]] = []
         self._pre_processing(raw_transactions)

-    def _pre_processing(self, raw_transactions: List[List]):
+    def _pre_processing(self, raw_transactions: List[List[str]]) -> None:
         """
         Validate and preprocess the input transactional dataset.

@@ -167,20 +167,19 @@ class GSP:
             logger.error(msg)
             raise ValueError(msg)

-        if not all(isinstance(item, list) for item in raw_transactions):
-            msg = "The dataset must be a list of transactions."
-            logger.error(msg)
-            raise ValueError(msg)
-
         logger.info("Pre-processing transactions...")
         self.max_size = max(len(item) for item in raw_transactions)
-        self.transactions = [tuple(transaction) for transaction in raw_transactions]
-        counts = Counter(chain.from_iterable(raw_transactions))
-        self.unique_candidates = [(item,) for item in counts.keys()]
+        self.transactions: List[Tuple[str, ...]] = [tuple(transaction) for transaction in raw_transactions]
+        counts: Counter[str] = Counter(chain.from_iterable(raw_transactions))
+        self.unique_candidates: list[tuple[str, Any]] = [(item,) for item in counts.keys()]
         logger.debug("Unique candidates: %s", self.unique_candidates)

     @staticmethod
-    def _worker_batch(
+    def _worker_batch(
+            batch: List[Tuple[str, ...]],
+            transactions: List[Tuple[str, ...]],
+            min_support: int
+    ) -> List[Tuple[Tuple[str, ...], int]]:
         """
         Evaluate a batch of candidate sequences to compute their support.

@@ -198,14 +197,17 @@ class GSP:
             - A candidate sequence.
             - The candidate's support count.
         """
-        results = []
+        results: List[Tuple[Tuple[str, ...], int]] = []
         for item in batch:
             frequency = sum(1 for t in transactions if is_subsequence_in_list(item, t))
             if frequency >= min_support:
                 results.append((item, frequency))
         return results

-    def _support(
+    def _support(
+            self,
+            items: List[Tuple[str, ...]], min_support: float = 0, batch_size: int = 100
+    ) -> Dict[Tuple[str, ...], int]:
         """
         Calculate support counts for candidate sequences, using parallel processing.

@@ -235,7 +237,7 @@ class GSP:
         # Flatten the list of results and convert to a dictionary
         return {item: freq for batch in batch_results for item, freq in batch}

-    def _print_status(self, run: int, candidates: List[Tuple]):
+    def _print_status(self, run: int, candidates: List[Tuple[str, ...]]) -> None:
         """
         Log progress information for the current GSP iteration.

@@ -249,7 +251,7 @@ class GSP:
         logger.info("Run %d: %d candidates filtered to %d.",
                     run, len(candidates), len(self.freq_patterns[run - 1]))

-    def search(self, min_support: float = 0.2) -> List[Dict[Tuple, int]]:
+    def search(self, min_support: float = 0.2) -> List[Dict[Tuple[str, ...], int]]:
         """
         Execute the Generalized Sequential Pattern (GSP) mining algorithm.

@@ -263,8 +265,9 @@
             appears in at least 30% of all transactions.

         Returns:
-            List[Dict[Tuple, int]]: A list
-
+            List[Dict[Tuple[str, ...], int]]: A list of dictionaries containing frequent patterns
+                at each k-sequence level, with patterns as keys
+                and their support counts as values.

         Raises:
             ValueError: If the minimum support threshold is not in the range `(0.0, 1.0]`.
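The gsp.py changes are almost entirely type-annotation tightening (plus dropping the list-of-lists format check), so the public API is unchanged. The sketch below shows the API these hunks touch, assuming gsppy 2.2.0 is installed; the dataset and the expected pattern levels are taken from the deleted test_gsp.py.

```python
# Minimal GSP usage sketch: search() returns one dict per k-sequence level,
# mapping each frequent pattern (a tuple of items) to its absolute support count.
from gsppy.gsp import GSP

transactions = [
    ["Bread", "Milk"],
    ["Bread", "Diaper", "Beer", "Eggs"],
    ["Milk", "Diaper", "Beer", "Coke"],
    ["Bread", "Milk", "Diaper", "Beer"],
    ["Bread", "Milk", "Diaper", "Coke"],
]

gsp = GSP(transactions)
patterns = gsp.search(min_support=0.3)  # keep patterns seen in >= 30% of transactions
for level, freq in enumerate(patterns, start=1):
    print(f"{level}-sequence patterns: {freq}")

# For this dataset the deleted test_gsp.py expects, e.g.,
# level 1: {('Bread',): 4, ('Milk',): 4, ('Diaper',): 4, ('Beer',): 3, ('Coke',): 2}
# level 2: {('Bread', 'Milk'): 3, ('Milk', 'Diaper'): 3, ('Diaper', 'Beer'): 3}
```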
gsppy/utils.py
CHANGED
@@ -20,34 +20,35 @@ Main functionalities:
 These utilities are designed to support sequence processing tasks and can be
 adapted to various domains, such as data mining, recommendation systems, and sequence analysis.
 """
+from typing import Dict, List, Tuple, Sequence, Generator
 from functools import lru_cache
 from itertools import product
-from typing import List, Tuple, Generator, Dict


-def split_into_batches(
+def split_into_batches(
+        items: Sequence[Tuple[str, ...]], batch_size: int
+) -> Generator[Sequence[Tuple[str, ...]], None, None]:
     """
     Split the list of items into smaller batches.

     Parameters:
-        items (
+        items (Sequence[Tuple]): A sequence of items to be batched.
         batch_size (int): The maximum size of each batch.

     Returns:
-
+        Generator[Sequence[Tuple], None, None]: A generator yielding batches of items.
     """
     for i in range(0, len(items), batch_size):
         yield items[i:i + batch_size]


-# Cache the results of the slice comparison function to avoid redundant calculations
 @lru_cache(maxsize=None)
-def is_subsequence_in_list(subsequence: Tuple, sequence: Tuple) -> bool:
+def is_subsequence_in_list(subsequence: Tuple[str, ...], sequence: Tuple[str, ...]) -> bool:
     """
     Check if a subsequence exists within a sequence as a contiguous subsequence.

     Parameters:
-        subsequence:
+        subsequence: (tuple): The sequence to search for.
         sequence (tuple): The sequence to search within.

     Returns:
@@ -67,12 +68,14 @@ def is_subsequence_in_list(subsequence: Tuple, sequence: Tuple) -> bool:
     return any(sequence[i:i + len_sub] == subsequence for i in range(len_seq - len_sub + 1))


-def generate_candidates_from_previous(
+def generate_candidates_from_previous(
+        prev_patterns: Dict[Tuple[str, ...], int]
+) -> List[Tuple[str, ...]]:
     """
     Generate joined candidates from the previous level's frequent patterns.

     Parameters:
-        prev_patterns (Dict[Tuple, int]):
+        prev_patterns (Dict[Tuple, int]): A dictionary of frequent patterns from the previous level.

     Returns:
         List[Tuple]: Candidate patterns for the next level.
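The utils.py hunks likewise only reshape signatures, annotations, and docstrings. A small sketch of the three helpers, assuming gsppy 2.2.0 is installed and relying on the behaviour pinned down by the deleted test_utils.py (list-slice batches, contiguous subsequence matching, and candidate joining):

```python
# Utility helpers touched by this diff, with the expected results noted inline.
from gsppy.utils import (
    split_into_batches,
    is_subsequence_in_list,
    generate_candidates_from_previous,
)

items = [(1,), (2,), (3,), (4,), (5,)]
print(list(split_into_batches(items, 2)))   # [[(1,), (2,)], [(3,), (4,)], [(5,)]]

print(is_subsequence_in_list((1, 2), (0, 1, 2, 3)))  # True  (contiguous match)
print(is_subsequence_in_list((1, 3), (0, 1, 2, 3)))  # False (non-contiguous)

prev = {(1, 2): 3, (2, 3): 4, (3, 4): 5}
# Joins patterns that overlap on k-1 items; includes (1, 2, 3) and (2, 3, 4)
print(generate_candidates_from_previous(prev))
```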
{gsppy-2.0.1.dist-info → gsppy-2.2.0.dist-info}/METADATA
CHANGED
@@ -1,33 +1,63 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: gsppy
-Version: 2.0
+Version: 2.2.0
 Summary: GSP (Generalized Sequence Pattern) algorithm in Python
-
-Author: Jackson Antonio do Prado Lima
-
-
-
-
-
-
-
-
+Project-URL: Homepage, https://github.com/jacksonpradolima/gsp-py
+Author-email: Jackson Antonio do Prado Lima <jacksonpradolima@gmail.com>
+Maintainer-email: Jackson Antonio do Prado Lima <jacksonpradolima@gmail.com>
+License: MIT License
+
+Copyright (c) 2024 Jackson Antonio do Prado Lima
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+License-File: LICENSE
+Keywords: GSP,data analysis,sequence mining,sequential patterns
 Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Natural Language :: English
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
 Classifier: Topic :: Scientific/Engineering :: Information Analysis
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
-
-Requires-Python: >=3.11
-Description-Content-Type: text/markdown
-License-File: LICENSE
+Requires-Python: >=3.8
 Provides-Extra: dev
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
+Requires-Dist: cython==3.0.11; extra == 'dev'
+Requires-Dist: hatch==1.14.0; extra == 'dev'
+Requires-Dist: hatchling==1.27.0; extra == 'dev'
+Requires-Dist: mypy==1.14.0; extra == 'dev'
+Requires-Dist: pylint==3.3.3; extra == 'dev'
+Requires-Dist: pyright==1.1.391; extra == 'dev'
+Requires-Dist: pytest-benchmark==5.1.0; extra == 'dev'
+Requires-Dist: pytest-cov==6.0.0; extra == 'dev'
+Requires-Dist: pytest==8.3.4; extra == 'dev'
+Requires-Dist: ruff==0.8.4; extra == 'dev'
+Requires-Dist: tox==4.23.2; extra == 'dev'
+Description-Content-Type: text/markdown

 []()
-
+[](https://doi.org/10.5281/zenodo.3333987)

 [](https://pypi.org/project/gsppy/)
 [](https://sonarcloud.io/summary/new_code?id=jacksonpradolima_gsp-py)
@@ -41,6 +71,9 @@ Requires-Dist: pytest-cov==6.0.0; extra == "dev"
 **GSP-Py**: A Python-powered library to mine sequential patterns in large datasets, based on the robust **Generalized
 Sequence Pattern (GSP)** algorithm. Ideal for market basket analysis, temporal mining, and user journey discovery.

+> [!IMPORTANT]
+> GSP-Py is compatible with Python 3.8 and later versions!
+
 ---

 ## 📚 Table of Contents
@@ -89,10 +122,6 @@ sudo apt install python3

 For package dependencies of GSP-Py, they will automatically be installed when using `pip`.

-> [!IMPORTANT]
-> GSP-Py is compatible with Python 3.11 and later versions.
-> We didn't test it on Python 3.10 or earlier versions.
-
 ---

 ## 🚀 Installation
@@ -121,16 +150,45 @@ pip install gsppy

 ## 🛠️ Developer Installation

-
-linting).
+This project uses [Rye](https://github.com/mitsuhiko/rye) for managing dependencies, running scripts, and setting up the environment. Follow these steps to install and set up Rye for this project:

-
+#### 1. Install Rye
+Run the following command to install Rye:

 ```bash
-
+curl -sSf https://rye.astral.sh/get | bash
+```
+
+If the `~/.rye/bin` directory is not in your PATH, add the following line to your shell configuration file (e.g., `~/.bashrc`, `~/.zshrc`, etc.):
+
+```bash
+export PATH="$HOME/.rye/bin:$PATH"
 ```

-
+Reload your shell configuration file:
+
+```bash
+source ~/.bashrc  # or `source ~/.zshrc`
+```
+
+#### 2. Set Up the Project Environment
+To configure the project environment and install its dependencies, run:
+
+```bash
+rye sync
+```
+
+#### 3. Use Rye Scripts
+Once the environment is set up, you can run the following commands to simplify project tasks:
+
+- Run tests: `rye run test`
+- Format code: `rye run format`
+- Lint code: `rye run lint`
+- Type-check: `rye run typecheck`
+
+#### Notes
+- Rye automatically reads dependencies and scripts from the `pyproject.toml` file.
+- No need for `requirements.txt`, as Rye manages all dependencies!

 ## 💡 Usage

@@ -269,8 +327,8 @@ If GSP-Py contributed to your research or project that led to a publication, we
 @misc{pradolima_gsppy,
   author = {Prado Lima, Jackson Antonio do},
   title = {{GSP-Py - Generalized Sequence Pattern algorithm in Python}},
-  month =
-  year =
+  month = Dec,
+  year = 2024,
   doi = {10.5281/zenodo.3333987},
   url = {https://doi.org/10.5281/zenodo.3333987}
 }
gsppy-2.2.0.dist-info/RECORD
ADDED
@@ -0,0 +1,9 @@
+gsppy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+gsppy/cli.py,sha256=YxBL341LJzb6EN-RBkhW3o4ZCexOGiQXq_aRovKccA8,6790
+gsppy/gsp.py,sha256=CUCC1W5GGlGbWkC_td0qDfnSJiuzbWoMapR0qciejw8,13800
+gsppy/utils.py,sha256=gOT3USxmC0MrBnSHOQ8avxghWmjQe59hS4jNQ3eiENQ,3363
+gsppy-2.2.0.dist-info/METADATA,sha256=1Y8LcuU7engLWoCWFIKRwRMNsgkAawnpvX6s1BoXP_8,12485
+gsppy-2.2.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+gsppy-2.2.0.dist-info/entry_points.txt,sha256=smvmcIWk424ARIGKOC_BM42hpT_SptKPcIeqs-8u8lM,41
+gsppy-2.2.0.dist-info/licenses/LICENSE,sha256=co1jy5VZd1wXOPdUC2uk1hn7zsBm6aJNgVmhPOZ47g8,1086
+gsppy-2.2.0.dist-info/RECORD,,
gsppy/tests/__init__.py
DELETED
File without changes
gsppy/tests/test_cli.py
DELETED
@@ -1,326 +0,0 @@
-"""
-This module contains unit tests for the CLI-related functionality of the `gsppy` package
-and the Generalized Sequential Pattern (GSP) mining algorithm. The tests ensure correctness,
-robustness, and error handling for both file handling and the GSP algorithm implementation.
-
-The tests include:
-1. Validating file input handling for both JSON and CSV formats.
-2. Ensuring proper error handling for invalid or malformed files (JSON, CSV) and unsupported formats.
-3. Testing exceptions for non-existent files.
-4. Verifying the behavior of the GSP algorithm when given valid inputs and configurations.
-5. Checking for appropriate error handling when invalid parameters (e.g., `min_support`)
-   are provided to the GSP algorithm.
-
-Key components tested:
-- `detect_and_read_file`: A method to detect the file type (JSON/CSV) and read transactions from it.
-- `GSP.search`: Validates the sequential pattern mining functionality for valid and invalid `min_support` parameters.
-
-Fixtures are used to create temporary files (valid/invalid JSON and CSV) for reliable testing
-without affecting the file system.
-Pytest is utilized for parametrized testing to improve coverage and reduce redundancy in test cases.
-"""
-import json
-import os
-import runpy
-import sys
-import tempfile
-from unittest.mock import patch
-
-import pytest
-
-from gsppy.cli import detect_and_read_file, main
-from gsppy.gsp import GSP
-
-
-def test_invalid_json_structure():
-    """
-    Test if a JSON file with an invalid structure raises an error.
-    """
-    # Create an invalid JSON structure that does not adhere to the expected format
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".json", mode="w") as temp_file:
-        temp_file.write(json.dumps({"invalid": "data"}))
-        temp_file_name = temp_file.name
-
-    # Attempt to read the invalid JSON file
-    with pytest.raises(ValueError, match="File should contain a JSON array of transaction lists."):
-        detect_and_read_file(temp_file_name)
-
-    # Cleanup
-    os.unlink(temp_file_name)
-
-
-@pytest.fixture
-def valid_json_file():
-    """Fixture to create a valid JSON file."""
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".json", mode="w") as temp_file:
-        json.dump([["Bread", "Milk"], ["Milk", "Diaper"], ["Bread", "Diaper", "Beer"]], temp_file)
-        temp_file_name = temp_file.name
-    yield temp_file_name
-    os.unlink(temp_file_name)
-
-
-@pytest.fixture
-def valid_csv_file():
-    """Fixture to create a valid CSV file."""
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as temp_file:
-        temp_file.write(b"Bread,Milk\nMilk,Diaper\nBread,Diaper,Beer\n")
-        temp_file_name = temp_file.name
-    yield temp_file_name
-    os.unlink(temp_file_name)
-
-
-@pytest.fixture
-def invalid_json_file():
-    """Fixture to create an invalid JSON file."""
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".json") as temp_file:
-        temp_file.write(b"{invalid_json: true")  # Malformed JSON
-        temp_file_name = temp_file.name
-    yield temp_file_name
-    os.unlink(temp_file_name)
-
-
-@pytest.fixture
-def invalid_csv_file():
-    """Fixture to create an invalid CSV file."""
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as temp_file:
-        temp_file.write(b",,\nBread,,Milk\n")  # Broken format
-        temp_file_name = temp_file.name
-    yield temp_file_name
-    os.unlink(temp_file_name)
-
-
-@pytest.fixture
-def unsupported_file():
-    """Fixture to create an unsupported file."""
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as temp_file:
-        temp_file.write(b"This is a plain text file.")
-        temp_file_name = temp_file.name
-    yield temp_file_name
-    os.unlink(temp_file_name)
-
-
-def test_valid_json_file(valid_json_file):
-    """Test if a valid JSON file is correctly read."""
-    transactions = detect_and_read_file(valid_json_file)
-    assert transactions == [["Bread", "Milk"], ["Milk", "Diaper"], ["Bread", "Diaper", "Beer"]]
-
-
-def test_valid_csv_file(valid_csv_file):
-    """Test if a valid CSV file is correctly read."""
-    transactions = detect_and_read_file(valid_csv_file)
-    assert transactions == [["Bread", "Milk"], ["Milk", "Diaper"], ["Bread", "Diaper", "Beer"]]
-
-
-def test_invalid_json_file(invalid_json_file):
-    """Test if an invalid JSON file raises an error."""
-    with pytest.raises(ValueError, match="Error reading transaction data from JSON file"):
-        detect_and_read_file(invalid_json_file)
-
-
-def test_invalid_csv_file(invalid_csv_file):
-    """Test if an invalid CSV file raises an error."""
-    with pytest.raises(ValueError, match="Error reading transaction data from CSV file"):
-        detect_and_read_file(invalid_csv_file)
-
-
-def test_unsupported_file_format(unsupported_file):
-    """Test if an unsupported file format raises an error."""
-    with pytest.raises(ValueError, match="Unsupported file format"):
-        detect_and_read_file(unsupported_file)
-
-
-def test_non_existent_file():
-    """Test if a non-existent file raises an error."""
-    with pytest.raises(ValueError, match="File 'non_existent_file.json' does not exist."):
-        detect_and_read_file("non_existent_file.json")
-
-
-@pytest.mark.parametrize("min_support", [-0.1, 1.1])
-def test_invalid_min_support_gsp(min_support):
-    """Test if invalid min_support values raise an error."""
-    transactions = [["Bread", "Milk"], ["Milk", "Diaper"], ["Bread", "Diaper", "Beer"]]
-    gsp = GSP(transactions)
-    with pytest.raises(ValueError):
-        gsp.search(min_support=min_support)
-
-
-@pytest.mark.parametrize("min_support", [0.5])
-def test_valid_min_support_gsp(min_support):
-    """Test if valid min_support values work with the GSP algorithm."""
-    transactions = [["Bread", "Milk"], ["Milk", "Diaper"], ["Bread", "Diaper", "Beer"]]
-    gsp = GSP(transactions)
-    patterns = gsp.search(min_support=min_support)
-    assert len(patterns) > 0  # Ensure at least some patterns are found
-    assert patterns[0]  # Ensure frequent patterns are not empty
-
-
-def test_main_invalid_json_file(monkeypatch, capfd):
-    """
-    Test `main()` with a JSON file that has an invalid structure.
-    """
-    # Create an invalid JSON file
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".json", mode="w") as temp_file:
-        temp_file.write(json.dumps({"invalid": "data"}))
-        temp_file_name = temp_file.name
-
-    # Mock CLI arguments
-    monkeypatch.setattr(
-        'sys.argv', ['main', '--file', temp_file_name, '--min_support', '0.2']
-    )
-
-    main()
-
-    # Capture output
-    captured = capfd.readouterr()
-    assert "File should contain a JSON array of transaction lists." in captured.out
-
-    # Cleanup
-    os.unlink(temp_file_name)
-
-
-def test_main_non_existent_file(monkeypatch, capfd):
-    """
-    Test `main()` with a file that does not exist.
-    """
-    # Mock CLI arguments
-    monkeypatch.setattr(
-        'sys.argv', ['main', '--file', 'non_existent.json', '--min_support', '0.2']
-    )
-
-    main()
-
-    # Capture output
-    captured = capfd.readouterr()
-    assert "File 'non_existent.json' does not exist." in captured.out
-
-
-def test_main_valid_json_file(monkeypatch, capfd):
-    """
-    Test `main()` with a valid JSON file.
-    """
-    # Create a valid JSON file
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".json", mode="w") as temp_file:
-        json.dump([["Bread", "Milk"], ["Milk", "Diaper"], ["Bread", "Diaper", "Beer"]], temp_file)
-        temp_file_name = temp_file.name
-
-    # Mock CLI arguments
-    monkeypatch.setattr(
-        'sys.argv', ['main', '--file', temp_file_name, '--min_support', '0.2']
-    )
-
-    main()
-
-    # Capture output
-    captured = capfd.readouterr()
-    assert "Frequent Patterns Found:" in captured.out
-
-    # Cleanup
-    os.unlink(temp_file_name)
-
-
-def test_main_invalid_min_support(monkeypatch, capfd):
-    """
-    Test `main()` with an invalid `min_support` value.
-    """
-    # Create a valid JSON file
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".json", mode="w") as temp_file:
-        json.dump([["Bread", "Milk"], ["Milk", "Diaper"], ["Bread", "Diaper", "Beer"]], temp_file)
-        temp_file_name = temp_file.name
-
-    # Mock CLI arguments
-    monkeypatch.setattr(
-        'sys.argv', ['main', '--file', temp_file_name, '--min_support', '-1.0']  # Invalid min_support
-    )
-
-    main()
-
-    # Capture output
-    captured = capfd.readouterr()
-    assert "Error: min_support must be in the range (0.0, 1.0]." in captured.out
-
-    # Cleanup
-    os.unlink(temp_file_name)
-
-
-def test_main_entry_point(monkeypatch, capfd):
-    """
-    Test the script entry point (`if __name__ == '__main__': main()`).
-    """
-    # Create a valid JSON file
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".json", mode="w") as temp_file:
-        json.dump([["Bread", "Milk"], ["Milk", "Diaper"], ["Bread", "Diaper", "Beer"]], temp_file)
-        temp_file_name = temp_file.name
-
-    # Mock CLI arguments - Simulating script call
-    monkeypatch.setattr(
-        'sys.argv', ['gsppy.cli', '--file', temp_file_name, '--min_support', '0.2']
-    )
-
-    # Remove the module from sys.modules before running it
-    if 'gsppy.cli' in sys.modules:
-        del sys.modules['gsppy.cli']
-
-    # Use `runpy` to execute the script as if it were run from the command line
-    runpy.run_module('gsppy.cli', run_name='__main__')
-
-    # Capture the output
-    captured = capfd.readouterr()
-    assert "Frequent Patterns Found:" in captured.out
-
-    # Cleanup
-    os.unlink(temp_file_name)
-
-
-def test_main_edge_case_min_support(monkeypatch, capfd):
-    """
-    Test `main()` with edge-case values for `min_support` (valid and invalid).
-    """
-    # Create a valid JSON
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".json", mode="w") as temp_file:
-        json.dump([["Bread", "Milk"], ["Milk", "Diaper"], ["Bread", "Diaper", "Beer"]], temp_file)
-        temp_file_name = temp_file.name
-
-    # Case 1: `min_support` = 1.0 (Valid Edge Case)
-    monkeypatch.setattr(
-        'sys.argv', ['main', '--file', temp_file_name, '--min_support', '1.0']
-    )
-    main()
-    captured = capfd.readouterr()
-    assert "Frequent Patterns Found:" in captured.out
-
-    # Case 2: `min_support` = -1.0 (Invalid Edge Case)
-    monkeypatch.setattr(
-        'sys.argv', ['main', '--file', temp_file_name, '--min_support', '-1.0']
-    )
-    main()
-    captured = capfd.readouterr()
-    assert "Error: min_support must be in the range (0.0, 1.0]." in captured.out
-
-    # Cleanup
-    os.unlink(temp_file_name)
-
-
-def test_main_gsp_exception(monkeypatch, capfd):
-    """
-    Test `main()` when the GSP algorithm raises an exception.
-    """
-    # Step 1: Create a valid JSON file
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".json", mode="w") as temp_file:
-        json.dump([["Bread", "Milk"], ["Milk", "Diaper"], ["Bread", "Diaper", "Beer"]], temp_file)
-        temp_file_name = temp_file.name
-
-    # Step 2: Mock CLI arguments
-    monkeypatch.setattr(
-        'sys.argv', ['main', '--file', temp_file_name, '--min_support', '0.2']
-    )
-
-    # Step 3: Mock GSP.search to raise an exception
-    with patch('gsppy.gsp.GSP.search', side_effect=Exception("Simulated GSP failure")):
-        main()
-
-    # Step 4: Capture output and assert the error message
-    captured = capfd.readouterr()
-    assert "Error executing GSP algorithm: Simulated GSP failure" in captured.out
-
-    # Step 5: Cleanup
-    os.unlink(temp_file_name)
gsppy/tests/test_gsp.py
DELETED
@@ -1,250 +0,0 @@
-"""
-Unit tests for the GSP (Generalized Sequential Pattern) algorithm.
-
-This module contains tests for various scenarios including edge cases,
-benchmarking, and normal use cases of the GSP algorithm. The tests use
-`pytest` for assertions and include fixtures for reusable data.
-
-Tests include:
-- Empty transactions.
-- Single transaction.
-- High minimum support filtering.
-- Typical supermarket transactions with known frequent patterns.
-- Randomly generated transactions for flexibility.
-- Large transactions with repetitive items.
-- Partial matches and benchmarking.
-
-Author: Jackson Antonio do Prado Lima
-Email: jacksonpradolima@gmail.com
-"""
-import random
-import re
-
-import pytest
-
-from gsppy.gsp import GSP
-
-
-@pytest.fixture
-def supermarket_transactions():
-    """
-    Fixture to provide a dataset representing supermarket transactions.
-
-    Returns:
-        list: A list of transactions, where each transaction is a list of items.
-    """
-    return [
-        ['Bread', 'Milk'],
-        ['Bread', 'Diaper', 'Beer', 'Eggs'],
-        ['Milk', 'Diaper', 'Beer', 'Coke'],
-        ['Bread', 'Milk', 'Diaper', 'Beer'],
-        ['Bread', 'Milk', 'Diaper', 'Coke']
-    ]
-
-
-@pytest.fixture
-def random_transactions():
-    """
-    Fixture to generate a random dataset of transactions.
-
-    Returns:
-        list: A list of transactions with random items and varying lengths.
-    """
-    return [[random.choice(['A', 'B', 'C', 'D', 'E']) for _ in range(random.randint(2, 10))] for _ in range(100)]
-
-
-def test_empty_transactions():
-    """
-    Test the GSP algorithm with an empty dataset.
-
-    Asserts:
-        - A ValueError is raised indicating that the dataset is empty.
-    """
-    transactions = []
-    with pytest.raises(ValueError, match="Input transactions are empty"):
-        GSP(transactions)
-
-
-def test_single_transaction():
-    """
-    Test the GSP algorithm with a single transaction.
-
-    Asserts:
-        - A ValueError is raised indicating that GSP requires multiple transactions.
-    """
-    transactions = [['A', 'B', 'C']]
-    with pytest.raises(ValueError, match="GSP requires multiple transactions"):
-        GSP(transactions)
-
-
-def test_invalid_transaction_format():
-    """
-    Test the GSP algorithm with invalid transaction formats.
-
-    Asserts:
-        - A ValueError is raised indicating that the transactions must be lists of lists.
-    """
-    invalid_data = ["A", "B"]  # Invalid format: not a list of lists
-    with pytest.raises(ValueError, match="The dataset must be a list of transactions."):
-        GSP(invalid_data)
-
-
-@pytest.mark.parametrize(
-    "min_support, expected_error",
-    [
-        (-0.1, re.escape("Minimum support must be in the range (0.0, 1.0]")),
-        (0.0, re.escape("Minimum support must be in the range (0.0, 1.0]")),
-        (1.1, re.escape("Minimum support must be in the range (0.0, 1.0]")),
-    ]
-)
-def test_invalid_min_support(supermarket_transactions, min_support, expected_error):
-    """
-    Test the GSP algorithm with invalid minimum support values.
-
-    Asserts:
-        - A ValueError is raised if the min_support is outside the valid range.
-    """
-    gsp = GSP(supermarket_transactions)
-    with pytest.raises(ValueError, match=expected_error):
-        gsp.search(min_support=min_support)
-
-
-def test_valid_min_support_edge(supermarket_transactions):
-    """
-    Test the GSP algorithm with a valid edge value for min_support.
-
-    Asserts:
-        - The algorithm runs successfully when min_support is set to 1.0.
-    """
-    gsp = GSP(supermarket_transactions)
-    result = gsp.search(min_support=1.0)  # Only patterns supported by ALL transactions should remain
-    assert not result, "Expected no frequent patterns with min_support = 1.0"
-
-
-def test_min_support_valid(supermarket_transactions):
-    """
-    Test the GSP algorithm with a minimum support set just above 0.0.
-
-    Asserts:
-        - Frequent patterns are generated correctly for a low min_support threshold.
-    """
-    gsp = GSP(supermarket_transactions)
-    result = gsp.search(min_support=0.2)  # At least 1 transaction should support the pattern
-
-    # All items should appear as 1-item patterns
-    level_1_patterns = {('Bread',), ('Milk',), ('Diaper',), ('Beer',), ('Coke',), ('Eggs',)}
-    result_level_1 = set(result[0].keys())  # Extract patterns from Level 1
-
-    assert result_level_1 == level_1_patterns, f"Level 1 patterns mismatch. Got {result_level_1}"
-
-
-def test_no_frequent_items(supermarket_transactions):
-    """
-    Test the GSP algorithm with a high minimum support value.
-
-    Asserts:
-        - The result should be an empty list due to filtering out all items.
-    """
-    gsp = GSP(supermarket_transactions)
-    result = gsp.search(min_support=0.9)  # High minimum support
-    assert not result, "High minimum support should filter out all items."
-
-
-def test_worker_batch_static_method(supermarket_transactions):
-    """
-    Test the _worker_batch method directly for checkpoint validation.
-
-    Asserts:
-        - Candidates below the minimum support are filtered out.
-        - Candidates meeting the minimum support are returned with correct counts.
-    """
-    batch = [('Bread',), ('Milk',), ('Diaper',), ('Eggs',)]  # 1-sequence candidates
-    transactions = [tuple(t) for t in supermarket_transactions]
-    min_support = 3  # Absolute support count
-    expected = [(('Bread',), 4), (('Milk',), 4), (('Diaper',), 4)]
-
-    # Call the '_worker_batch' method
-    # This test accesses `_worker_batch` to test internal functionality
-    results = GSP._worker_batch(batch, transactions, min_support)  # pylint: disable=protected-access
-    assert results == expected, f"Expected results {expected}, but got {results}"
-
-
-def test_frequent_patterns(supermarket_transactions):
-    """
-    Test the GSP algorithm with supermarket transactions and a realistic minimum support.
-
-    Asserts:
-        - The frequent patterns should match the expected result.
-    """
-    gsp = GSP(supermarket_transactions)
-    result = gsp.search(min_support=0.3)
-    expected = [
-        {('Bread',): 4, ('Milk',): 4, ('Diaper',): 4, ('Beer',): 3, ('Coke',): 2},
-        {('Bread', 'Milk'): 3, ('Milk', 'Diaper'): 3, ('Diaper', 'Beer'): 3},
-        {('Bread', 'Milk', 'Diaper'): 2, ('Milk', 'Diaper', 'Beer'): 2}
-    ]
-    assert result == expected, "Frequent patterns do not match expected results."
-
-
-def test_random_transactions(random_transactions):
-    """
-    Test the GSP algorithm with a random dataset.
-
-    Asserts:
-        - The result should contain some frequent patterns with a low minimum support.
-    """
-    gsp = GSP(random_transactions)
-    result = gsp.search(min_support=0.1)  # Low support to ensure some patterns emerge
-    assert len(result) > 0, "Random transactions should yield some frequent patterns with low min_support."
-
-
-def test_large_transactions():
-    """
-    Test the GSP algorithm with a large single transaction.
-
-    Asserts:
-        - A ValueError is raised indicating that GSP requires multiple transactions.
-    """
-    transactions = [['A'] * 1000]  # Single transaction with 1000 identical items
-    with pytest.raises(ValueError, match="GSP requires multiple transactions to find meaningful patterns."):
-        GSP(transactions)
-
-
-def test_partial_match(supermarket_transactions):
-    """
-    Test the GSP algorithm with additional partial matches.
-
-    Asserts:
-        - Frequent patterns are generated correctly for the given transactions.
-    """
-    transactions = supermarket_transactions + [['Diaper', 'Milk']]
-    gsp = GSP(transactions)
-    result = gsp.search(min_support=0.3)  # Adjusted minimum support to match more patterns
-
-    # Debug output to inspect generated frequent patterns
-    print("Generated frequent patterns:", result)
-
-    # Check for the presence of valid frequent patterns
-    expected_patterns_level_1 = {('Bread',), ('Milk',), ('Diaper',), ('Beer',)}
-    expected_patterns_level_2 = {('Bread', 'Milk'), ('Milk', 'Diaper'), ('Diaper', 'Beer')}
-
-    # Convert results to sets for easier comparison
-    result_level_1 = set(result[0].keys())
-    assert result_level_1 >= expected_patterns_level_1, f"Level 1 patterns mismatch. Got {result_level_1}"
-
-    # Add a condition to avoid IndexError for empty results
-    if len(result) > 1:
-        result_level_2 = set(result[1].keys())
-        assert result_level_2 >= expected_patterns_level_2, f"Level 2 patterns mismatch. Got {result_level_2}"
-
-
-@pytest.mark.parametrize("min_support", [0.1, 0.2, 0.3, 0.4, 0.5])
-def test_benchmark(benchmark, supermarket_transactions, min_support):
-    """
-    Benchmark the GSP algorithm's performance using the supermarket dataset.
-
-    Uses:
-        pytest-benchmark: To measure execution time.
-    """
-    gsp = GSP(supermarket_transactions)
-    benchmark(gsp.search, min_support=min_support)
gsppy/tests/test_utils.py
DELETED
@@ -1,91 +0,0 @@
-"""
-Test suite for utility functions in the utils module.
-
-This module tests the following functions:
-1. `split_into_batches`: Ensures a list of items is properly split into smaller batches for efficient processing.
-2. `is_subsequence_in_list`: Validates the detection of subsequences within a given list.
-3. `generate_joined_candidates`: Tests the logic for generating candidate sequences by joining frequent patterns.
-
-Each function is tested for standard cases, edge cases, and error handling to ensure robustness.
-"""
-from gsppy.utils import split_into_batches, is_subsequence_in_list, generate_candidates_from_previous
-
-
-def test_split_into_batches():
-    """
-    Test the `split_into_batches` utility function.
-    """
-    # Test with exact batches
-    items = [(1,), (2,), (3,), (4,), (5,)]
-    batch_size = 2
-    result = list(split_into_batches(items, batch_size))
-    assert result == [[(1,), (2,)], [(3,), (4,)], [(5,)]], "Failed exact batch split"
-
-    # Test with a batch size greater than the number of items
-    batch_size = 10
-    result = list(split_into_batches(items, batch_size))
-    assert result == [items], "Failed large batch size handling"
-
-    # Test with batch size of 1
-    batch_size = 1
-    result = list(split_into_batches(items, batch_size))
-    assert result == [[(1,)], [(2,)], [(3,)], [(4,)], [(5,)]], "Failed batch size of 1"
-
-    # Test empty input
-    items = []
-    batch_size = 3
-    result = list(split_into_batches(items, batch_size))
-    assert not result, "Failed empty input"
-
-
-def test_is_subsequence_in_list():
-    """
-    Test the `is_subsequence_in_list` utility function.
-    """
-    # Test when the subsequence is present
-    assert is_subsequence_in_list((1, 2), (0, 1, 2, 3)), "Failed to find subsequence"
-    assert is_subsequence_in_list((3,), (0, 1, 2, 3)), "Failed single-element subsequence"
-
-    # Test when the subsequence is not present
-    assert not is_subsequence_in_list((1, 3), (0, 1, 2, 3)), "Incorrectly found non-contiguous subsequence"
-    assert not is_subsequence_in_list((4,), (0, 1, 2, 3)), "Incorrectly found non-existent subsequence"
-
-    # Test when input sequence or subsequence is empty
-    assert not is_subsequence_in_list((), (0, 1, 2, 3)), "Incorrect positive result for empty subsequence"
-    assert not is_subsequence_in_list((1,), ()), "Incorrect positive result for empty sequence"
-
-    # Test when subsequence length exceeds sequence
-    assert not is_subsequence_in_list((1, 2, 3, 4), (1, 2, 3)), "Failed to reject long subsequence"
-
-
-def test_generate_candidates_from_previous():
-    """
-    Test the `generate_candidates_from_previous` utility function.
-    """
-    # Test if candidates are generated correctly
-    prev_patterns = {
-        (1, 2): 3,
-        (2, 3): 4,
-        (3, 4): 5,
-        (1, 3): 2  # Non-joinable with others as a k-1 match
-    }
-    result = set(generate_candidates_from_previous(prev_patterns))
-
-    # Expected candidates: joining (1, 2) with (2, 3) and (2, 3) with (3, 4)
-    expected = {(1, 2, 3), (2, 3, 4)}
-    assert expected.issubset(result), f"Missing expected candidates. Got {result}, expected at least {expected}"
-
-    # Test with no joinable patterns
-    prev_patterns = {
-        (1,): 3,
-        (2,): 4
-    }
-    result = set(generate_candidates_from_previous(prev_patterns))
-
-    # For single-element disjoint patterns, candidates may still be generated but GSP will filter later
-    assert result == {(1, 2), (2, 1)}, f"Unexpected disjoint candidates. Got {result}"
-
-    # Test with empty patterns
-    prev_patterns = {}
-    result = set(generate_candidates_from_previous(prev_patterns))
-    assert result == set(), f"Failed empty input handling. Got {result}"
gsppy-2.0.1.dist-info/RECORD
DELETED
@@ -1,14 +0,0 @@
-gsppy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-gsppy/cli.py,sha256=3G20xK79LeIq3jA_UnajEjcuVAebBuZwzefNRnW1o9Q,6090
-gsppy/gsp.py,sha256=wTsVPziYqYuGP0tkeUBwV-nIo1OF83tzv7WF_VVAbZM,13641
-gsppy/utils.py,sha256=K-oIwE6XDi-dsIlFzBlVPa1f2A_2DrUZW4hMiThRPAo,3350
-gsppy/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-gsppy/tests/test_cli.py,sha256=HpzCiESyIA_wsCQh7NMlHbT4k3GQ72TU9J7rEdllP5I,11425
-gsppy/tests/test_gsp.py,sha256=SnWw4hlp-F453zJGnWUHW3A9iqhPyUTYHrmGPH5fTm4,8794
-gsppy/tests/test_utils.py,sha256=Z27IefPYSVKg-dGOmnUO9tvAcd5OQMDyKVq3HAy3XtQ,3697
-gsppy-2.0.1.dist-info/LICENSE,sha256=co1jy5VZd1wXOPdUC2uk1hn7zsBm6aJNgVmhPOZ47g8,1086
-gsppy-2.0.1.dist-info/METADATA,sha256=LM1ouw47MAzkUFtcomwz2UHbbFihdXbVfEwCsoz-l6Y,9925
-gsppy-2.0.1.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
-gsppy-2.0.1.dist-info/entry_points.txt,sha256=smvmcIWk424ARIGKOC_BM42hpT_SptKPcIeqs-8u8lM,41
-gsppy-2.0.1.dist-info/top_level.txt,sha256=sovAgdiFF0V3Dz2pPAwAdIkHeR-ShUchyrH3q8qU120,6
-gsppy-2.0.1.dist-info/RECORD,,
gsppy-2.0.1.dist-info/top_level.txt
DELETED
@@ -1 +0,0 @@
-gsppy
{gsppy-2.0.1.dist-info → gsppy-2.2.0.dist-info}/entry_points.txt
File without changes
{gsppy-2.0.1.dist-info → gsppy-2.2.0.dist-info/licenses}/LICENSE
File without changes