gsppy 2.2.0__py3-none-any.whl → 3.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gsppy/accelerate.py +269 -0
- gsppy/cli.py +43 -49
- gsppy/gsp.py +51 -28
- gsppy/utils.py +4 -5
- {gsppy-2.2.0.dist-info → gsppy-3.0.0.dist-info}/METADATA +231 -42
- gsppy-3.0.0.dist-info/RECORD +10 -0
- {gsppy-2.2.0.dist-info → gsppy-3.0.0.dist-info}/licenses/LICENSE +1 -1
- gsppy-2.2.0.dist-info/RECORD +0 -9
- {gsppy-2.2.0.dist-info → gsppy-3.0.0.dist-info}/WHEEL +0 -0
- {gsppy-2.2.0.dist-info → gsppy-3.0.0.dist-info}/entry_points.txt +0 -0
gsppy/accelerate.py
ADDED
|
@@ -0,0 +1,269 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Optional acceleration layer for GSP support counting.
|
|
3
|
+
|
|
4
|
+
This module attempts to use a Rust extension for the hot loop
|
|
5
|
+
(support counting via contiguous subsequence search). If the Rust
|
|
6
|
+
module is unavailable, it gracefully falls back to the pure-Python
|
|
7
|
+
implementation.
|
|
8
|
+
|
|
9
|
+
Control backend via env var:
|
|
10
|
+
- GSPPY_BACKEND=rust -> require Rust extension (raise if missing)
|
|
11
|
+
- GSPPY_BACKEND=python -> force Python implementation
|
|
12
|
+
- unset/other -> try Rust first, then fallback to Python
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import os
|
|
18
|
+
from typing import Any, Dict, List, Tuple, Optional, cast
|
|
19
|
+
|
|
20
|
+
from .utils import split_into_batches, is_subsequence_in_list
|
|
21
|
+
|
|
22
|
+
# Optional GPU (CuPy) support
|
|
23
|
+
_gpu_available = False
|
|
24
|
+
try: # pragma: no cover - optional dependency path
|
|
25
|
+
import cupy as _cp_mod # type: ignore[import-not-found]
|
|
26
|
+
|
|
27
|
+
cp = cast(Any, _cp_mod)
|
|
28
|
+
|
|
29
|
+
try:
|
|
30
|
+
_gpu_available = cp.cuda.runtime.getDeviceCount() > 0 # type: ignore[attr-defined]
|
|
31
|
+
except Exception:
|
|
32
|
+
_gpu_available = False
|
|
33
|
+
except Exception: # pragma: no cover - optional dependency path
|
|
34
|
+
cp = None # type: ignore[assignment]
|
|
35
|
+
_gpu_available = False
|
|
36
|
+
|
|
37
|
+
# Simple per-process cache for encoded transactions keyed by the list object's id
|
|
38
|
+
_ENCODED_CACHE: Dict[int, Tuple[List[List[int]], Dict[int, str], Dict[str, int], int]] = {}
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _get_encoded_transactions(
|
|
42
|
+
transactions: List[Tuple[str, ...]],
|
|
43
|
+
) -> Tuple[List[List[int]], Dict[int, str], Dict[str, int]]:
|
|
44
|
+
"""Return encoded transactions using a small in-memory cache.
|
|
45
|
+
|
|
46
|
+
Cache key is the id() of the transactions list and we also track the number of
|
|
47
|
+
transactions to detect trivial changes. This assumes transactions aren't mutated after
|
|
48
|
+
GSP is constructed (which is the common case).
|
|
49
|
+
"""
|
|
50
|
+
key = id(transactions)
|
|
51
|
+
cached = _ENCODED_CACHE.get(key)
|
|
52
|
+
if cached is not None:
|
|
53
|
+
enc_tx, inv_vocab, vocab, n_tx = cached
|
|
54
|
+
if n_tx == len(transactions):
|
|
55
|
+
return enc_tx, inv_vocab, vocab
|
|
56
|
+
enc_tx, inv_vocab, vocab = _encode_transactions(transactions)
|
|
57
|
+
_ENCODED_CACHE[key] = (enc_tx, inv_vocab, vocab, len(transactions))
|
|
58
|
+
return enc_tx, inv_vocab, vocab
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
# Try importing the Rust extension
|
|
62
|
+
_rust_available = False
|
|
63
|
+
_compute_supports_rust: Any = None
|
|
64
|
+
try:
|
|
65
|
+
from _gsppy_rust import compute_supports_py as _compute_supports_rust # type: ignore
|
|
66
|
+
|
|
67
|
+
_rust_available = True
|
|
68
|
+
except Exception:
|
|
69
|
+
_compute_supports_rust = None
|
|
70
|
+
_rust_available = False
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _env_backend() -> str:
|
|
74
|
+
return os.environ.get("GSPPY_BACKEND", "auto").lower()
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _encode_transactions(transactions: List[Tuple[str, ...]]) -> Tuple[List[List[int]], Dict[int, str], Dict[str, int]]:
|
|
78
|
+
"""Encode transactions of strings into integer IDs.
|
|
79
|
+
|
|
80
|
+
Parameters:
|
|
81
|
+
transactions: List of transactions where each transaction is a tuple of strings.
|
|
82
|
+
|
|
83
|
+
Returns:
|
|
84
|
+
A tuple of:
|
|
85
|
+
- enc_tx: List[List[int]] encoded transactions
|
|
86
|
+
- inv_vocab: Dict[int, str] mapping back from id to original string
|
|
87
|
+
- vocab: Dict[str, int] mapping from original string to integer id
|
|
88
|
+
"""
|
|
89
|
+
vocab: Dict[str, int] = {}
|
|
90
|
+
enc_tx: List[List[int]] = []
|
|
91
|
+
for t in transactions:
|
|
92
|
+
row: List[int] = []
|
|
93
|
+
for s in t:
|
|
94
|
+
if s not in vocab:
|
|
95
|
+
vocab[s] = len(vocab)
|
|
96
|
+
row.append(vocab[s])
|
|
97
|
+
enc_tx.append(row)
|
|
98
|
+
inv_vocab = {v: k for k, v in vocab.items()}
|
|
99
|
+
return enc_tx, inv_vocab, vocab
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def _encode_candidates(candidates: List[Tuple[str, ...]], vocab: Dict[str, int]) -> List[List[int]]:
|
|
103
|
+
"""Encode candidate patterns using a provided vocabulary mapping."""
|
|
104
|
+
return [[vocab[s] for s in cand] for cand in candidates]
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def _support_counts_gpu_singletons(
|
|
108
|
+
enc_tx: List[List[int]],
|
|
109
|
+
cand_ids: List[int],
|
|
110
|
+
min_support_abs: int,
|
|
111
|
+
vocab_size: int,
|
|
112
|
+
) -> List[Tuple[List[int], int]]:
|
|
113
|
+
"""GPU-accelerated support counts for singleton candidates using CuPy.
|
|
114
|
+
|
|
115
|
+
This computes the number of transactions containing each candidate item ID.
|
|
116
|
+
It uniquifies items per transaction on CPU to preserve presence semantics,
|
|
117
|
+
then performs a single bincount on GPU.
|
|
118
|
+
"""
|
|
119
|
+
# Ensure one contribution per transaction
|
|
120
|
+
unique_rows: List[List[int]] = [list(set(row)) for row in enc_tx]
|
|
121
|
+
if not unique_rows:
|
|
122
|
+
return []
|
|
123
|
+
|
|
124
|
+
# Flatten to a 1D list of item ids, then move to GPU
|
|
125
|
+
flat: List[int] = [item for row in unique_rows for item in row]
|
|
126
|
+
if not flat:
|
|
127
|
+
return []
|
|
128
|
+
|
|
129
|
+
cp_flat = cp.asarray(flat, dtype=cp.int32) # type: ignore[name-defined]
|
|
130
|
+
counts = cp.bincount(cp_flat, minlength=vocab_size) # type: ignore[attr-defined]
|
|
131
|
+
counts_host: Any = counts.get() # back to host as a NumPy array
|
|
132
|
+
|
|
133
|
+
out: List[Tuple[List[int], int]] = []
|
|
134
|
+
for cid in cand_ids:
|
|
135
|
+
freq = int(counts_host[cid])
|
|
136
|
+
if freq >= min_support_abs:
|
|
137
|
+
out.append(([cid], freq))
|
|
138
|
+
return out
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def support_counts_python(
|
|
142
|
+
transactions: List[Tuple[str, ...]],
|
|
143
|
+
candidates: List[Tuple[str, ...]],
|
|
144
|
+
min_support_abs: int,
|
|
145
|
+
batch_size: int = 100,
|
|
146
|
+
) -> Dict[Tuple[str, ...], int]:
|
|
147
|
+
"""Pure-Python fallback for support counting (single-process).
|
|
148
|
+
|
|
149
|
+
Evaluates each candidate pattern's frequency across all transactions
|
|
150
|
+
using the same contiguous-subsequence semantics as the Rust backend.
|
|
151
|
+
|
|
152
|
+
Note: This implementation is single-process and optimized for simplicity.
|
|
153
|
+
Heavy workloads may benefit from the Rust backend.
|
|
154
|
+
"""
|
|
155
|
+
# Simple non-multiprocessing version to avoid import cycles.
|
|
156
|
+
results: Dict[Tuple[str, ...], int] = {}
|
|
157
|
+
for batch in split_into_batches(candidates, batch_size):
|
|
158
|
+
for cand in batch:
|
|
159
|
+
freq = sum(1 for t in transactions if is_subsequence_in_list(cand, t))
|
|
160
|
+
if freq >= min_support_abs:
|
|
161
|
+
results[cand] = freq
|
|
162
|
+
return results
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def support_counts(
|
|
166
|
+
transactions: List[Tuple[str, ...]],
|
|
167
|
+
candidates: List[Tuple[str, ...]],
|
|
168
|
+
min_support_abs: int,
|
|
169
|
+
batch_size: int = 100,
|
|
170
|
+
backend: Optional[str] = None,
|
|
171
|
+
) -> Dict[Tuple[str, ...], int]:
|
|
172
|
+
"""Choose the best available backend for support counting.
|
|
173
|
+
|
|
174
|
+
Backend selection is controlled by the `backend` argument when provided,
|
|
175
|
+
otherwise by the env var GSPPY_BACKEND:
|
|
176
|
+
- "rust": require Rust extension (raise if missing)
|
|
177
|
+
- "gpu": try GPU path when available (currently singletons optimized),
|
|
178
|
+
fall back to CPU for the rest
|
|
179
|
+
- "python": force pure-Python fallback
|
|
180
|
+
- otherwise: try Rust first and fall back to Python
|
|
181
|
+
"""
|
|
182
|
+
backend_sel = (backend or _env_backend()).lower()
|
|
183
|
+
|
|
184
|
+
if backend_sel == "gpu":
|
|
185
|
+
if not _gpu_available:
|
|
186
|
+
raise RuntimeError("GSPPY_BACKEND=gpu but CuPy GPU is not available")
|
|
187
|
+
# Encode once
|
|
188
|
+
enc_tx, inv_vocab, vocab = _get_encoded_transactions(transactions)
|
|
189
|
+
enc_cands = _encode_candidates(candidates, vocab)
|
|
190
|
+
|
|
191
|
+
# Partition candidates into singletons and non-singletons
|
|
192
|
+
singletons: List[Tuple[int, Tuple[str, ...]]] = []
|
|
193
|
+
others: List[Tuple[List[int], Tuple[str, ...]]] = []
|
|
194
|
+
# Pair original and encoded candidates; lengths should match
|
|
195
|
+
assert len(candidates) == len(enc_cands), "Encoded candidates length mismatch"
|
|
196
|
+
for orig, enc in zip(candidates, enc_cands): # noqa: B905 - lengths checked above
|
|
197
|
+
if len(enc) == 1:
|
|
198
|
+
singletons.append((enc[0], orig))
|
|
199
|
+
else:
|
|
200
|
+
others.append((enc, orig))
|
|
201
|
+
|
|
202
|
+
out: Dict[Tuple[str, ...], int] = {}
|
|
203
|
+
|
|
204
|
+
# GPU path for singletons
|
|
205
|
+
if singletons:
|
|
206
|
+
vocab_size = max(vocab.values()) + 1 if vocab else 0
|
|
207
|
+
gpu_res = _support_counts_gpu_singletons(
|
|
208
|
+
enc_tx=enc_tx,
|
|
209
|
+
cand_ids=[cid for cid, _ in singletons],
|
|
210
|
+
min_support_abs=min_support_abs,
|
|
211
|
+
vocab_size=vocab_size,
|
|
212
|
+
)
|
|
213
|
+
# Map back to original strings
|
|
214
|
+
cand_by_id: Dict[int, Tuple[str, ...]] = {cid: orig for cid, orig in singletons}
|
|
215
|
+
for enc_cand, freq in gpu_res:
|
|
216
|
+
cid = enc_cand[0]
|
|
217
|
+
out[cand_by_id[cid]] = int(freq)
|
|
218
|
+
|
|
219
|
+
# Fallback for others (prefer rust when available)
|
|
220
|
+
if others:
|
|
221
|
+
if _rust_available:
|
|
222
|
+
try:
|
|
223
|
+
other_enc = [enc for enc, _ in others]
|
|
224
|
+
res = cast(
|
|
225
|
+
List[Tuple[List[int], int]], _compute_supports_rust(enc_tx, other_enc, int(min_support_abs))
|
|
226
|
+
)
|
|
227
|
+
for enc_cand, freq in res:
|
|
228
|
+
out[tuple(inv_vocab[i] for i in enc_cand)] = int(freq)
|
|
229
|
+
except Exception:
|
|
230
|
+
# fallback to python
|
|
231
|
+
out.update(
|
|
232
|
+
support_counts_python(transactions, [orig for _, orig in others], min_support_abs, batch_size)
|
|
233
|
+
)
|
|
234
|
+
else:
|
|
235
|
+
out.update(
|
|
236
|
+
support_counts_python(transactions, [orig for _, orig in others], min_support_abs, batch_size)
|
|
237
|
+
)
|
|
238
|
+
|
|
239
|
+
return out
|
|
240
|
+
|
|
241
|
+
if backend_sel == "python":
|
|
242
|
+
return support_counts_python(transactions, candidates, min_support_abs, batch_size)
|
|
243
|
+
|
|
244
|
+
if backend_sel == "rust":
|
|
245
|
+
if not _rust_available:
|
|
246
|
+
raise RuntimeError("GSPPY_BACKEND=rust but Rust extension _gsppy_rust is not available")
|
|
247
|
+
# use rust
|
|
248
|
+
enc_tx, inv_vocab, vocab = _get_encoded_transactions(transactions)
|
|
249
|
+
enc_cands = _encode_candidates(candidates, vocab)
|
|
250
|
+
result = cast(List[Tuple[List[int], int]], _compute_supports_rust(enc_tx, enc_cands, int(min_support_abs)))
|
|
251
|
+
out_rust: Dict[Tuple[str, ...], int] = {}
|
|
252
|
+
for enc_cand, freq in result:
|
|
253
|
+
out_rust[tuple(inv_vocab[i] for i in enc_cand)] = int(freq)
|
|
254
|
+
return out_rust
|
|
255
|
+
|
|
256
|
+
# auto: try rust then fallback
|
|
257
|
+
if _rust_available:
|
|
258
|
+
enc_tx, inv_vocab, vocab = _get_encoded_transactions(transactions)
|
|
259
|
+
enc_cands = _encode_candidates(candidates, vocab)
|
|
260
|
+
try:
|
|
261
|
+
result = cast(List[Tuple[List[int], int]], _compute_supports_rust(enc_tx, enc_cands, int(min_support_abs)))
|
|
262
|
+
out2: Dict[Tuple[str, ...], int] = {}
|
|
263
|
+
for enc_cand, freq in result:
|
|
264
|
+
out2[tuple(inv_vocab[i] for i in enc_cand)] = int(freq)
|
|
265
|
+
return out2
|
|
266
|
+
except Exception:
|
|
267
|
+
pass
|
|
268
|
+
|
|
269
|
+
return support_counts_python(transactions, candidates, min_support_abs, batch_size)
|
gsppy/cli.py
CHANGED
|
@@ -27,14 +27,16 @@ Key Features:
|
|
|
27
27
|
This CLI empowers users to perform sequential pattern mining on transactional data efficiently through
|
|
28
28
|
a simple command-line interface.
|
|
29
29
|
"""
|
|
30
|
+
|
|
30
31
|
import os
|
|
31
32
|
import csv
|
|
32
33
|
import sys
|
|
33
34
|
import json
|
|
34
35
|
import logging
|
|
35
|
-
import argparse
|
|
36
36
|
from typing import Dict, List, Tuple
|
|
37
37
|
|
|
38
|
+
import click
|
|
39
|
+
|
|
38
40
|
from gsppy.gsp import GSP
|
|
39
41
|
|
|
40
42
|
# Configure logging
|
|
@@ -71,7 +73,7 @@ def read_transactions_from_json(file_path: str) -> List[List[str]]:
|
|
|
71
73
|
ValueError: If the file cannot be read or does not contain valid JSON.
|
|
72
74
|
"""
|
|
73
75
|
try:
|
|
74
|
-
with open(file_path,
|
|
76
|
+
with open(file_path, "r", encoding="utf-8") as f:
|
|
75
77
|
transactions: List[List[str]] = json.load(f)
|
|
76
78
|
return transactions
|
|
77
79
|
except Exception as e:
|
|
@@ -95,7 +97,7 @@ def read_transactions_from_csv(file_path: str) -> List[List[str]]:
|
|
|
95
97
|
"""
|
|
96
98
|
try:
|
|
97
99
|
transactions: List[List[str]] = []
|
|
98
|
-
with open(file_path, newline=
|
|
100
|
+
with open(file_path, newline="", encoding="utf-8") as csvfile:
|
|
99
101
|
reader = csv.reader(csvfile)
|
|
100
102
|
for row in reader:
|
|
101
103
|
# Check if the row is empty
|
|
@@ -138,65 +140,56 @@ def detect_and_read_file(file_path: str) -> List[List[str]]:
|
|
|
138
140
|
raise ValueError("Unsupported file format. Please provide a JSON or CSV file.")
|
|
139
141
|
|
|
140
142
|
|
|
141
|
-
|
|
143
|
+
# Click-based CLI
|
|
144
|
+
@click.command()
|
|
145
|
+
@click.option(
|
|
146
|
+
"--file",
|
|
147
|
+
"file_path",
|
|
148
|
+
required=True,
|
|
149
|
+
type=click.Path(exists=True),
|
|
150
|
+
help="Path to a JSON or CSV file containing transactions.",
|
|
151
|
+
)
|
|
152
|
+
@click.option(
|
|
153
|
+
"--min_support",
|
|
154
|
+
default=0.2,
|
|
155
|
+
show_default=True,
|
|
156
|
+
type=float,
|
|
157
|
+
help="Minimum support threshold as a fraction of total transactions.",
|
|
158
|
+
)
|
|
159
|
+
@click.option(
|
|
160
|
+
"--backend",
|
|
161
|
+
type=click.Choice(["auto", "python", "rust", "gpu"], case_sensitive=False),
|
|
162
|
+
default="auto",
|
|
163
|
+
show_default=True,
|
|
164
|
+
help="Backend to use for support counting.",
|
|
165
|
+
)
|
|
166
|
+
@click.option("--verbose", is_flag=True, help="Enable verbose output for debugging purposes.")
|
|
167
|
+
def main(file_path: str, min_support: float, backend: str, verbose: bool) -> None:
|
|
142
168
|
"""
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
Arguments:
|
|
146
|
-
- `--file` (str): Path to a JSON or CSV file containing transactions.
|
|
147
|
-
- `--min_support` (float): Minimum support threshold (default: 0.2).
|
|
169
|
+
Run the GSP algorithm on transactional data from a file.
|
|
148
170
|
"""
|
|
149
|
-
|
|
150
|
-
description="GSP (Generalized Sequential Pattern) Algorithm - "
|
|
151
|
-
"Find frequent sequential patterns in transactional data."
|
|
152
|
-
)
|
|
153
|
-
|
|
154
|
-
# Single file argument
|
|
155
|
-
parser.add_argument(
|
|
156
|
-
'--file',
|
|
157
|
-
type=str,
|
|
158
|
-
required=True,
|
|
159
|
-
help='Path to a JSON or CSV file containing transactions (e.g., [["A", "B"], ["B", "C"]] '
|
|
160
|
-
'or CSV rows per transaction)'
|
|
161
|
-
)
|
|
162
|
-
|
|
163
|
-
# Minimum support argument
|
|
164
|
-
parser.add_argument(
|
|
165
|
-
'--min_support',
|
|
166
|
-
type=float,
|
|
167
|
-
default=0.2,
|
|
168
|
-
help="Minimum support threshold as a fraction of total transactions (default: 0.2)"
|
|
169
|
-
)
|
|
170
|
-
|
|
171
|
-
# Verbose output argument
|
|
172
|
-
parser.add_argument(
|
|
173
|
-
'--verbose',
|
|
174
|
-
action='store_true',
|
|
175
|
-
help='Enable verbose output for debugging purposes.'
|
|
176
|
-
)
|
|
177
|
-
|
|
178
|
-
# Parse arguments
|
|
179
|
-
args = parser.parse_args()
|
|
180
|
-
|
|
181
|
-
# Setup logging verbosity
|
|
182
|
-
setup_logging(args.verbose)
|
|
171
|
+
setup_logging(verbose)
|
|
183
172
|
|
|
184
173
|
# Automatically detect and load transactions
|
|
185
174
|
try:
|
|
186
|
-
transactions = detect_and_read_file(
|
|
175
|
+
transactions = detect_and_read_file(file_path)
|
|
187
176
|
except ValueError as e:
|
|
188
177
|
logger.error(f"Error: {e}")
|
|
189
|
-
|
|
178
|
+
sys.exit(1)
|
|
190
179
|
|
|
191
180
|
# Check min_support
|
|
192
|
-
if
|
|
181
|
+
if min_support <= 0.0 or min_support > 1.0:
|
|
193
182
|
logger.error("Error: min_support must be in the range (0.0, 1.0].")
|
|
194
|
-
|
|
183
|
+
sys.exit(1)
|
|
184
|
+
|
|
185
|
+
# Select backend for acceleration layer
|
|
186
|
+
if backend and backend.lower() != "auto":
|
|
187
|
+
os.environ["GSPPY_BACKEND"] = backend.lower()
|
|
195
188
|
|
|
196
189
|
# Initialize and run GSP algorithm
|
|
197
190
|
try:
|
|
198
191
|
gsp = GSP(transactions)
|
|
199
|
-
patterns: List[Dict[Tuple[str, ...], int]] = gsp.search(min_support=
|
|
192
|
+
patterns: List[Dict[Tuple[str, ...], int]] = gsp.search(min_support=min_support)
|
|
200
193
|
logger.info("Frequent Patterns Found:")
|
|
201
194
|
for i, level in enumerate(patterns, start=1):
|
|
202
195
|
logger.info(f"\n{i}-Sequence Patterns:")
|
|
@@ -204,7 +197,8 @@ def main() -> None:
|
|
|
204
197
|
logger.info(f"Pattern: {pattern}, Support: {support}")
|
|
205
198
|
except Exception as e:
|
|
206
199
|
logger.error(f"Error executing GSP algorithm: {e}")
|
|
200
|
+
sys.exit(1)
|
|
207
201
|
|
|
208
202
|
|
|
209
|
-
if __name__ ==
|
|
203
|
+
if __name__ == "__main__":
|
|
210
204
|
main()
|
gsppy/gsp.py
CHANGED
|
@@ -34,11 +34,11 @@ Example Usage:
|
|
|
34
34
|
```python
|
|
35
35
|
# Define the transactional dataset
|
|
36
36
|
transactions = [
|
|
37
|
-
[
|
|
38
|
-
[
|
|
39
|
-
[
|
|
40
|
-
[
|
|
41
|
-
[
|
|
37
|
+
["Bread", "Milk"],
|
|
38
|
+
["Bread", "Diaper", "Beer", "Eggs"],
|
|
39
|
+
["Milk", "Diaper", "Beer", "Coke"],
|
|
40
|
+
["Bread", "Milk", "Diaper", "Beer"],
|
|
41
|
+
["Bread", "Milk", "Diaper", "Coke"],
|
|
42
42
|
]
|
|
43
43
|
|
|
44
44
|
# Initialize GSP with the transactional dataset
|
|
@@ -84,13 +84,16 @@ Version:
|
|
|
84
84
|
--------
|
|
85
85
|
- Current Version: 2.0
|
|
86
86
|
"""
|
|
87
|
+
|
|
88
|
+
import math
|
|
87
89
|
import logging
|
|
88
90
|
import multiprocessing as mp
|
|
89
|
-
from typing import
|
|
91
|
+
from typing import Dict, List, Tuple, Optional
|
|
90
92
|
from itertools import chain
|
|
91
93
|
from collections import Counter
|
|
92
94
|
|
|
93
95
|
from gsppy.utils import split_into_batches, is_subsequence_in_list, generate_candidates_from_previous
|
|
96
|
+
from gsppy.accelerate import support_counts as support_counts_accel
|
|
94
97
|
|
|
95
98
|
logger = logging.getLogger(__name__)
|
|
96
99
|
|
|
@@ -171,14 +174,13 @@ class GSP:
|
|
|
171
174
|
self.max_size = max(len(item) for item in raw_transactions)
|
|
172
175
|
self.transactions: List[Tuple[str, ...]] = [tuple(transaction) for transaction in raw_transactions]
|
|
173
176
|
counts: Counter[str] = Counter(chain.from_iterable(raw_transactions))
|
|
174
|
-
|
|
177
|
+
# Start with singleton candidates (1-sequences)
|
|
178
|
+
self.unique_candidates: List[Tuple[str, ...]] = [(item,) for item in counts.keys()]
|
|
175
179
|
logger.debug("Unique candidates: %s", self.unique_candidates)
|
|
176
180
|
|
|
177
181
|
@staticmethod
|
|
178
182
|
def _worker_batch(
|
|
179
|
-
batch: List[Tuple[str, ...]],
|
|
180
|
-
transactions: List[Tuple[str, ...]],
|
|
181
|
-
min_support: int
|
|
183
|
+
batch: List[Tuple[str, ...]], transactions: List[Tuple[str, ...]], min_support: int
|
|
182
184
|
) -> List[Tuple[Tuple[str, ...], int]]:
|
|
183
185
|
"""
|
|
184
186
|
Evaluate a batch of candidate sequences to compute their support.
|
|
@@ -204,20 +206,15 @@ class GSP:
|
|
|
204
206
|
results.append((item, frequency))
|
|
205
207
|
return results
|
|
206
208
|
|
|
207
|
-
def
|
|
208
|
-
self,
|
|
209
|
-
items: List[Tuple[str, ...]], min_support: float = 0, batch_size: int = 100
|
|
209
|
+
def _support_python(
|
|
210
|
+
self, items: List[Tuple[str, ...]], min_support: int = 0, batch_size: int = 100
|
|
210
211
|
) -> Dict[Tuple[str, ...], int]:
|
|
211
212
|
"""
|
|
212
|
-
Calculate support counts for candidate sequences
|
|
213
|
-
|
|
214
|
-
To improve efficiency, candidate sequences are processed in parallel batches using the
|
|
215
|
-
`multiprocessing` module. Each sequence is checked against transactions, and its support
|
|
216
|
-
count is calculated.
|
|
213
|
+
Calculate support counts for candidate sequences using Python multiprocessing.
|
|
217
214
|
|
|
218
215
|
Parameters:
|
|
219
216
|
items (List[Tuple]): Candidate sequences to evaluate.
|
|
220
|
-
min_support (
|
|
217
|
+
min_support (int): Absolute minimum support count required for a sequence to be considered frequent.
|
|
221
218
|
batch_size (int): Maximum number of candidates to process per batch.
|
|
222
219
|
|
|
223
220
|
Returns:
|
|
@@ -231,12 +228,30 @@ class GSP:
|
|
|
231
228
|
with mp.Pool(processes=mp.cpu_count()) as pool:
|
|
232
229
|
batch_results = pool.starmap(
|
|
233
230
|
self._worker_batch, # Process a batch at a time
|
|
234
|
-
[(batch, self.transactions, min_support) for batch in batches]
|
|
231
|
+
[(batch, self.transactions, min_support) for batch in batches],
|
|
235
232
|
)
|
|
236
233
|
|
|
237
234
|
# Flatten the list of results and convert to a dictionary
|
|
238
235
|
return {item: freq for batch in batch_results for item, freq in batch}
|
|
239
236
|
|
|
237
|
+
def _support(
|
|
238
|
+
self,
|
|
239
|
+
items: List[Tuple[str, ...]],
|
|
240
|
+
min_support: int = 0,
|
|
241
|
+
batch_size: int = 100,
|
|
242
|
+
backend: Optional[str] = None,
|
|
243
|
+
) -> Dict[Tuple[str, ...], int]:
|
|
244
|
+
"""
|
|
245
|
+
Calculate support counts for candidate sequences using the fastest available backend.
|
|
246
|
+
This will try the Rust extension if available (and configured), otherwise fall back to
|
|
247
|
+
the Python multiprocessing implementation.
|
|
248
|
+
"""
|
|
249
|
+
try:
|
|
250
|
+
return support_counts_accel(self.transactions, items, min_support, batch_size, backend=backend)
|
|
251
|
+
except Exception:
|
|
252
|
+
# Fallback to Python implementation on any acceleration failure
|
|
253
|
+
return self._support_python(items, min_support, batch_size)
|
|
254
|
+
|
|
240
255
|
def _print_status(self, run: int, candidates: List[Tuple[str, ...]]) -> None:
|
|
241
256
|
"""
|
|
242
257
|
Log progress information for the current GSP iteration.
|
|
@@ -248,10 +263,14 @@ class GSP:
|
|
|
248
263
|
run (int): Current k-sequence generation level (e.g., 1 for 1-item sequences).
|
|
249
264
|
candidates (List[Tuple]): Candidate sequences generated at this level.
|
|
250
265
|
"""
|
|
251
|
-
logger.info("Run %d: %d candidates filtered to %d.",
|
|
252
|
-
run, len(candidates), len(self.freq_patterns[run - 1]))
|
|
266
|
+
logger.info("Run %d: %d candidates filtered to %d.", run, len(candidates), len(self.freq_patterns[run - 1]))
|
|
253
267
|
|
|
254
|
-
def search(
|
|
268
|
+
def search(
|
|
269
|
+
self,
|
|
270
|
+
min_support: float = 0.2,
|
|
271
|
+
max_k: Optional[int] = None,
|
|
272
|
+
backend: Optional[str] = None,
|
|
273
|
+
) -> List[Dict[Tuple[str, ...], int]]:
|
|
255
274
|
"""
|
|
256
275
|
Execute the Generalized Sequential Pattern (GSP) mining algorithm.
|
|
257
276
|
|
|
@@ -280,9 +299,10 @@ class GSP:
|
|
|
280
299
|
if not 0.0 < min_support <= 1.0:
|
|
281
300
|
raise ValueError("Minimum support must be in the range (0.0, 1.0]")
|
|
282
301
|
|
|
283
|
-
|
|
302
|
+
logger.info(f"Starting GSP algorithm with min_support={min_support}...")
|
|
284
303
|
|
|
285
|
-
|
|
304
|
+
# Convert fractional support to absolute count (ceil to preserve threshold semantics)
|
|
305
|
+
abs_min_support = int(math.ceil(len(self.transactions) * float(min_support)))
|
|
286
306
|
|
|
287
307
|
# the set of frequent 1-sequence: all singleton sequences
|
|
288
308
|
# (k-itemsets/k-sequence = 1) - Initially, every item in DB is a
|
|
@@ -291,7 +311,7 @@ class GSP:
|
|
|
291
311
|
|
|
292
312
|
# scan transactions to collect support count for each candidate
|
|
293
313
|
# sequence & filter
|
|
294
|
-
self.freq_patterns.append(self._support(candidates,
|
|
314
|
+
self.freq_patterns.append(self._support(candidates, abs_min_support, backend=backend))
|
|
295
315
|
|
|
296
316
|
# (k-itemsets/k-sequence = 1)
|
|
297
317
|
k_items = 1
|
|
@@ -299,7 +319,10 @@ class GSP:
|
|
|
299
319
|
self._print_status(k_items, candidates)
|
|
300
320
|
|
|
301
321
|
# repeat until no frequent sequence or no candidate can be found
|
|
302
|
-
|
|
322
|
+
# If max_k is provided, stop generating candidates beyond that length
|
|
323
|
+
while (
|
|
324
|
+
self.freq_patterns[k_items - 1] and k_items + 1 <= self.max_size and (max_k is None or k_items + 1 <= max_k)
|
|
325
|
+
):
|
|
303
326
|
k_items += 1
|
|
304
327
|
|
|
305
328
|
# Generate candidate sets Ck (set of candidate k-sequences) -
|
|
@@ -309,7 +332,7 @@ class GSP:
|
|
|
309
332
|
|
|
310
333
|
# candidate pruning - eliminates candidates who are not potentially
|
|
311
334
|
# frequent (using support as threshold)
|
|
312
|
-
self.freq_patterns.append(self._support(candidates,
|
|
335
|
+
self.freq_patterns.append(self._support(candidates, abs_min_support, backend=backend))
|
|
313
336
|
|
|
314
337
|
self._print_status(k_items, candidates)
|
|
315
338
|
logger.info("GSP algorithm completed.")
|
gsppy/utils.py
CHANGED
|
@@ -20,6 +20,7 @@ Main functionalities:
|
|
|
20
20
|
These utilities are designed to support sequence processing tasks and can be
|
|
21
21
|
adapted to various domains, such as data mining, recommendation systems, and sequence analysis.
|
|
22
22
|
"""
|
|
23
|
+
|
|
23
24
|
from typing import Dict, List, Tuple, Sequence, Generator
|
|
24
25
|
from functools import lru_cache
|
|
25
26
|
from itertools import product
|
|
@@ -39,7 +40,7 @@ def split_into_batches(
|
|
|
39
40
|
Generator[Sequence[Tuple], None, None]: A generator yielding batches of items.
|
|
40
41
|
"""
|
|
41
42
|
for i in range(0, len(items), batch_size):
|
|
42
|
-
yield items[i:i + batch_size]
|
|
43
|
+
yield items[i : i + batch_size]
|
|
43
44
|
|
|
44
45
|
|
|
45
46
|
@lru_cache(maxsize=None)
|
|
@@ -65,12 +66,10 @@ def is_subsequence_in_list(subsequence: Tuple[str, ...], sequence: Tuple[str, ..
|
|
|
65
66
|
return False
|
|
66
67
|
|
|
67
68
|
# Use any to check if any slice matches the sequence
|
|
68
|
-
return any(sequence[i:i + len_sub] == subsequence for i in range(len_seq - len_sub + 1))
|
|
69
|
+
return any(sequence[i : i + len_sub] == subsequence for i in range(len_seq - len_sub + 1))
|
|
69
70
|
|
|
70
71
|
|
|
71
|
-
def generate_candidates_from_previous(
|
|
72
|
-
prev_patterns: Dict[Tuple[str, ...], int]
|
|
73
|
-
) -> List[Tuple[str, ...]]:
|
|
72
|
+
def generate_candidates_from_previous(prev_patterns: Dict[Tuple[str, ...], int]) -> List[Tuple[str, ...]]:
|
|
74
73
|
"""
|
|
75
74
|
Generate joined candidates from the previous level's frequent patterns.
|
|
76
75
|
|
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: gsppy
|
|
3
|
-
Version:
|
|
3
|
+
Version: 3.0.0
|
|
4
4
|
Summary: GSP (Generalized Sequence Pattern) algorithm in Python
|
|
5
5
|
Project-URL: Homepage, https://github.com/jacksonpradolima/gsp-py
|
|
6
6
|
Author-email: Jackson Antonio do Prado Lima <jacksonpradolima@gmail.com>
|
|
7
7
|
Maintainer-email: Jackson Antonio do Prado Lima <jacksonpradolima@gmail.com>
|
|
8
8
|
License: MIT License
|
|
9
9
|
|
|
10
|
-
Copyright (c)
|
|
10
|
+
Copyright (c) 2025 Jackson Antonio do Prado Lima
|
|
11
11
|
|
|
12
12
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
13
13
|
of this software and associated documentation files (the "Software"), to deal
|
|
@@ -32,31 +32,34 @@ Classifier: Intended Audience :: Science/Research
|
|
|
32
32
|
Classifier: License :: OSI Approved :: MIT License
|
|
33
33
|
Classifier: Natural Language :: English
|
|
34
34
|
Classifier: Operating System :: OS Independent
|
|
35
|
-
Classifier: Programming Language :: Python :: 3.8
|
|
36
|
-
Classifier: Programming Language :: Python :: 3.9
|
|
37
35
|
Classifier: Programming Language :: Python :: 3.10
|
|
38
36
|
Classifier: Programming Language :: Python :: 3.11
|
|
39
37
|
Classifier: Programming Language :: Python :: 3.12
|
|
40
38
|
Classifier: Programming Language :: Python :: 3.13
|
|
41
39
|
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
42
40
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
43
|
-
Requires-Python: >=3.
|
|
41
|
+
Requires-Python: >=3.10
|
|
42
|
+
Requires-Dist: click>=8.0.0
|
|
44
43
|
Provides-Extra: dev
|
|
45
|
-
Requires-Dist: cython==3.
|
|
44
|
+
Requires-Dist: cython==3.1.3; extra == 'dev'
|
|
46
45
|
Requires-Dist: hatch==1.14.0; extra == 'dev'
|
|
47
46
|
Requires-Dist: hatchling==1.27.0; extra == 'dev'
|
|
48
|
-
Requires-Dist: mypy==1.
|
|
49
|
-
Requires-Dist: pylint==3.
|
|
50
|
-
Requires-Dist: pyright==1.1.
|
|
47
|
+
Requires-Dist: mypy==1.18.1; extra == 'dev'
|
|
48
|
+
Requires-Dist: pylint==3.2.7; extra == 'dev'
|
|
49
|
+
Requires-Dist: pyright==1.1.405; extra == 'dev'
|
|
51
50
|
Requires-Dist: pytest-benchmark==5.1.0; extra == 'dev'
|
|
52
|
-
Requires-Dist: pytest-cov==
|
|
51
|
+
Requires-Dist: pytest-cov==5.0.0; extra == 'dev'
|
|
53
52
|
Requires-Dist: pytest==8.3.4; extra == 'dev'
|
|
54
|
-
Requires-Dist: ruff==0.
|
|
55
|
-
Requires-Dist: tox==4.
|
|
53
|
+
Requires-Dist: ruff==0.13.0; extra == 'dev'
|
|
54
|
+
Requires-Dist: tox==4.30.2; extra == 'dev'
|
|
55
|
+
Provides-Extra: gpu
|
|
56
|
+
Requires-Dist: cupy<14,>=11; extra == 'gpu'
|
|
57
|
+
Provides-Extra: rust
|
|
58
|
+
Requires-Dist: maturin==1.6.0; extra == 'rust'
|
|
56
59
|
Description-Content-Type: text/markdown
|
|
57
60
|
|
|
58
61
|
[]()
|
|
59
|
-

|
|
60
63
|
[](https://doi.org/10.5281/zenodo.3333987)
|
|
61
64
|
|
|
62
65
|
[](https://pypi.org/project/gsppy/)
|
|
@@ -72,7 +75,7 @@ Description-Content-Type: text/markdown
|
|
|
72
75
|
Sequence Pattern (GSP)** algorithm. Ideal for market basket analysis, temporal mining, and user journey discovery.
|
|
73
76
|
|
|
74
77
|
> [!IMPORTANT]
|
|
75
|
-
> GSP-Py is compatible with Python 3.
|
|
78
|
+
> GSP-Py is compatible with Python 3.10 and later versions!
|
|
76
79
|
|
|
77
80
|
---
|
|
78
81
|
|
|
@@ -130,14 +133,15 @@ GSP-Py can be easily installed from either the **repository** or PyPI.
|
|
|
130
133
|
|
|
131
134
|
### Option 1: Clone the Repository
|
|
132
135
|
|
|
133
|
-
To manually clone the repository and
|
|
136
|
+
To manually clone the repository and set up the environment:
|
|
134
137
|
|
|
135
138
|
```bash
|
|
136
139
|
git clone https://github.com/jacksonpradolima/gsp-py.git
|
|
137
140
|
cd gsp-py
|
|
138
|
-
python setup.py install
|
|
139
141
|
```
|
|
140
142
|
|
|
143
|
+
Refer to the [Developer Installation](#developer-installation) section and run the setup with uv.
|
|
144
|
+
|
|
141
145
|
### Option 2: Install via `pip`
|
|
142
146
|
|
|
143
147
|
Alternatively, install GSP-Py from PyPI with:
|
|
@@ -150,50 +154,228 @@ pip install gsppy
|
|
|
150
154
|
|
|
151
155
|
## 🛠️ Developer Installation
|
|
152
156
|
|
|
153
|
-
This project uses [
|
|
157
|
+
This project now uses [uv](https://github.com/astral-sh/uv) for dependency management and virtual environments.
|
|
154
158
|
|
|
155
|
-
#### 1. Install
|
|
156
|
-
|
|
159
|
+
#### 1. Install uv
|
|
160
|
+
```bash
|
|
161
|
+
curl -Ls https://astral.sh/uv/install.sh | bash
|
|
162
|
+
```
|
|
157
163
|
|
|
164
|
+
Make sure uv is on your PATH (for most Linux setups):
|
|
158
165
|
```bash
|
|
159
|
-
|
|
166
|
+
export PATH="$HOME/.local/bin:$PATH"
|
|
160
167
|
```
|
|
161
168
|
|
|
162
|
-
|
|
169
|
+
#### 2. Set up the project environment
|
|
170
|
+
Create a local virtual environment and install dependencies from uv.lock (single source of truth):
|
|
163
171
|
|
|
164
172
|
```bash
|
|
165
|
-
|
|
173
|
+
uv venv .venv
|
|
174
|
+
uv sync --frozen --extra dev # uses uv.lock
|
|
175
|
+
uv pip install -e .
|
|
166
176
|
```
|
|
167
177
|
|
|
168
|
-
|
|
178
|
+
#### 3. Optional: Enable Rust acceleration
|
|
169
179
|
|
|
180
|
+
Rust acceleration is optional and provides faster support counting using a PyO3 extension. Python fallback remains available.
|
|
181
|
+
|
|
182
|
+
Build the extension locally:
|
|
170
183
|
```bash
|
|
171
|
-
|
|
184
|
+
make rust-build
|
|
172
185
|
```
|
|
173
186
|
|
|
174
|
-
|
|
175
|
-
|
|
187
|
+
Select backend at runtime (auto tries Rust, then falls back to Python):
|
|
188
|
+
```bash
|
|
189
|
+
export GSPPY_BACKEND=rust # or python, or unset for auto
|
|
190
|
+
```
|
|
176
191
|
|
|
192
|
+
Run benchmarks (adjust to your machine):
|
|
177
193
|
```bash
|
|
178
|
-
|
|
194
|
+
make bench-small
|
|
195
|
+
make bench-big # may use significant memory/CPU
|
|
196
|
+
# or customize:
|
|
197
|
+
GSPPY_BACKEND=auto uv run --python .venv/bin/python --no-project \
|
|
198
|
+
python benchmarks/bench_support.py --n_tx 1000000 --tx_len 8 --vocab 50000 --min_support 0.2 --warmup
|
|
179
199
|
```
|
|
180
200
|
|
|
181
|
-
####
|
|
182
|
-
|
|
201
|
+
#### 4. Optional: Enable GPU (CuPy) acceleration
|
|
202
|
+
|
|
203
|
+
GPU acceleration is experimental and currently optimizes singleton (k=1) support counting using CuPy.
|
|
204
|
+
Non-singleton candidates fall back to the Rust/Python backend.
|
|
205
|
+
|
|
206
|
+
Install the optional extra (choose a CuPy build that matches your CUDA/ROCm setup if needed):
|
|
183
207
|
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
208
|
+
```bash
|
|
209
|
+
uv run pip install -e .[gpu]
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
Select the GPU backend at runtime:
|
|
213
|
+
|
|
214
|
+
```bash
|
|
215
|
+
export GSPPY_BACKEND=gpu
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
If a GPU isn't available, an error will be raised when GSPPY_BACKEND=gpu is set. Otherwise, the default "auto" uses CPU.
|
|
219
|
+
|
|
220
|
+
#### 5. Common development tasks
|
|
221
|
+
After the environment is ready, activate it and run tasks with standard tools:
|
|
222
|
+
|
|
223
|
+
```bash
|
|
224
|
+
source .venv/bin/activate
|
|
225
|
+
pytest -n auto
|
|
226
|
+
ruff check .
|
|
227
|
+
pyright
|
|
228
|
+
```
|
|
188
229
|
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
230
|
+
If you prefer, you can also prefix commands with uv without activating:
|
|
231
|
+
|
|
232
|
+
```bash
|
|
233
|
+
uv run pytest -n auto
|
|
234
|
+
uv run ruff check .
|
|
235
|
+
uv run pyright
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
#### 6. Makefile (shortcuts)
|
|
239
|
+
You can use the Makefile to automate common tasks:
|
|
240
|
+
|
|
241
|
+
```bash
|
|
242
|
+
make setup # create .venv with uv and pin Python
|
|
243
|
+
make install # sync deps (from uv.lock) + install project (-e .)
|
|
244
|
+
make test # pytest -n auto
|
|
245
|
+
make lint # ruff check .
|
|
246
|
+
make format # ruff --fix
|
|
247
|
+
make typecheck # pyright (and mypy if configured)
|
|
248
|
+
make pre-commit-install # install the pre-commit hook
|
|
249
|
+
make pre-commit-run # run pre-commit on all files
|
|
250
|
+
|
|
251
|
+
# Rust-specific shortcuts
|
|
252
|
+
make rust-setup # install rustup toolchain
|
|
253
|
+
make rust-build # build PyO3 extension with maturin
|
|
254
|
+
make bench-small # run small benchmark
|
|
255
|
+
make bench-big # run large benchmark
|
|
256
|
+
```
|
|
257
|
+
|
|
258
|
+
> [!NOTE]
|
|
259
|
+
> Tox in this project uses the "tox-uv" plugin. When running `make tox` or `tox`, missing Python interpreters can be provisioned automatically via uv (no need to pre-install all versions). This makes local setup faster.
|
|
192
260
|
|
|
193
261
|
## 💡 Usage
|
|
194
262
|
|
|
195
|
-
The library is designed to be easy to use and integrate with your own projects.
|
|
196
|
-
|
|
263
|
+
The library is designed to be easy to use and integrate with your own projects. You can use GSP-Py either programmatically (Python API) or directly from the command line (CLI).
|
|
264
|
+
|
|
265
|
+
---
|
|
266
|
+
|
|
267
|
+
## 🚦 Using GSP-Py via CLI
|
|
268
|
+
|
|
269
|
+
GSP-Py provides a command-line interface (CLI) for running the Generalized Sequential Pattern algorithm on transactional data. This allows you to mine frequent sequential patterns from JSON or CSV files without writing any code.
|
|
270
|
+
|
|
271
|
+
### Installation
|
|
272
|
+
|
|
273
|
+
First, install GSP-Py (if not already installed):
|
|
274
|
+
|
|
275
|
+
```bash
|
|
276
|
+
pip install gsppy
|
|
277
|
+
```
|
|
278
|
+
|
|
279
|
+
This will make the `gsppy` CLI command available in your environment.
|
|
280
|
+
|
|
281
|
+
### Preparing Your Data
|
|
282
|
+
|
|
283
|
+
Your input file should be either:
|
|
284
|
+
|
|
285
|
+
- **JSON**: A list of transactions, each transaction is a list of items. Example:
|
|
286
|
+
```json
|
|
287
|
+
[
|
|
288
|
+
["Bread", "Milk"],
|
|
289
|
+
["Bread", "Diaper", "Beer", "Eggs"],
|
|
290
|
+
["Milk", "Diaper", "Beer", "Coke"],
|
|
291
|
+
["Bread", "Milk", "Diaper", "Beer"],
|
|
292
|
+
["Bread", "Milk", "Diaper", "Coke"]
|
|
293
|
+
]
|
|
294
|
+
```
|
|
295
|
+
|
|
296
|
+
- **CSV**: Each row is a transaction, items separated by commas. Example:
|
|
297
|
+
```csv
|
|
298
|
+
Bread,Milk
|
|
299
|
+
Bread,Diaper,Beer,Eggs
|
|
300
|
+
Milk,Diaper,Beer,Coke
|
|
301
|
+
Bread,Milk,Diaper,Beer
|
|
302
|
+
Bread,Milk,Diaper,Coke
|
|
303
|
+
```
|
|
304
|
+
|
|
305
|
+
### Running the CLI
|
|
306
|
+
|
|
307
|
+
Use the following command to run GSP-Py on your data:
|
|
308
|
+
|
|
309
|
+
```bash
|
|
310
|
+
gsppy --file path/to/transactions.json --min_support 0.3 --backend auto
|
|
311
|
+
```
|
|
312
|
+
|
|
313
|
+
Or for CSV files:
|
|
314
|
+
|
|
315
|
+
```bash
|
|
316
|
+
gsppy --file path/to/transactions.csv --min_support 0.3 --backend rust
|
|
317
|
+
```
|
|
318
|
+
|
|
319
|
+
#### CLI Options
|
|
320
|
+
|
|
321
|
+
- `--file`: Path to your input file (JSON or CSV). **Required**.
|
|
322
|
+
- `--min_support`: Minimum support threshold as a fraction (e.g., `0.3` for 30%). Default is `0.2`.
|
|
323
|
+
- `--backend`: Backend to use for support counting. One of `auto` (default), `python`, `rust`, or `gpu`.
|
|
324
|
+
- `--verbose`: (Optional) Enable detailed output for debugging.
|
|
325
|
+
|
|
326
|
+
#### Example
|
|
327
|
+
|
|
328
|
+
Suppose you have a file `transactions.json` as shown above. To find patterns with at least 30% support:
|
|
329
|
+
|
|
330
|
+
```bash
|
|
331
|
+
gsppy --file transactions.json --min_support 0.3
|
|
332
|
+
```
|
|
333
|
+
|
|
334
|
+
Sample output:
|
|
335
|
+
|
|
336
|
+
```
|
|
337
|
+
Pre-processing transactions...
|
|
338
|
+
Starting GSP algorithm with min_support=0.3...
|
|
339
|
+
Run 1: 6 candidates filtered to 5.
|
|
340
|
+
Run 2: 20 candidates filtered to 3.
|
|
341
|
+
Run 3: 2 candidates filtered to 2.
|
|
342
|
+
Run 4: 1 candidates filtered to 0.
|
|
343
|
+
GSP algorithm completed.
|
|
344
|
+
Frequent Patterns Found:
|
|
345
|
+
|
|
346
|
+
1-Sequence Patterns:
|
|
347
|
+
Pattern: ('Bread',), Support: 4
|
|
348
|
+
Pattern: ('Milk',), Support: 4
|
|
349
|
+
Pattern: ('Diaper',), Support: 4
|
|
350
|
+
Pattern: ('Beer',), Support: 3
|
|
351
|
+
Pattern: ('Coke',), Support: 2
|
|
352
|
+
|
|
353
|
+
2-Sequence Patterns:
|
|
354
|
+
Pattern: ('Bread', 'Milk'), Support: 3
|
|
355
|
+
Pattern: ('Milk', 'Diaper'), Support: 3
|
|
356
|
+
Pattern: ('Diaper', 'Beer'), Support: 3
|
|
357
|
+
|
|
358
|
+
3-Sequence Patterns:
|
|
359
|
+
Pattern: ('Bread', 'Milk', 'Diaper'), Support: 2
|
|
360
|
+
Pattern: ('Milk', 'Diaper', 'Beer'), Support: 2
|
|
361
|
+
```
|
|
362
|
+
|
|
363
|
+
#### Error Handling
|
|
364
|
+
|
|
365
|
+
- If the file does not exist or is in an unsupported format, a clear error message will be shown.
|
|
366
|
+
- The `min_support` value must be between 0.0 and 1.0 (exclusive of 0.0, inclusive of 1.0).
|
|
367
|
+
|
|
368
|
+
#### Advanced: Verbose Output
|
|
369
|
+
|
|
370
|
+
To see detailed logs for debugging, add the `--verbose` flag:
|
|
371
|
+
|
|
372
|
+
```bash
|
|
373
|
+
gsppy --file transactions.json --min_support 0.3 --verbose
|
|
374
|
+
```
|
|
375
|
+
|
|
376
|
+
---
|
|
377
|
+
|
|
378
|
+
The following example shows how to use GSP-Py programmatically in Python:
|
|
197
379
|
|
|
198
380
|
### Example Input Data
|
|
199
381
|
|
|
@@ -294,11 +476,18 @@ improvement? [Open a discussion or issue!](https://github.com/jacksonpradolima/g
|
|
|
294
476
|
We welcome contributions from the community! If you'd like to help improve GSP-Py, read
|
|
295
477
|
our [CONTRIBUTING.md](CONTRIBUTING.md) guide to get started.
|
|
296
478
|
|
|
297
|
-
Development dependencies (e.g., testing and linting tools) are
|
|
298
|
-
|
|
479
|
+
Development dependencies (e.g., testing and linting tools) are handled via uv.
|
|
480
|
+
To set up and run the main tasks:
|
|
299
481
|
|
|
300
482
|
```bash
|
|
301
|
-
|
|
483
|
+
uv venv .venv
|
|
484
|
+
uv sync --frozen --extra dev
|
|
485
|
+
uv pip install -e .
|
|
486
|
+
|
|
487
|
+
# Run tasks
|
|
488
|
+
uv run pytest -n auto
|
|
489
|
+
uv run ruff check .
|
|
490
|
+
uv run pyright
|
|
302
491
|
```
|
|
303
492
|
|
|
304
493
|
### General Steps:
|
|
@@ -328,7 +517,7 @@ If GSP-Py contributed to your research or project that led to a publication, we
|
|
|
328
517
|
author = {Prado Lima, Jackson Antonio do},
|
|
329
518
|
title = {{GSP-Py - Generalized Sequence Pattern algorithm in Python}},
|
|
330
519
|
month = Dec,
|
|
331
|
-
year =
|
|
520
|
+
year = 2025,
|
|
332
521
|
doi = {10.5281/zenodo.3333987},
|
|
333
522
|
url = {https://doi.org/10.5281/zenodo.3333987}
|
|
334
523
|
}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
gsppy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
gsppy/accelerate.py,sha256=YO3YQFzo2VAC6IXOTnQnOajkZO7SabkieGb1IPgWdSI,10407
|
|
3
|
+
gsppy/cli.py,sha256=wsGoc_utxpRfgCF9vPOAyLDTOJZ8NaiwiUny5VyIYvQ,6567
|
|
4
|
+
gsppy/gsp.py,sha256=GCHFhOu-DyHEPsse_OXzf9IaZoigF8ouRqgn_OsZBvA,14855
|
|
5
|
+
gsppy/utils.py,sha256=YlV0F64lnd2Xymf6XnYr6mMLYWV2f2yjaHkZbAS1Qs0,3362
|
|
6
|
+
gsppy-3.0.0.dist-info/METADATA,sha256=5Q6iWC2tabQyDFjEztrgK4nsOWzz4z21oSXmFvQ0wU8,17670
|
|
7
|
+
gsppy-3.0.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
8
|
+
gsppy-3.0.0.dist-info/entry_points.txt,sha256=smvmcIWk424ARIGKOC_BM42hpT_SptKPcIeqs-8u8lM,41
|
|
9
|
+
gsppy-3.0.0.dist-info/licenses/LICENSE,sha256=AlXanKSqFzo_o-87gp3Qw3XzbmnfxYy7O0xJOcQGWJo,1086
|
|
10
|
+
gsppy-3.0.0.dist-info/RECORD,,
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
MIT License
|
|
2
2
|
|
|
3
|
-
Copyright (c)
|
|
3
|
+
Copyright (c) 2025 Jackson Antonio do Prado Lima
|
|
4
4
|
|
|
5
5
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
6
|
of this software and associated documentation files (the "Software"), to deal
|
gsppy-2.2.0.dist-info/RECORD
DELETED
|
@@ -1,9 +0,0 @@
|
|
|
1
|
-
gsppy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
-
gsppy/cli.py,sha256=YxBL341LJzb6EN-RBkhW3o4ZCexOGiQXq_aRovKccA8,6790
|
|
3
|
-
gsppy/gsp.py,sha256=CUCC1W5GGlGbWkC_td0qDfnSJiuzbWoMapR0qciejw8,13800
|
|
4
|
-
gsppy/utils.py,sha256=gOT3USxmC0MrBnSHOQ8avxghWmjQe59hS4jNQ3eiENQ,3363
|
|
5
|
-
gsppy-2.2.0.dist-info/METADATA,sha256=1Y8LcuU7engLWoCWFIKRwRMNsgkAawnpvX6s1BoXP_8,12485
|
|
6
|
-
gsppy-2.2.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
7
|
-
gsppy-2.2.0.dist-info/entry_points.txt,sha256=smvmcIWk424ARIGKOC_BM42hpT_SptKPcIeqs-8u8lM,41
|
|
8
|
-
gsppy-2.2.0.dist-info/licenses/LICENSE,sha256=co1jy5VZd1wXOPdUC2uk1hn7zsBm6aJNgVmhPOZ47g8,1086
|
|
9
|
-
gsppy-2.2.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|