gsppy 2.2.0__py3-none-any.whl → 3.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gsppy/accelerate.py ADDED
@@ -0,0 +1,269 @@
1
+ """
2
+ Optional acceleration layer for GSP support counting.
3
+
4
+ This module attempts to use a Rust extension for the hot loop
5
+ (support counting via contiguous subsequence search). If the Rust
6
+ module is unavailable, it gracefully falls back to the pure-Python
7
+ implementation.
8
+
9
+ Control backend via env var:
10
+ - GSPPY_BACKEND=rust -> require Rust extension (raise if missing)
11
+ - GSPPY_BACKEND=python -> force Python implementation
12
+ - unset/other -> try Rust first, then fallback to Python
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import os
18
+ from typing import Any, Dict, List, Tuple, Optional, cast
19
+
20
+ from .utils import split_into_batches, is_subsequence_in_list
21
+
22
+ # Optional GPU (CuPy) support
23
+ _gpu_available = False
24
+ try: # pragma: no cover - optional dependency path
25
+ import cupy as _cp_mod # type: ignore[import-not-found]
26
+
27
+ cp = cast(Any, _cp_mod)
28
+
29
+ try:
30
+ _gpu_available = cp.cuda.runtime.getDeviceCount() > 0 # type: ignore[attr-defined]
31
+ except Exception:
32
+ _gpu_available = False
33
+ except Exception: # pragma: no cover - optional dependency path
34
+ cp = None # type: ignore[assignment]
35
+ _gpu_available = False
36
+
37
+ # Simple per-process cache for encoded transactions keyed by the list object's id
38
+ _ENCODED_CACHE: Dict[int, Tuple[List[List[int]], Dict[int, str], Dict[str, int], int]] = {}
39
+
40
+
41
+ def _get_encoded_transactions(
42
+ transactions: List[Tuple[str, ...]],
43
+ ) -> Tuple[List[List[int]], Dict[int, str], Dict[str, int]]:
44
+ """Return encoded transactions using a small in-memory cache.
45
+
46
+ Cache key is the id() of the transactions list and we also track the number of
47
+ transactions to detect trivial changes. This assumes transactions aren't mutated after
48
+ GSP is constructed (which is the common case).
49
+ """
50
+ key = id(transactions)
51
+ cached = _ENCODED_CACHE.get(key)
52
+ if cached is not None:
53
+ enc_tx, inv_vocab, vocab, n_tx = cached
54
+ if n_tx == len(transactions):
55
+ return enc_tx, inv_vocab, vocab
56
+ enc_tx, inv_vocab, vocab = _encode_transactions(transactions)
57
+ _ENCODED_CACHE[key] = (enc_tx, inv_vocab, vocab, len(transactions))
58
+ return enc_tx, inv_vocab, vocab
59
+
60
+
61
+ # Try importing the Rust extension
62
+ _rust_available = False
63
+ _compute_supports_rust: Any = None
64
+ try:
65
+ from _gsppy_rust import compute_supports_py as _compute_supports_rust # type: ignore
66
+
67
+ _rust_available = True
68
+ except Exception:
69
+ _compute_supports_rust = None
70
+ _rust_available = False
71
+
72
+
73
+ def _env_backend() -> str:
74
+ return os.environ.get("GSPPY_BACKEND", "auto").lower()
75
+
76
+
77
+ def _encode_transactions(transactions: List[Tuple[str, ...]]) -> Tuple[List[List[int]], Dict[int, str], Dict[str, int]]:
78
+ """Encode transactions of strings into integer IDs.
79
+
80
+ Parameters:
81
+ transactions: List of transactions where each transaction is a tuple of strings.
82
+
83
+ Returns:
84
+ A tuple of:
85
+ - enc_tx: List[List[int]] encoded transactions
86
+ - inv_vocab: Dict[int, str] mapping back from id to original string
87
+ - vocab: Dict[str, int] mapping from original string to integer id
88
+ """
89
+ vocab: Dict[str, int] = {}
90
+ enc_tx: List[List[int]] = []
91
+ for t in transactions:
92
+ row: List[int] = []
93
+ for s in t:
94
+ if s not in vocab:
95
+ vocab[s] = len(vocab)
96
+ row.append(vocab[s])
97
+ enc_tx.append(row)
98
+ inv_vocab = {v: k for k, v in vocab.items()}
99
+ return enc_tx, inv_vocab, vocab
100
+
101
+
102
+ def _encode_candidates(candidates: List[Tuple[str, ...]], vocab: Dict[str, int]) -> List[List[int]]:
103
+ """Encode candidate patterns using a provided vocabulary mapping."""
104
+ return [[vocab[s] for s in cand] for cand in candidates]
105
+
106
+
107
+ def _support_counts_gpu_singletons(
108
+ enc_tx: List[List[int]],
109
+ cand_ids: List[int],
110
+ min_support_abs: int,
111
+ vocab_size: int,
112
+ ) -> List[Tuple[List[int], int]]:
113
+ """GPU-accelerated support counts for singleton candidates using CuPy.
114
+
115
+ This computes the number of transactions containing each candidate item ID.
116
+ It uniquifies items per transaction on CPU to preserve presence semantics,
117
+ then performs a single bincount on GPU.
118
+ """
119
+ # Ensure one contribution per transaction
120
+ unique_rows: List[List[int]] = [list(set(row)) for row in enc_tx]
121
+ if not unique_rows:
122
+ return []
123
+
124
+ # Flatten to a 1D list of item ids, then move to GPU
125
+ flat: List[int] = [item for row in unique_rows for item in row]
126
+ if not flat:
127
+ return []
128
+
129
+ cp_flat = cp.asarray(flat, dtype=cp.int32) # type: ignore[name-defined]
130
+ counts = cp.bincount(cp_flat, minlength=vocab_size) # type: ignore[attr-defined]
131
+ counts_host: Any = counts.get() # back to host as a NumPy array
132
+
133
+ out: List[Tuple[List[int], int]] = []
134
+ for cid in cand_ids:
135
+ freq = int(counts_host[cid])
136
+ if freq >= min_support_abs:
137
+ out.append(([cid], freq))
138
+ return out
139
+
140
+
141
+ def support_counts_python(
142
+ transactions: List[Tuple[str, ...]],
143
+ candidates: List[Tuple[str, ...]],
144
+ min_support_abs: int,
145
+ batch_size: int = 100,
146
+ ) -> Dict[Tuple[str, ...], int]:
147
+ """Pure-Python fallback for support counting (single-process).
148
+
149
+ Evaluates each candidate pattern's frequency across all transactions
150
+ using the same contiguous-subsequence semantics as the Rust backend.
151
+
152
+ Note: This implementation is single-process and optimized for simplicity.
153
+ Heavy workloads may benefit from the Rust backend.
154
+ """
155
+ # Simple non-multiprocessing version to avoid import cycles.
156
+ results: Dict[Tuple[str, ...], int] = {}
157
+ for batch in split_into_batches(candidates, batch_size):
158
+ for cand in batch:
159
+ freq = sum(1 for t in transactions if is_subsequence_in_list(cand, t))
160
+ if freq >= min_support_abs:
161
+ results[cand] = freq
162
+ return results
163
+
164
+
165
+ def support_counts(
166
+ transactions: List[Tuple[str, ...]],
167
+ candidates: List[Tuple[str, ...]],
168
+ min_support_abs: int,
169
+ batch_size: int = 100,
170
+ backend: Optional[str] = None,
171
+ ) -> Dict[Tuple[str, ...], int]:
172
+ """Choose the best available backend for support counting.
173
+
174
+ Backend selection is controlled by the `backend` argument when provided,
175
+ otherwise by the env var GSPPY_BACKEND:
176
+ - "rust": require Rust extension (raise if missing)
177
+ - "gpu": try GPU path when available (currently singletons optimized),
178
+ fall back to CPU for the rest
179
+ - "python": force pure-Python fallback
180
+ - otherwise: try Rust first and fall back to Python
181
+ """
182
+ backend_sel = (backend or _env_backend()).lower()
183
+
184
+ if backend_sel == "gpu":
185
+ if not _gpu_available:
186
+ raise RuntimeError("GSPPY_BACKEND=gpu but CuPy GPU is not available")
187
+ # Encode once
188
+ enc_tx, inv_vocab, vocab = _get_encoded_transactions(transactions)
189
+ enc_cands = _encode_candidates(candidates, vocab)
190
+
191
+ # Partition candidates into singletons and non-singletons
192
+ singletons: List[Tuple[int, Tuple[str, ...]]] = []
193
+ others: List[Tuple[List[int], Tuple[str, ...]]] = []
194
+ # Pair original and encoded candidates; lengths should match
195
+ assert len(candidates) == len(enc_cands), "Encoded candidates length mismatch"
196
+ for orig, enc in zip(candidates, enc_cands): # noqa: B905 - lengths checked above
197
+ if len(enc) == 1:
198
+ singletons.append((enc[0], orig))
199
+ else:
200
+ others.append((enc, orig))
201
+
202
+ out: Dict[Tuple[str, ...], int] = {}
203
+
204
+ # GPU path for singletons
205
+ if singletons:
206
+ vocab_size = max(vocab.values()) + 1 if vocab else 0
207
+ gpu_res = _support_counts_gpu_singletons(
208
+ enc_tx=enc_tx,
209
+ cand_ids=[cid for cid, _ in singletons],
210
+ min_support_abs=min_support_abs,
211
+ vocab_size=vocab_size,
212
+ )
213
+ # Map back to original strings
214
+ cand_by_id: Dict[int, Tuple[str, ...]] = {cid: orig for cid, orig in singletons}
215
+ for enc_cand, freq in gpu_res:
216
+ cid = enc_cand[0]
217
+ out[cand_by_id[cid]] = int(freq)
218
+
219
+ # Fallback for others (prefer rust when available)
220
+ if others:
221
+ if _rust_available:
222
+ try:
223
+ other_enc = [enc for enc, _ in others]
224
+ res = cast(
225
+ List[Tuple[List[int], int]], _compute_supports_rust(enc_tx, other_enc, int(min_support_abs))
226
+ )
227
+ for enc_cand, freq in res:
228
+ out[tuple(inv_vocab[i] for i in enc_cand)] = int(freq)
229
+ except Exception:
230
+ # fallback to python
231
+ out.update(
232
+ support_counts_python(transactions, [orig for _, orig in others], min_support_abs, batch_size)
233
+ )
234
+ else:
235
+ out.update(
236
+ support_counts_python(transactions, [orig for _, orig in others], min_support_abs, batch_size)
237
+ )
238
+
239
+ return out
240
+
241
+ if backend_sel == "python":
242
+ return support_counts_python(transactions, candidates, min_support_abs, batch_size)
243
+
244
+ if backend_sel == "rust":
245
+ if not _rust_available:
246
+ raise RuntimeError("GSPPY_BACKEND=rust but Rust extension _gsppy_rust is not available")
247
+ # use rust
248
+ enc_tx, inv_vocab, vocab = _get_encoded_transactions(transactions)
249
+ enc_cands = _encode_candidates(candidates, vocab)
250
+ result = cast(List[Tuple[List[int], int]], _compute_supports_rust(enc_tx, enc_cands, int(min_support_abs)))
251
+ out_rust: Dict[Tuple[str, ...], int] = {}
252
+ for enc_cand, freq in result:
253
+ out_rust[tuple(inv_vocab[i] for i in enc_cand)] = int(freq)
254
+ return out_rust
255
+
256
+ # auto: try rust then fallback
257
+ if _rust_available:
258
+ enc_tx, inv_vocab, vocab = _get_encoded_transactions(transactions)
259
+ enc_cands = _encode_candidates(candidates, vocab)
260
+ try:
261
+ result = cast(List[Tuple[List[int], int]], _compute_supports_rust(enc_tx, enc_cands, int(min_support_abs)))
262
+ out2: Dict[Tuple[str, ...], int] = {}
263
+ for enc_cand, freq in result:
264
+ out2[tuple(inv_vocab[i] for i in enc_cand)] = int(freq)
265
+ return out2
266
+ except Exception:
267
+ pass
268
+
269
+ return support_counts_python(transactions, candidates, min_support_abs, batch_size)
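The module above resolves the backend from the GSPPY_BACKEND environment variable or from an explicit `backend` argument. A minimal usage sketch of `support_counts` (illustrative data; assumes gsppy 3.0.0 is installed and forces the always-available Python path):

```python
from gsppy.accelerate import support_counts

# Transactions and candidates are tuples of strings, as expected by the counting backends.
transactions = [("Bread", "Milk"), ("Bread", "Diaper", "Beer"), ("Bread", "Milk", "Diaper")]
candidates = [("Bread",), ("Bread", "Milk")]

# backend="python" forces the pure-Python fallback; "rust"/"gpu" raise if those extras
# are unavailable, and omitting backend (or "auto") tries Rust first, then Python.
counts = support_counts(transactions, candidates, min_support_abs=2, backend="python")
print(counts)  # {('Bread',): 3, ('Bread', 'Milk'): 2}
```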
gsppy/cli.py CHANGED
@@ -27,14 +27,16 @@ Key Features:
27
27
  This CLI empowers users to perform sequential pattern mining on transactional data efficiently through
28
28
  a simple command-line interface.
29
29
  """
30
+
30
31
  import os
31
32
  import csv
32
33
  import sys
33
34
  import json
34
35
  import logging
35
- import argparse
36
36
  from typing import Dict, List, Tuple
37
37
 
38
+ import click
39
+
38
40
  from gsppy.gsp import GSP
39
41
 
40
42
  # Configure logging
@@ -71,7 +73,7 @@ def read_transactions_from_json(file_path: str) -> List[List[str]]:
71
73
  ValueError: If the file cannot be read or does not contain valid JSON.
72
74
  """
73
75
  try:
74
- with open(file_path, 'r', encoding='utf-8') as f:
76
+ with open(file_path, "r", encoding="utf-8") as f:
75
77
  transactions: List[List[str]] = json.load(f)
76
78
  return transactions
77
79
  except Exception as e:
@@ -95,7 +97,7 @@ def read_transactions_from_csv(file_path: str) -> List[List[str]]:
95
97
  """
96
98
  try:
97
99
  transactions: List[List[str]] = []
98
- with open(file_path, newline='', encoding='utf-8') as csvfile:
100
+ with open(file_path, newline="", encoding="utf-8") as csvfile:
99
101
  reader = csv.reader(csvfile)
100
102
  for row in reader:
101
103
  # Check if the row is empty
@@ -138,65 +140,56 @@ def detect_and_read_file(file_path: str) -> List[List[str]]:
138
140
  raise ValueError("Unsupported file format. Please provide a JSON or CSV file.")
139
141
 
140
142
 
141
- def main() -> None:
143
+ # Click-based CLI
144
+ @click.command()
145
+ @click.option(
146
+ "--file",
147
+ "file_path",
148
+ required=True,
149
+ type=click.Path(exists=True),
150
+ help="Path to a JSON or CSV file containing transactions.",
151
+ )
152
+ @click.option(
153
+ "--min_support",
154
+ default=0.2,
155
+ show_default=True,
156
+ type=float,
157
+ help="Minimum support threshold as a fraction of total transactions.",
158
+ )
159
+ @click.option(
160
+ "--backend",
161
+ type=click.Choice(["auto", "python", "rust", "gpu"], case_sensitive=False),
162
+ default="auto",
163
+ show_default=True,
164
+ help="Backend to use for support counting.",
165
+ )
166
+ @click.option("--verbose", is_flag=True, help="Enable verbose output for debugging purposes.")
167
+ def main(file_path: str, min_support: float, backend: str, verbose: bool) -> None:
142
168
  """
143
- Main function to handle CLI input and run the GSP algorithm.
144
-
145
- Arguments:
146
- - `--file` (str): Path to a JSON or CSV file containing transactions.
147
- - `--min_support` (float): Minimum support threshold (default: 0.2).
169
+ Run the GSP algorithm on transactional data from a file.
148
170
  """
149
- parser = argparse.ArgumentParser(
150
- description="GSP (Generalized Sequential Pattern) Algorithm - "
151
- "Find frequent sequential patterns in transactional data."
152
- )
153
-
154
- # Single file argument
155
- parser.add_argument(
156
- '--file',
157
- type=str,
158
- required=True,
159
- help='Path to a JSON or CSV file containing transactions (e.g., [["A", "B"], ["B", "C"]] '
160
- 'or CSV rows per transaction)'
161
- )
162
-
163
- # Minimum support argument
164
- parser.add_argument(
165
- '--min_support',
166
- type=float,
167
- default=0.2,
168
- help="Minimum support threshold as a fraction of total transactions (default: 0.2)"
169
- )
170
-
171
- # Verbose output argument
172
- parser.add_argument(
173
- '--verbose',
174
- action='store_true',
175
- help='Enable verbose output for debugging purposes.'
176
- )
177
-
178
- # Parse arguments
179
- args = parser.parse_args()
180
-
181
- # Setup logging verbosity
182
- setup_logging(args.verbose)
171
+ setup_logging(verbose)
183
172
 
184
173
  # Automatically detect and load transactions
185
174
  try:
186
- transactions = detect_and_read_file(args.file)
175
+ transactions = detect_and_read_file(file_path)
187
176
  except ValueError as e:
188
177
  logger.error(f"Error: {e}")
189
- return
178
+ sys.exit(1)
190
179
 
191
180
  # Check min_support
192
- if args.min_support <= 0.0 or args.min_support > 1.0:
181
+ if min_support <= 0.0 or min_support > 1.0:
193
182
  logger.error("Error: min_support must be in the range (0.0, 1.0].")
194
- return
183
+ sys.exit(1)
184
+
185
+ # Select backend for acceleration layer
186
+ if backend and backend.lower() != "auto":
187
+ os.environ["GSPPY_BACKEND"] = backend.lower()
195
188
 
196
189
  # Initialize and run GSP algorithm
197
190
  try:
198
191
  gsp = GSP(transactions)
199
- patterns: List[Dict[Tuple[str, ...], int]] = gsp.search(min_support=args.min_support)
192
+ patterns: List[Dict[Tuple[str, ...], int]] = gsp.search(min_support=min_support)
200
193
  logger.info("Frequent Patterns Found:")
201
194
  for i, level in enumerate(patterns, start=1):
202
195
  logger.info(f"\n{i}-Sequence Patterns:")
@@ -204,7 +197,8 @@ def main() -> None:
204
197
  logger.info(f"Pattern: {pattern}, Support: {support}")
205
198
  except Exception as e:
206
199
  logger.error(f"Error executing GSP algorithm: {e}")
200
+ sys.exit(1)
207
201
 
208
202
 
209
- if __name__ == '__main__':
203
+ if __name__ == "__main__":
210
204
  main()
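With the switch from argparse to Click, the command can also be exercised programmatically through Click's test runner. A sketch (not part of the package; assumes `click` and gsppy 3.0.0 are installed):

```python
import json

from click.testing import CliRunner

from gsppy.cli import main

runner = CliRunner()
with runner.isolated_filesystem():
    # Write a small JSON dataset so --file passes the exists=True check.
    with open("tx.json", "w", encoding="utf-8") as f:
        json.dump([["Bread", "Milk"], ["Bread", "Diaper", "Beer"], ["Bread", "Milk", "Diaper"]], f)

    result = runner.invoke(main, ["--file", "tx.json", "--min_support", "0.5", "--backend", "python"])
    print(result.exit_code)  # 0 on success; error paths exit with code 1
```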
gsppy/gsp.py CHANGED
@@ -34,11 +34,11 @@ Example Usage:
34
34
  ```python
35
35
  # Define the transactional dataset
36
36
  transactions = [
37
- ['Bread', 'Milk'],
38
- ['Bread', 'Diaper', 'Beer', 'Eggs'],
39
- ['Milk', 'Diaper', 'Beer', 'Coke'],
40
- ['Bread', 'Milk', 'Diaper', 'Beer'],
41
- ['Bread', 'Milk', 'Diaper', 'Coke']
37
+ ["Bread", "Milk"],
38
+ ["Bread", "Diaper", "Beer", "Eggs"],
39
+ ["Milk", "Diaper", "Beer", "Coke"],
40
+ ["Bread", "Milk", "Diaper", "Beer"],
41
+ ["Bread", "Milk", "Diaper", "Coke"],
42
42
  ]
43
43
 
44
44
  # Initialize GSP with the transactional dataset
@@ -84,13 +84,16 @@ Version:
84
84
  --------
85
85
  - Current Version: 2.0
86
86
  """
87
+
88
+ import math
87
89
  import logging
88
90
  import multiprocessing as mp
89
- from typing import Any, Dict, List, Tuple
91
+ from typing import Dict, List, Tuple, Optional
90
92
  from itertools import chain
91
93
  from collections import Counter
92
94
 
93
95
  from gsppy.utils import split_into_batches, is_subsequence_in_list, generate_candidates_from_previous
96
+ from gsppy.accelerate import support_counts as support_counts_accel
94
97
 
95
98
  logger = logging.getLogger(__name__)
96
99
 
@@ -171,14 +174,13 @@ class GSP:
171
174
  self.max_size = max(len(item) for item in raw_transactions)
172
175
  self.transactions: List[Tuple[str, ...]] = [tuple(transaction) for transaction in raw_transactions]
173
176
  counts: Counter[str] = Counter(chain.from_iterable(raw_transactions))
174
- self.unique_candidates: list[tuple[str, Any]] = [(item,) for item in counts.keys()]
177
+ # Start with singleton candidates (1-sequences)
178
+ self.unique_candidates: List[Tuple[str, ...]] = [(item,) for item in counts.keys()]
175
179
  logger.debug("Unique candidates: %s", self.unique_candidates)
176
180
 
177
181
  @staticmethod
178
182
  def _worker_batch(
179
- batch: List[Tuple[str, ...]],
180
- transactions: List[Tuple[str, ...]],
181
- min_support: int
183
+ batch: List[Tuple[str, ...]], transactions: List[Tuple[str, ...]], min_support: int
182
184
  ) -> List[Tuple[Tuple[str, ...], int]]:
183
185
  """
184
186
  Evaluate a batch of candidate sequences to compute their support.
@@ -204,20 +206,15 @@ class GSP:
204
206
  results.append((item, frequency))
205
207
  return results
206
208
 
207
- def _support(
208
- self,
209
- items: List[Tuple[str, ...]], min_support: float = 0, batch_size: int = 100
209
+ def _support_python(
210
+ self, items: List[Tuple[str, ...]], min_support: int = 0, batch_size: int = 100
210
211
  ) -> Dict[Tuple[str, ...], int]:
211
212
  """
212
- Calculate support counts for candidate sequences, using parallel processing.
213
-
214
- To improve efficiency, candidate sequences are processed in parallel batches using the
215
- `multiprocessing` module. Each sequence is checked against transactions, and its support
216
- count is calculated.
213
+ Calculate support counts for candidate sequences using Python multiprocessing.
217
214
 
218
215
  Parameters:
219
216
  items (List[Tuple]): Candidate sequences to evaluate.
220
- min_support (float): Absolute minimum support count required for a sequence to be considered frequent.
217
+ min_support (int): Absolute minimum support count required for a sequence to be considered frequent.
221
218
  batch_size (int): Maximum number of candidates to process per batch.
222
219
 
223
220
  Returns:
@@ -231,12 +228,30 @@ class GSP:
231
228
  with mp.Pool(processes=mp.cpu_count()) as pool:
232
229
  batch_results = pool.starmap(
233
230
  self._worker_batch, # Process a batch at a time
234
- [(batch, self.transactions, min_support) for batch in batches]
231
+ [(batch, self.transactions, min_support) for batch in batches],
235
232
  )
236
233
 
237
234
  # Flatten the list of results and convert to a dictionary
238
235
  return {item: freq for batch in batch_results for item, freq in batch}
239
236
 
237
+ def _support(
238
+ self,
239
+ items: List[Tuple[str, ...]],
240
+ min_support: int = 0,
241
+ batch_size: int = 100,
242
+ backend: Optional[str] = None,
243
+ ) -> Dict[Tuple[str, ...], int]:
244
+ """
245
+ Calculate support counts for candidate sequences using the fastest available backend.
246
+ This will try the Rust extension if available (and configured), otherwise fall back to
247
+ the Python multiprocessing implementation.
248
+ """
249
+ try:
250
+ return support_counts_accel(self.transactions, items, min_support, batch_size, backend=backend)
251
+ except Exception:
252
+ # Fallback to Python implementation on any acceleration failure
253
+ return self._support_python(items, min_support, batch_size)
254
+
240
255
  def _print_status(self, run: int, candidates: List[Tuple[str, ...]]) -> None:
241
256
  """
242
257
  Log progress information for the current GSP iteration.
@@ -248,10 +263,14 @@ class GSP:
248
263
  run (int): Current k-sequence generation level (e.g., 1 for 1-item sequences).
249
264
  candidates (List[Tuple]): Candidate sequences generated at this level.
250
265
  """
251
- logger.info("Run %d: %d candidates filtered to %d.",
252
- run, len(candidates), len(self.freq_patterns[run - 1]))
266
+ logger.info("Run %d: %d candidates filtered to %d.", run, len(candidates), len(self.freq_patterns[run - 1]))
253
267
 
254
- def search(self, min_support: float = 0.2) -> List[Dict[Tuple[str, ...], int]]:
268
+ def search(
269
+ self,
270
+ min_support: float = 0.2,
271
+ max_k: Optional[int] = None,
272
+ backend: Optional[str] = None,
273
+ ) -> List[Dict[Tuple[str, ...], int]]:
255
274
  """
256
275
  Execute the Generalized Sequential Pattern (GSP) mining algorithm.
257
276
 
@@ -280,9 +299,10 @@ class GSP:
280
299
  if not 0.0 < min_support <= 1.0:
281
300
  raise ValueError("Minimum support must be in the range (0.0, 1.0]")
282
301
 
283
- min_support = len(self.transactions) * min_support
302
+ logger.info(f"Starting GSP algorithm with min_support={min_support}...")
284
303
 
285
- logger.info("Starting GSP algorithm with min_support=%.2f...", min_support)
304
+ # Convert fractional support to absolute count (ceil to preserve threshold semantics)
305
+ abs_min_support = int(math.ceil(len(self.transactions) * float(min_support)))
286
306
 
287
307
  # the set of frequent 1-sequence: all singleton sequences
288
308
  # (k-itemsets/k-sequence = 1) - Initially, every item in DB is a
@@ -291,7 +311,7 @@ class GSP:
291
311
 
292
312
  # scan transactions to collect support count for each candidate
293
313
  # sequence & filter
294
- self.freq_patterns.append(self._support(candidates, min_support))
314
+ self.freq_patterns.append(self._support(candidates, abs_min_support, backend=backend))
295
315
 
296
316
  # (k-itemsets/k-sequence = 1)
297
317
  k_items = 1
@@ -299,7 +319,10 @@ class GSP:
299
319
  self._print_status(k_items, candidates)
300
320
 
301
321
  # repeat until no frequent sequence or no candidate can be found
302
- while self.freq_patterns[k_items - 1] and k_items + 1 <= self.max_size:
322
+ # If max_k is provided, stop generating candidates beyond that length
323
+ while (
324
+ self.freq_patterns[k_items - 1] and k_items + 1 <= self.max_size and (max_k is None or k_items + 1 <= max_k)
325
+ ):
303
326
  k_items += 1
304
327
 
305
328
  # Generate candidate sets Ck (set of candidate k-sequences) -
@@ -309,7 +332,7 @@ class GSP:
309
332
 
310
333
  # candidate pruning - eliminates candidates who are not potentially
311
334
  # frequent (using support as threshold)
312
- self.freq_patterns.append(self._support(candidates, min_support))
335
+ self.freq_patterns.append(self._support(candidates, abs_min_support, backend=backend))
313
336
 
314
337
  self._print_status(k_items, candidates)
315
338
  logger.info("GSP algorithm completed.")
gsppy/utils.py CHANGED
@@ -20,6 +20,7 @@ Main functionalities:
20
20
  These utilities are designed to support sequence processing tasks and can be
21
21
  adapted to various domains, such as data mining, recommendation systems, and sequence analysis.
22
22
  """
23
+
23
24
  from typing import Dict, List, Tuple, Sequence, Generator
24
25
  from functools import lru_cache
25
26
  from itertools import product
@@ -39,7 +40,7 @@ def split_into_batches(
39
40
  Generator[Sequence[Tuple], None, None]: A generator yielding batches of items.
40
41
  """
41
42
  for i in range(0, len(items), batch_size):
42
- yield items[i:i + batch_size]
43
+ yield items[i : i + batch_size]
43
44
 
44
45
 
45
46
  @lru_cache(maxsize=None)
@@ -65,12 +66,10 @@ def is_subsequence_in_list(subsequence: Tuple[str, ...], sequence: Tuple[str, ..
65
66
  return False
66
67
 
67
68
  # Use any to check if any slice matches the sequence
68
- return any(sequence[i:i + len_sub] == subsequence for i in range(len_seq - len_sub + 1))
69
+ return any(sequence[i : i + len_sub] == subsequence for i in range(len_seq - len_sub + 1))
69
70
 
70
71
 
71
- def generate_candidates_from_previous(
72
- prev_patterns: Dict[Tuple[str, ...], int]
73
- ) -> List[Tuple[str, ...]]:
72
+ def generate_candidates_from_previous(prev_patterns: Dict[Tuple[str, ...], int]) -> List[Tuple[str, ...]]:
74
73
  """
75
74
  Generate joined candidates from the previous level's frequent patterns.
76
75
 
@@ -1,13 +1,13 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gsppy
3
- Version: 2.2.0
3
+ Version: 3.0.0
4
4
  Summary: GSP (Generalized Sequence Pattern) algorithm in Python
5
5
  Project-URL: Homepage, https://github.com/jacksonpradolima/gsp-py
6
6
  Author-email: Jackson Antonio do Prado Lima <jacksonpradolima@gmail.com>
7
7
  Maintainer-email: Jackson Antonio do Prado Lima <jacksonpradolima@gmail.com>
8
8
  License: MIT License
9
9
 
10
- Copyright (c) 2024 Jackson Antonio do Prado Lima
10
+ Copyright (c) 2025 Jackson Antonio do Prado Lima
11
11
 
12
12
  Permission is hereby granted, free of charge, to any person obtaining a copy
13
13
  of this software and associated documentation files (the "Software"), to deal
@@ -32,31 +32,34 @@ Classifier: Intended Audience :: Science/Research
32
32
  Classifier: License :: OSI Approved :: MIT License
33
33
  Classifier: Natural Language :: English
34
34
  Classifier: Operating System :: OS Independent
35
- Classifier: Programming Language :: Python :: 3.8
36
- Classifier: Programming Language :: Python :: 3.9
37
35
  Classifier: Programming Language :: Python :: 3.10
38
36
  Classifier: Programming Language :: Python :: 3.11
39
37
  Classifier: Programming Language :: Python :: 3.12
40
38
  Classifier: Programming Language :: Python :: 3.13
41
39
  Classifier: Topic :: Scientific/Engineering :: Information Analysis
42
40
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
43
- Requires-Python: >=3.8
41
+ Requires-Python: >=3.10
42
+ Requires-Dist: click>=8.0.0
44
43
  Provides-Extra: dev
45
- Requires-Dist: cython==3.0.11; extra == 'dev'
44
+ Requires-Dist: cython==3.1.3; extra == 'dev'
46
45
  Requires-Dist: hatch==1.14.0; extra == 'dev'
47
46
  Requires-Dist: hatchling==1.27.0; extra == 'dev'
48
- Requires-Dist: mypy==1.14.0; extra == 'dev'
49
- Requires-Dist: pylint==3.3.3; extra == 'dev'
50
- Requires-Dist: pyright==1.1.391; extra == 'dev'
47
+ Requires-Dist: mypy==1.18.1; extra == 'dev'
48
+ Requires-Dist: pylint==3.2.7; extra == 'dev'
49
+ Requires-Dist: pyright==1.1.405; extra == 'dev'
51
50
  Requires-Dist: pytest-benchmark==5.1.0; extra == 'dev'
52
- Requires-Dist: pytest-cov==6.0.0; extra == 'dev'
51
+ Requires-Dist: pytest-cov==5.0.0; extra == 'dev'
53
52
  Requires-Dist: pytest==8.3.4; extra == 'dev'
54
- Requires-Dist: ruff==0.8.4; extra == 'dev'
55
- Requires-Dist: tox==4.23.2; extra == 'dev'
53
+ Requires-Dist: ruff==0.13.0; extra == 'dev'
54
+ Requires-Dist: tox==4.30.2; extra == 'dev'
55
+ Provides-Extra: gpu
56
+ Requires-Dist: cupy<14,>=11; extra == 'gpu'
57
+ Provides-Extra: rust
58
+ Requires-Dist: maturin==1.6.0; extra == 'rust'
56
59
  Description-Content-Type: text/markdown
57
60
 
58
61
  [![PyPI License](https://img.shields.io/pypi/l/gsppy.svg?style=flat-square)]()
59
- ![](https://img.shields.io/badge/python-3.8+-blue.svg)
62
+ ![](https://img.shields.io/badge/python-3.10+-blue.svg)
60
63
  [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3333987.svg)](https://doi.org/10.5281/zenodo.3333987)
61
64
 
62
65
  [![PyPI Downloads](https://img.shields.io/pypi/dm/gsppy.svg?style=flat-square)](https://pypi.org/project/gsppy/)
@@ -72,7 +75,7 @@ Description-Content-Type: text/markdown
72
75
  Sequence Pattern (GSP)** algorithm. Ideal for market basket analysis, temporal mining, and user journey discovery.
73
76
 
74
77
  > [!IMPORTANT]
75
- > GSP-Py is compatible with Python 3.8 and later versions!
78
+ > GSP-Py is compatible with Python 3.10 and later versions!
76
79
 
77
80
  ---
78
81
 
@@ -130,14 +133,15 @@ GSP-Py can be easily installed from either the **repository** or PyPI.
130
133
 
131
134
  ### Option 1: Clone the Repository
132
135
 
133
- To manually clone the repository and install:
136
+ To manually clone the repository and set up the environment:
134
137
 
135
138
  ```bash
136
139
  git clone https://github.com/jacksonpradolima/gsp-py.git
137
140
  cd gsp-py
138
- python setup.py install
139
141
  ```
140
142
 
143
+ Refer to the [Developer Installation](#developer-installation) section and run the setup with uv.
144
+
141
145
  ### Option 2: Install via `pip`
142
146
 
143
147
  Alternatively, install GSP-Py from PyPI with:
@@ -150,50 +154,228 @@ pip install gsppy
150
154
 
151
155
  ## 🛠️ Developer Installation
152
156
 
153
- This project uses [Rye](https://github.com/mitsuhiko/rye) for managing dependencies, running scripts, and setting up the environment. Follow these steps to install and set up Rye for this project:
157
+ This project now uses [uv](https://github.com/astral-sh/uv) for dependency management and virtual environments.
154
158
 
155
- #### 1. Install Rye
156
- Run the following command to install Rye:
159
+ #### 1. Install uv
160
+ ```bash
161
+ curl -Ls https://astral.sh/uv/install.sh | bash
162
+ ```
157
163
 
164
+ Make sure uv is on your PATH (for most Linux setups):
158
165
  ```bash
159
- curl -sSf https://rye.astral.sh/get | bash
166
+ export PATH="$HOME/.local/bin:$PATH"
160
167
  ```
161
168
 
162
- If the `~/.rye/bin` directory is not in your PATH, add the following line to your shell configuration file (e.g., `~/.bashrc`, `~/.zshrc`, etc.):
169
+ #### 2. Set up the project environment
170
+ Create a local virtual environment and install dependencies from uv.lock (single source of truth):
163
171
 
164
172
  ```bash
165
- export PATH="$HOME/.rye/bin:$PATH"
173
+ uv venv .venv
174
+ uv sync --frozen --extra dev # uses uv.lock
175
+ uv pip install -e .
166
176
  ```
167
177
 
168
- Reload your shell configuration file:
178
+ #### 3. Optional: Enable Rust acceleration
169
179
 
180
+ Rust acceleration is optional and provides faster support counting using a PyO3 extension. Python fallback remains available.
181
+
182
+ Build the extension locally:
170
183
  ```bash
171
- source ~/.bashrc # or `source ~/.zshrc`
184
+ make rust-build
172
185
  ```
173
186
 
174
- #### 2. Set Up the Project Environment
175
- To configure the project environment and install its dependencies, run:
187
+ Select backend at runtime (auto tries Rust, then falls back to Python):
188
+ ```bash
189
+ export GSPPY_BACKEND=rust # or python, or unset for auto
190
+ ```
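Instead of the environment variable, the backend can also be chosen per call via the `backend` keyword of `GSP.search()` in 3.0.0 (a sketch; the data is illustrative):

```python
from gsppy.gsp import GSP

gsp = GSP([["Bread", "Milk"], ["Bread", "Diaper", "Beer"], ["Bread", "Milk", "Diaper"]])
# Prefers the Rust extension for this call; GSP falls back to the Python path if it is unavailable.
patterns = gsp.search(min_support=0.5, backend="rust")
```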
176
191
 
192
+ Run benchmarks (adjust to your machine):
177
193
  ```bash
178
- rye sync
194
+ make bench-small
195
+ make bench-big # may use significant memory/CPU
196
+ # or customize:
197
+ GSPPY_BACKEND=auto uv run --python .venv/bin/python --no-project \
198
+ python benchmarks/bench_support.py --n_tx 1000000 --tx_len 8 --vocab 50000 --min_support 0.2 --warmup
179
199
  ```
180
200
 
181
- #### 3. Use Rye Scripts
182
- Once the environment is set up, you can run the following commands to simplify project tasks:
201
+ #### 4. Optional: Enable GPU (CuPy) acceleration
202
+
203
+ GPU acceleration is experimental and currently optimizes singleton (k=1) support counting using CuPy.
204
+ Non-singleton candidates fall back to the Rust/Python backend.
205
+
206
+ Install the optional extra (choose a CuPy build that matches your CUDA/ROCm setup if needed):
183
207
 
184
- - Run tests: `rye run test`
185
- - Format code: `rye run format`
186
- - Lint code: `rye run lint`
187
- - Type-check: `rye run typecheck`
208
+ ```bash
209
+ uv run pip install -e .[gpu]
210
+ ```
211
+
212
+ Select the GPU backend at runtime:
213
+
214
+ ```bash
215
+ export GSPPY_BACKEND=gpu
216
+ ```
217
+
218
+ If a GPU isn't available, an error will be raised when GSPPY_BACKEND=gpu is set. Otherwise, the default "auto" uses CPU.
219
+
220
+ #### 5. Common development tasks
221
+ After the environment is ready, activate it and run tasks with standard tools:
222
+
223
+ ```bash
224
+ source .venv/bin/activate
225
+ pytest -n auto
226
+ ruff check .
227
+ pyright
228
+ ```
188
229
 
189
- #### Notes
190
- - Rye automatically reads dependencies and scripts from the `pyproject.toml` file.
191
- - No need for `requirements.txt`, as Rye manages all dependencies!
230
+ If you prefer, you can also prefix commands with uv without activating:
231
+
232
+ ```bash
233
+ uv run pytest -n auto
234
+ uv run ruff check .
235
+ uv run pyright
236
+ ```
237
+
238
+ #### 6. Makefile (shortcuts)
239
+ You can use the Makefile to automate common tasks:
240
+
241
+ ```bash
242
+ make setup # create .venv with uv and pin Python
243
+ make install # sync deps (from uv.lock) + install project (-e .)
244
+ make test # pytest -n auto
245
+ make lint # ruff check .
246
+ make format # ruff --fix
247
+ make typecheck # pyright (and mypy if configured)
248
+ make pre-commit-install # install the pre-commit hook
249
+ make pre-commit-run # run pre-commit on all files
250
+
251
+ # Rust-specific shortcuts
252
+ make rust-setup # install rustup toolchain
253
+ make rust-build # build PyO3 extension with maturin
254
+ make bench-small # run small benchmark
255
+ make bench-big # run large benchmark
256
+ ```
257
+
258
+ > [!NOTE]
259
+ > Tox in this project uses the "tox-uv" plugin. When running `make tox` or `tox`, missing Python interpreters can be provisioned automatically via uv (no need to pre-install all versions). This makes local setup faster.
192
260
 
193
261
  ## 💡 Usage
194
262
 
195
- The library is designed to be easy to use and integrate with your own projects. Below is an example of how you can
196
- configure and run GSP-Py.
263
+ The library is designed to be easy to use and integrate with your own projects. You can use GSP-Py either programmatically (Python API) or directly from the command line (CLI).
264
+
265
+ ---
266
+
267
+ ## 🚦 Using GSP-Py via CLI
268
+
269
+ GSP-Py provides a command-line interface (CLI) for running the Generalized Sequential Pattern algorithm on transactional data. This allows you to mine frequent sequential patterns from JSON or CSV files without writing any code.
270
+
271
+ ### Installation
272
+
273
+ First, install GSP-Py (if not already installed):
274
+
275
+ ```bash
276
+ pip install gsppy
277
+ ```
278
+
279
+ This will make the `gsppy` CLI command available in your environment.
280
+
281
+ ### Preparing Your Data
282
+
283
+ Your input file should be either:
284
+
285
+ - **JSON**: A list of transactions, each transaction is a list of items. Example:
286
+ ```json
287
+ [
288
+ ["Bread", "Milk"],
289
+ ["Bread", "Diaper", "Beer", "Eggs"],
290
+ ["Milk", "Diaper", "Beer", "Coke"],
291
+ ["Bread", "Milk", "Diaper", "Beer"],
292
+ ["Bread", "Milk", "Diaper", "Coke"]
293
+ ]
294
+ ```
295
+
296
+ - **CSV**: Each row is a transaction, items separated by commas. Example:
297
+ ```csv
298
+ Bread,Milk
299
+ Bread,Diaper,Beer,Eggs
300
+ Milk,Diaper,Beer,Coke
301
+ Bread,Milk,Diaper,Beer
302
+ Bread,Milk,Diaper,Coke
303
+ ```
304
+
305
+ ### Running the CLI
306
+
307
+ Use the following command to run GSP-Py on your data:
308
+
309
+ ```bash
310
+ gsppy --file path/to/transactions.json --min_support 0.3 --backend auto
311
+ ```
312
+
313
+ Or for CSV files:
314
+
315
+ ```bash
316
+ gsppy --file path/to/transactions.csv --min_support 0.3 --backend rust
317
+ ```
318
+
319
+ #### CLI Options
320
+
321
+ - `--file`: Path to your input file (JSON or CSV). **Required**.
322
+ - `--min_support`: Minimum support threshold as a fraction (e.g., `0.3` for 30%). Default is `0.2`.
323
+ - `--backend`: Backend to use for support counting. One of `auto` (default), `python`, `rust`, or `gpu`.
324
+ - `--verbose`: (Optional) Enable detailed output for debugging.
325
+
326
+ #### Example
327
+
328
+ Suppose you have a file `transactions.json` as shown above. To find patterns with at least 30% support:
329
+
330
+ ```bash
331
+ gsppy --file transactions.json --min_support 0.3
332
+ ```
333
+
334
+ Sample output:
335
+
336
+ ```
337
+ Pre-processing transactions...
338
+ Starting GSP algorithm with min_support=0.3...
339
+ Run 1: 6 candidates filtered to 5.
340
+ Run 2: 20 candidates filtered to 3.
341
+ Run 3: 2 candidates filtered to 2.
342
+ Run 4: 1 candidates filtered to 0.
343
+ GSP algorithm completed.
344
+ Frequent Patterns Found:
345
+
346
+ 1-Sequence Patterns:
347
+ Pattern: ('Bread',), Support: 4
348
+ Pattern: ('Milk',), Support: 4
349
+ Pattern: ('Diaper',), Support: 4
350
+ Pattern: ('Beer',), Support: 3
351
+ Pattern: ('Coke',), Support: 2
352
+
353
+ 2-Sequence Patterns:
354
+ Pattern: ('Bread', 'Milk'), Support: 3
355
+ Pattern: ('Milk', 'Diaper'), Support: 3
356
+ Pattern: ('Diaper', 'Beer'), Support: 3
357
+
358
+ 3-Sequence Patterns:
359
+ Pattern: ('Bread', 'Milk', 'Diaper'), Support: 2
360
+ Pattern: ('Milk', 'Diaper', 'Beer'), Support: 2
361
+ ```
362
+
363
+ #### Error Handling
364
+
365
+ - If the file does not exist or is in an unsupported format, a clear error message will be shown.
366
+ - The `min_support` value must be between 0.0 and 1.0 (exclusive of 0.0, inclusive of 1.0).
367
+
368
+ #### Advanced: Verbose Output
369
+
370
+ To see detailed logs for debugging, add the `--verbose` flag:
371
+
372
+ ```bash
373
+ gsppy --file transactions.json --min_support 0.3 --verbose
374
+ ```
375
+
376
+ ---
377
+
378
+ The following example shows how to use GSP-Py programmatically in Python:
197
379
 
198
380
  ### Example Input Data
199
381
 
@@ -294,11 +476,18 @@ improvement? [Open a discussion or issue!](https://github.com/jacksonpradolima/g
294
476
  We welcome contributions from the community! If you'd like to help improve GSP-Py, read
295
477
  our [CONTRIBUTING.md](CONTRIBUTING.md) guide to get started.
296
478
 
297
- Development dependencies (e.g., testing and linting tools) are included in the `dev` category in `setup.py`. To install
298
- these dependencies, run:
479
+ Development dependencies (e.g., testing and linting tools) are handled via uv.
480
+ To set up and run the main tasks:
299
481
 
300
482
  ```bash
301
- pip install .[dev]
483
+ uv venv .venv
484
+ uv sync --frozen --extra dev
485
+ uv pip install -e .
486
+
487
+ # Run tasks
488
+ uv run pytest -n auto
489
+ uv run ruff check .
490
+ uv run pyright
302
491
  ```
303
492
 
304
493
  ### General Steps:
@@ -328,7 +517,7 @@ If GSP-Py contributed to your research or project that led to a publication, we
328
517
  author = {Prado Lima, Jackson Antonio do},
329
518
  title = {{GSP-Py - Generalized Sequence Pattern algorithm in Python}},
330
519
  month = Dec,
331
- year = 2024,
520
+ year = 2025,
332
521
  doi = {10.5281/zenodo.3333987},
333
522
  url = {https://doi.org/10.5281/zenodo.3333987}
334
523
  }
@@ -0,0 +1,10 @@
1
+ gsppy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ gsppy/accelerate.py,sha256=YO3YQFzo2VAC6IXOTnQnOajkZO7SabkieGb1IPgWdSI,10407
3
+ gsppy/cli.py,sha256=wsGoc_utxpRfgCF9vPOAyLDTOJZ8NaiwiUny5VyIYvQ,6567
4
+ gsppy/gsp.py,sha256=GCHFhOu-DyHEPsse_OXzf9IaZoigF8ouRqgn_OsZBvA,14855
5
+ gsppy/utils.py,sha256=YlV0F64lnd2Xymf6XnYr6mMLYWV2f2yjaHkZbAS1Qs0,3362
6
+ gsppy-3.0.0.dist-info/METADATA,sha256=5Q6iWC2tabQyDFjEztrgK4nsOWzz4z21oSXmFvQ0wU8,17670
7
+ gsppy-3.0.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
8
+ gsppy-3.0.0.dist-info/entry_points.txt,sha256=smvmcIWk424ARIGKOC_BM42hpT_SptKPcIeqs-8u8lM,41
9
+ gsppy-3.0.0.dist-info/licenses/LICENSE,sha256=AlXanKSqFzo_o-87gp3Qw3XzbmnfxYy7O0xJOcQGWJo,1086
10
+ gsppy-3.0.0.dist-info/RECORD,,
@@ -1,6 +1,6 @@
1
1
  MIT License
2
2
 
3
- Copyright (c) 2024 Jackson Antonio do Prado Lima
3
+ Copyright (c) 2025 Jackson Antonio do Prado Lima
4
4
 
5
5
  Permission is hereby granted, free of charge, to any person obtaining a copy
6
6
  of this software and associated documentation files (the "Software"), to deal
@@ -1,9 +0,0 @@
1
- gsppy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- gsppy/cli.py,sha256=YxBL341LJzb6EN-RBkhW3o4ZCexOGiQXq_aRovKccA8,6790
3
- gsppy/gsp.py,sha256=CUCC1W5GGlGbWkC_td0qDfnSJiuzbWoMapR0qciejw8,13800
4
- gsppy/utils.py,sha256=gOT3USxmC0MrBnSHOQ8avxghWmjQe59hS4jNQ3eiENQ,3363
5
- gsppy-2.2.0.dist-info/METADATA,sha256=1Y8LcuU7engLWoCWFIKRwRMNsgkAawnpvX6s1BoXP_8,12485
6
- gsppy-2.2.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
7
- gsppy-2.2.0.dist-info/entry_points.txt,sha256=smvmcIWk424ARIGKOC_BM42hpT_SptKPcIeqs-8u8lM,41
8
- gsppy-2.2.0.dist-info/licenses/LICENSE,sha256=co1jy5VZd1wXOPdUC2uk1hn7zsBm6aJNgVmhPOZ47g8,1086
9
- gsppy-2.2.0.dist-info/RECORD,,
File without changes