gsppy 2.3.0__py3-none-any.whl → 3.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gsppy/accelerate.py ADDED
@@ -0,0 +1,269 @@
1
+ """
2
+ Optional acceleration layer for GSP support counting.
3
+
4
+ This module attempts to use a Rust extension for the hot loop
5
+ (support counting via contiguous subsequence search). If the Rust
6
+ module is unavailable, it gracefully falls back to the pure-Python
7
+ implementation.
8
+
9
+ Control backend via env var:
10
+ - GSPPY_BACKEND=rust -> require Rust extension (raise if missing)
11
+ - GSPPY_BACKEND=python -> force Python implementation
12
+ - unset/other -> try Rust first, then fallback to Python
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import os
18
+ from typing import Any, Dict, List, Tuple, Optional, cast
19
+
20
+ from .utils import split_into_batches, is_subsequence_in_list
21
+
22
# Optional GPU (CuPy) support: probe at import time whether CuPy can be
# imported and at least one CUDA device is visible.
_gpu_available = False
try:  # pragma: no cover - optional dependency path
    import cupy as _cp_mod  # type: ignore[import-not-found]
except Exception:  # pragma: no cover - optional dependency path
    cp = None  # type: ignore[assignment]
else:
    cp = cast(Any, _cp_mod)
    try:
        _gpu_available = cp.cuda.runtime.getDeviceCount() > 0  # type: ignore[attr-defined]
    except Exception:
        _gpu_available = False
36
+
37
# Per-process cache of encoded transactions, keyed by id() of the transactions
# list. Besides the encoding we store the transaction count and the first
# transaction: CPython may reuse an id once the original object is collected,
# so a bare id() key alone could serve a stale encoding for a different list.
_ENCODED_CACHE: Dict[
    int, Tuple[List[List[int]], Dict[int, str], Dict[str, int], int, Optional[Tuple[str, ...]]]
] = {}


def _get_encoded_transactions(
    transactions: List[Tuple[str, ...]],
) -> Tuple[List[List[int]], Dict[int, str], Dict[str, int]]:
    """Return encoded transactions using a small in-memory cache.

    Cache key is the id() of the transactions list. A cached entry is trusted
    only when the stored transaction count and first transaction still match,
    which guards against id() reuse after the original list was garbage
    collected. This assumes transactions aren't mutated after GSP is
    constructed (which is the common case).

    Parameters:
        transactions: List of transactions where each transaction is a tuple of strings.

    Returns:
        A tuple of (encoded transactions, id->string mapping, string->id mapping).
    """
    key = id(transactions)
    cached = _ENCODED_CACHE.get(key)
    if cached is not None:
        enc_tx, inv_vocab, vocab, n_tx, first = cached
        # Cheap staleness check: same length and same first transaction.
        if n_tx == len(transactions) and (n_tx == 0 or transactions[0] == first):
            return enc_tx, inv_vocab, vocab
    enc_tx, inv_vocab, vocab = _encode_transactions(transactions)
    _ENCODED_CACHE[key] = (
        enc_tx,
        inv_vocab,
        vocab,
        len(transactions),
        transactions[0] if transactions else None,
    )
    return enc_tx, inv_vocab, vocab
59
+
60
+
61
# Probe for the optional Rust extension that provides the fast support-counting path.
_rust_available = False
_compute_supports_rust: Any = None
try:
    from _gsppy_rust import compute_supports_py as _compute_supports_rust  # type: ignore
except Exception:
    _compute_supports_rust = None
    _rust_available = False
else:
    _rust_available = True
71
+
72
+
73
def _env_backend() -> str:
    """Return the backend name from the GSPPY_BACKEND env var, lowercased ('auto' if unset)."""
    value = os.environ.get("GSPPY_BACKEND", "auto")
    return value.lower()
75
+
76
+
77
def _encode_transactions(transactions: List[Tuple[str, ...]]) -> Tuple[List[List[int]], Dict[int, str], Dict[str, int]]:
    """Encode transactions of strings into integer IDs.

    Items receive consecutive integer ids in first-seen order.

    Parameters:
        transactions: List of transactions where each transaction is a tuple of strings.

    Returns:
        A tuple of:
        - enc_tx: List[List[int]] encoded transactions
        - inv_vocab: Dict[int, str] mapping back from id to original string
        - vocab: Dict[str, int] mapping from original string to integer id
    """
    vocab: Dict[str, int] = {}
    enc_tx: List[List[int]] = []
    for transaction in transactions:
        # setdefault assigns the next free id (len(vocab)) on first sight.
        enc_tx.append([vocab.setdefault(item, len(vocab)) for item in transaction])
    inv_vocab = {idx: item for item, idx in vocab.items()}
    return enc_tx, inv_vocab, vocab
100
+
101
+
102
def _encode_candidates(candidates: List[Tuple[str, ...]], vocab: Dict[str, int]) -> List[List[int]]:
    """Encode candidate patterns using a provided vocabulary mapping.

    Raises KeyError if a candidate contains an item absent from ``vocab``.
    """
    encoded: List[List[int]] = []
    for candidate in candidates:
        encoded.append([vocab[item] for item in candidate])
    return encoded
105
+
106
+
107
def _support_counts_gpu_singletons(
    enc_tx: List[List[int]],
    cand_ids: List[int],
    min_support_abs: int,
    vocab_size: int,
) -> List[Tuple[List[int], int]]:
    """GPU-accelerated support counts for singleton candidates using CuPy.

    Counts, for each candidate item id, how many transactions contain it.
    Items are deduplicated per transaction on the CPU so that each transaction
    contributes at most once, then a single bincount runs on the GPU.

    Parameters:
        enc_tx: Integer-encoded transactions.
        cand_ids: Candidate singleton item ids.
        min_support_abs: Absolute support threshold; candidates below it are dropped.
        vocab_size: Size of the id space (used as bincount minlength).

    Returns:
        List of ([item_id], frequency) pairs meeting the threshold.
    """
    # One contribution per transaction, regardless of repeated items.
    deduped_rows: List[List[int]] = [list(set(row)) for row in enc_tx]
    if not deduped_rows:
        return []

    # Flatten to a single 1D id list before transferring to the device.
    flat_ids: List[int] = [item_id for row in deduped_rows for item_id in row]
    if not flat_ids:
        return []

    device_ids = cp.asarray(flat_ids, dtype=cp.int32)  # type: ignore[name-defined]
    device_counts = cp.bincount(device_ids, minlength=vocab_size)  # type: ignore[attr-defined]
    host_counts: Any = device_counts.get()  # back to host as a NumPy array

    kept: List[Tuple[List[int], int]] = []
    for item_id in cand_ids:
        count = int(host_counts[item_id])
        if count >= min_support_abs:
            kept.append(([item_id], count))
    return kept
139
+
140
+
141
def support_counts_python(
    transactions: List[Tuple[str, ...]],
    candidates: List[Tuple[str, ...]],
    min_support_abs: int,
    batch_size: int = 100,
) -> Dict[Tuple[str, ...], int]:
    """Pure-Python fallback for support counting (single-process).

    Evaluates each candidate pattern's frequency across all transactions
    using the same contiguous-subsequence semantics as the Rust backend.

    Note: This implementation is single-process and optimized for simplicity.
    Heavy workloads may benefit from the Rust backend.
    """
    # Deliberately avoids multiprocessing to prevent import cycles.
    supports: Dict[Tuple[str, ...], int] = {}
    for chunk in split_into_batches(candidates, batch_size):
        for candidate in chunk:
            count = 0
            for transaction in transactions:
                if is_subsequence_in_list(candidate, transaction):
                    count += 1
            if count >= min_support_abs:
                supports[candidate] = count
    return supports
163
+
164
+
165
def support_counts(
    transactions: List[Tuple[str, ...]],
    candidates: List[Tuple[str, ...]],
    min_support_abs: int,
    batch_size: int = 100,
    backend: Optional[str] = None,
) -> Dict[Tuple[str, ...], int]:
    """Choose the best available backend for support counting.

    Backend selection is controlled by the `backend` argument when provided,
    otherwise by the env var GSPPY_BACKEND:
    - "rust": require Rust extension (raise if missing)
    - "gpu": try GPU path when available (currently singletons optimized),
      fall back to CPU for the rest
    - "python": force pure-Python fallback
    - otherwise: try Rust first and fall back to Python
    """
    choice = (backend or _env_backend()).lower()

    if choice == "gpu":
        if not _gpu_available:
            raise RuntimeError("GSPPY_BACKEND=gpu but CuPy GPU is not available")
        # Encode once, then split candidates by length: singletons go through
        # a single GPU bincount, everything else takes the CPU path.
        enc_tx, inv_vocab, vocab = _get_encoded_transactions(transactions)
        enc_cands = _encode_candidates(candidates, vocab)
        assert len(candidates) == len(enc_cands), "Encoded candidates length mismatch"

        singleton_pairs: List[Tuple[int, Tuple[str, ...]]] = []
        longer_pairs: List[Tuple[List[int], Tuple[str, ...]]] = []
        for original, encoded in zip(candidates, enc_cands):  # noqa: B905 - lengths checked above
            if len(encoded) == 1:
                singleton_pairs.append((encoded[0], original))
            else:
                longer_pairs.append((encoded, original))

        merged: Dict[Tuple[str, ...], int] = {}

        # GPU path for singletons.
        if singleton_pairs:
            vocab_size = max(vocab.values()) + 1 if vocab else 0
            gpu_hits = _support_counts_gpu_singletons(
                enc_tx=enc_tx,
                cand_ids=[item_id for item_id, _ in singleton_pairs],
                min_support_abs=min_support_abs,
                vocab_size=vocab_size,
            )
            # Map integer ids back to the original string candidates.
            id_to_candidate: Dict[int, Tuple[str, ...]] = dict(singleton_pairs)
            for encoded, freq in gpu_hits:
                merged[id_to_candidate[encoded[0]]] = int(freq)

        # Longer candidates: prefer Rust, otherwise pure Python.
        if longer_pairs:
            handled = False
            if _rust_available:
                try:
                    rust_res = cast(
                        List[Tuple[List[int], int]],
                        _compute_supports_rust(enc_tx, [enc for enc, _ in longer_pairs], int(min_support_abs)),
                    )
                    for encoded, freq in rust_res:
                        merged[tuple(inv_vocab[i] for i in encoded)] = int(freq)
                    handled = True
                except Exception:
                    handled = False
            if not handled:
                merged.update(
                    support_counts_python(
                        transactions, [orig for _, orig in longer_pairs], min_support_abs, batch_size
                    )
                )

        return merged

    if choice == "python":
        return support_counts_python(transactions, candidates, min_support_abs, batch_size)

    if choice == "rust":
        if not _rust_available:
            raise RuntimeError("GSPPY_BACKEND=rust but Rust extension _gsppy_rust is not available")
        enc_tx, inv_vocab, vocab = _get_encoded_transactions(transactions)
        enc_cands = _encode_candidates(candidates, vocab)
        rust_out = cast(List[Tuple[List[int], int]], _compute_supports_rust(enc_tx, enc_cands, int(min_support_abs)))
        return {tuple(inv_vocab[i] for i in encoded): int(freq) for encoded, freq in rust_out}

    # auto: try Rust first, silently fall back to Python on any failure.
    if _rust_available:
        enc_tx, inv_vocab, vocab = _get_encoded_transactions(transactions)
        enc_cands = _encode_candidates(candidates, vocab)
        try:
            auto_out = cast(
                List[Tuple[List[int], int]], _compute_supports_rust(enc_tx, enc_cands, int(min_support_abs))
            )
            return {tuple(inv_vocab[i] for i in encoded): int(freq) for encoded, freq in auto_out}
        except Exception:
            pass

    return support_counts_python(transactions, candidates, min_support_abs, batch_size)
gsppy/cli.py CHANGED
@@ -27,14 +27,16 @@ Key Features:
27
27
  This CLI empowers users to perform sequential pattern mining on transactional data efficiently through
28
28
  a simple command-line interface.
29
29
  """
30
+
30
31
  import os
31
32
  import csv
32
33
  import sys
33
34
  import json
34
35
  import logging
35
- import argparse
36
36
  from typing import Dict, List, Tuple
37
37
 
38
+ import click
39
+
38
40
  from gsppy.gsp import GSP
39
41
 
40
42
  # Configure logging
@@ -71,7 +73,7 @@ def read_transactions_from_json(file_path: str) -> List[List[str]]:
71
73
  ValueError: If the file cannot be read or does not contain valid JSON.
72
74
  """
73
75
  try:
74
- with open(file_path, 'r', encoding='utf-8') as f:
76
+ with open(file_path, "r", encoding="utf-8") as f:
75
77
  transactions: List[List[str]] = json.load(f)
76
78
  return transactions
77
79
  except Exception as e:
@@ -95,7 +97,7 @@ def read_transactions_from_csv(file_path: str) -> List[List[str]]:
95
97
  """
96
98
  try:
97
99
  transactions: List[List[str]] = []
98
- with open(file_path, newline='', encoding='utf-8') as csvfile:
100
+ with open(file_path, newline="", encoding="utf-8") as csvfile:
99
101
  reader = csv.reader(csvfile)
100
102
  for row in reader:
101
103
  # Check if the row is empty
@@ -138,65 +140,56 @@ def detect_and_read_file(file_path: str) -> List[List[str]]:
138
140
  raise ValueError("Unsupported file format. Please provide a JSON or CSV file.")
139
141
 
140
142
 
141
- def main() -> None:
143
+ # Click-based CLI
144
+ @click.command()
145
+ @click.option(
146
+ "--file",
147
+ "file_path",
148
+ required=True,
149
+ type=click.Path(exists=True),
150
+ help="Path to a JSON or CSV file containing transactions.",
151
+ )
152
+ @click.option(
153
+ "--min_support",
154
+ default=0.2,
155
+ show_default=True,
156
+ type=float,
157
+ help="Minimum support threshold as a fraction of total transactions.",
158
+ )
159
+ @click.option(
160
+ "--backend",
161
+ type=click.Choice(["auto", "python", "rust", "gpu"], case_sensitive=False),
162
+ default="auto",
163
+ show_default=True,
164
+ help="Backend to use for support counting.",
165
+ )
166
+ @click.option("--verbose", is_flag=True, help="Enable verbose output for debugging purposes.")
167
+ def main(file_path: str, min_support: float, backend: str, verbose: bool) -> None:
142
168
  """
143
- Main function to handle CLI input and run the GSP algorithm.
144
-
145
- Arguments:
146
- - `--file` (str): Path to a JSON or CSV file containing transactions.
147
- - `--min_support` (float): Minimum support threshold (default: 0.2).
169
+ Run the GSP algorithm on transactional data from a file.
148
170
  """
149
- parser = argparse.ArgumentParser(
150
- description="GSP (Generalized Sequential Pattern) Algorithm - "
151
- "Find frequent sequential patterns in transactional data."
152
- )
153
-
154
- # Single file argument
155
- parser.add_argument(
156
- '--file',
157
- type=str,
158
- required=True,
159
- help='Path to a JSON or CSV file containing transactions (e.g., [["A", "B"], ["B", "C"]] '
160
- 'or CSV rows per transaction)'
161
- )
162
-
163
- # Minimum support argument
164
- parser.add_argument(
165
- '--min_support',
166
- type=float,
167
- default=0.2,
168
- help="Minimum support threshold as a fraction of total transactions (default: 0.2)"
169
- )
170
-
171
- # Verbose output argument
172
- parser.add_argument(
173
- '--verbose',
174
- action='store_true',
175
- help='Enable verbose output for debugging purposes.'
176
- )
177
-
178
- # Parse arguments
179
- args = parser.parse_args()
180
-
181
- # Setup logging verbosity
182
- setup_logging(args.verbose)
171
+ setup_logging(verbose)
183
172
 
184
173
  # Automatically detect and load transactions
185
174
  try:
186
- transactions = detect_and_read_file(args.file)
175
+ transactions = detect_and_read_file(file_path)
187
176
  except ValueError as e:
188
177
  logger.error(f"Error: {e}")
189
- return
178
+ sys.exit(1)
190
179
 
191
180
  # Check min_support
192
- if args.min_support <= 0.0 or args.min_support > 1.0:
181
+ if min_support <= 0.0 or min_support > 1.0:
193
182
  logger.error("Error: min_support must be in the range (0.0, 1.0].")
194
- return
183
+ sys.exit(1)
184
+
185
+ # Select backend for acceleration layer
186
+ if backend and backend.lower() != "auto":
187
+ os.environ["GSPPY_BACKEND"] = backend.lower()
195
188
 
196
189
  # Initialize and run GSP algorithm
197
190
  try:
198
191
  gsp = GSP(transactions)
199
- patterns: List[Dict[Tuple[str, ...], int]] = gsp.search(min_support=args.min_support)
192
+ patterns: List[Dict[Tuple[str, ...], int]] = gsp.search(min_support=min_support)
200
193
  logger.info("Frequent Patterns Found:")
201
194
  for i, level in enumerate(patterns, start=1):
202
195
  logger.info(f"\n{i}-Sequence Patterns:")
@@ -204,7 +197,8 @@ def main() -> None:
204
197
  logger.info(f"Pattern: {pattern}, Support: {support}")
205
198
  except Exception as e:
206
199
  logger.error(f"Error executing GSP algorithm: {e}")
200
+ sys.exit(1)
207
201
 
208
202
 
209
- if __name__ == '__main__':
203
+ if __name__ == "__main__":
210
204
  main()
gsppy/gsp.py CHANGED
@@ -34,11 +34,11 @@ Example Usage:
34
34
  ```python
35
35
  # Define the transactional dataset
36
36
  transactions = [
37
- ['Bread', 'Milk'],
38
- ['Bread', 'Diaper', 'Beer', 'Eggs'],
39
- ['Milk', 'Diaper', 'Beer', 'Coke'],
40
- ['Bread', 'Milk', 'Diaper', 'Beer'],
41
- ['Bread', 'Milk', 'Diaper', 'Coke']
37
+ ["Bread", "Milk"],
38
+ ["Bread", "Diaper", "Beer", "Eggs"],
39
+ ["Milk", "Diaper", "Beer", "Coke"],
40
+ ["Bread", "Milk", "Diaper", "Beer"],
41
+ ["Bread", "Milk", "Diaper", "Coke"],
42
42
  ]
43
43
 
44
44
  # Initialize GSP with the transactional dataset
@@ -84,13 +84,16 @@ Version:
84
84
  --------
85
85
  - Current Version: 2.0
86
86
  """
87
+
88
+ import math
87
89
  import logging
88
90
  import multiprocessing as mp
89
- from typing import Any, Dict, List, Tuple
91
+ from typing import Dict, List, Tuple, Optional
90
92
  from itertools import chain
91
93
  from collections import Counter
92
94
 
93
95
  from gsppy.utils import split_into_batches, is_subsequence_in_list, generate_candidates_from_previous
96
+ from gsppy.accelerate import support_counts as support_counts_accel
94
97
 
95
98
  logger = logging.getLogger(__name__)
96
99
 
@@ -171,14 +174,13 @@ class GSP:
171
174
  self.max_size = max(len(item) for item in raw_transactions)
172
175
  self.transactions: List[Tuple[str, ...]] = [tuple(transaction) for transaction in raw_transactions]
173
176
  counts: Counter[str] = Counter(chain.from_iterable(raw_transactions))
174
- self.unique_candidates: list[tuple[str, Any]] = [(item,) for item in counts.keys()]
177
+ # Start with singleton candidates (1-sequences)
178
+ self.unique_candidates: List[Tuple[str, ...]] = [(item,) for item in counts.keys()]
175
179
  logger.debug("Unique candidates: %s", self.unique_candidates)
176
180
 
177
181
  @staticmethod
178
182
  def _worker_batch(
179
- batch: List[Tuple[str, ...]],
180
- transactions: List[Tuple[str, ...]],
181
- min_support: int
183
+ batch: List[Tuple[str, ...]], transactions: List[Tuple[str, ...]], min_support: int
182
184
  ) -> List[Tuple[Tuple[str, ...], int]]:
183
185
  """
184
186
  Evaluate a batch of candidate sequences to compute their support.
@@ -204,20 +206,15 @@ class GSP:
204
206
  results.append((item, frequency))
205
207
  return results
206
208
 
207
- def _support(
208
- self,
209
- items: List[Tuple[str, ...]], min_support: float = 0, batch_size: int = 100
209
+ def _support_python(
210
+ self, items: List[Tuple[str, ...]], min_support: int = 0, batch_size: int = 100
210
211
  ) -> Dict[Tuple[str, ...], int]:
211
212
  """
212
- Calculate support counts for candidate sequences, using parallel processing.
213
-
214
- To improve efficiency, candidate sequences are processed in parallel batches using the
215
- `multiprocessing` module. Each sequence is checked against transactions, and its support
216
- count is calculated.
213
+ Calculate support counts for candidate sequences using Python multiprocessing.
217
214
 
218
215
  Parameters:
219
216
  items (List[Tuple]): Candidate sequences to evaluate.
220
- min_support (float): Absolute minimum support count required for a sequence to be considered frequent.
217
+ min_support (int): Absolute minimum support count required for a sequence to be considered frequent.
221
218
  batch_size (int): Maximum number of candidates to process per batch.
222
219
 
223
220
  Returns:
@@ -231,12 +228,30 @@ class GSP:
231
228
  with mp.Pool(processes=mp.cpu_count()) as pool:
232
229
  batch_results = pool.starmap(
233
230
  self._worker_batch, # Process a batch at a time
234
- [(batch, self.transactions, min_support) for batch in batches]
231
+ [(batch, self.transactions, min_support) for batch in batches],
235
232
  )
236
233
 
237
234
  # Flatten the list of results and convert to a dictionary
238
235
  return {item: freq for batch in batch_results for item, freq in batch}
239
236
 
237
+ def _support(
238
+ self,
239
+ items: List[Tuple[str, ...]],
240
+ min_support: int = 0,
241
+ batch_size: int = 100,
242
+ backend: Optional[str] = None,
243
+ ) -> Dict[Tuple[str, ...], int]:
244
+ """
245
+ Calculate support counts for candidate sequences using the fastest available backend.
246
+ This will try the Rust extension if available (and configured), otherwise fall back to
247
+ the Python multiprocessing implementation.
248
+ """
249
+ try:
250
+ return support_counts_accel(self.transactions, items, min_support, batch_size, backend=backend)
251
+ except Exception:
252
+ # Fallback to Python implementation on any acceleration failure
253
+ return self._support_python(items, min_support, batch_size)
254
+
240
255
  def _print_status(self, run: int, candidates: List[Tuple[str, ...]]) -> None:
241
256
  """
242
257
  Log progress information for the current GSP iteration.
@@ -248,10 +263,14 @@ class GSP:
248
263
  run (int): Current k-sequence generation level (e.g., 1 for 1-item sequences).
249
264
  candidates (List[Tuple]): Candidate sequences generated at this level.
250
265
  """
251
- logger.info("Run %d: %d candidates filtered to %d.",
252
- run, len(candidates), len(self.freq_patterns[run - 1]))
266
+ logger.info("Run %d: %d candidates filtered to %d.", run, len(candidates), len(self.freq_patterns[run - 1]))
253
267
 
254
- def search(self, min_support: float = 0.2) -> List[Dict[Tuple[str, ...], int]]:
268
+ def search(
269
+ self,
270
+ min_support: float = 0.2,
271
+ max_k: Optional[int] = None,
272
+ backend: Optional[str] = None,
273
+ ) -> List[Dict[Tuple[str, ...], int]]:
255
274
  """
256
275
  Execute the Generalized Sequential Pattern (GSP) mining algorithm.
257
276
 
@@ -280,9 +299,10 @@ class GSP:
280
299
  if not 0.0 < min_support <= 1.0:
281
300
  raise ValueError("Minimum support must be in the range (0.0, 1.0]")
282
301
 
283
- min_support = len(self.transactions) * min_support
302
+ logger.info(f"Starting GSP algorithm with min_support={min_support}...")
284
303
 
285
- logger.info("Starting GSP algorithm with min_support=%.2f...", min_support)
304
+ # Convert fractional support to absolute count (ceil to preserve threshold semantics)
305
+ abs_min_support = int(math.ceil(len(self.transactions) * float(min_support)))
286
306
 
287
307
  # the set of frequent 1-sequence: all singleton sequences
288
308
  # (k-itemsets/k-sequence = 1) - Initially, every item in DB is a
@@ -291,7 +311,7 @@ class GSP:
291
311
 
292
312
  # scan transactions to collect support count for each candidate
293
313
  # sequence & filter
294
- self.freq_patterns.append(self._support(candidates, min_support))
314
+ self.freq_patterns.append(self._support(candidates, abs_min_support, backend=backend))
295
315
 
296
316
  # (k-itemsets/k-sequence = 1)
297
317
  k_items = 1
@@ -299,7 +319,10 @@ class GSP:
299
319
  self._print_status(k_items, candidates)
300
320
 
301
321
  # repeat until no frequent sequence or no candidate can be found
302
- while self.freq_patterns[k_items - 1] and k_items + 1 <= self.max_size:
322
+ # If max_k is provided, stop generating candidates beyond that length
323
+ while (
324
+ self.freq_patterns[k_items - 1] and k_items + 1 <= self.max_size and (max_k is None or k_items + 1 <= max_k)
325
+ ):
303
326
  k_items += 1
304
327
 
305
328
  # Generate candidate sets Ck (set of candidate k-sequences) -
@@ -309,7 +332,7 @@ class GSP:
309
332
 
310
333
  # candidate pruning - eliminates candidates who are not potentially
311
334
  # frequent (using support as threshold)
312
- self.freq_patterns.append(self._support(candidates, min_support))
335
+ self.freq_patterns.append(self._support(candidates, abs_min_support, backend=backend))
313
336
 
314
337
  self._print_status(k_items, candidates)
315
338
  logger.info("GSP algorithm completed.")
gsppy/utils.py CHANGED
@@ -20,6 +20,7 @@ Main functionalities:
20
20
  These utilities are designed to support sequence processing tasks and can be
21
21
  adapted to various domains, such as data mining, recommendation systems, and sequence analysis.
22
22
  """
23
+
23
24
  from typing import Dict, List, Tuple, Sequence, Generator
24
25
  from functools import lru_cache
25
26
  from itertools import product
@@ -39,7 +40,7 @@ def split_into_batches(
39
40
  Generator[Sequence[Tuple], None, None]: A generator yielding batches of items.
40
41
  """
41
42
  for i in range(0, len(items), batch_size):
42
- yield items[i:i + batch_size]
43
+ yield items[i : i + batch_size]
43
44
 
44
45
 
45
46
  @lru_cache(maxsize=None)
@@ -65,12 +66,10 @@ def is_subsequence_in_list(subsequence: Tuple[str, ...], sequence: Tuple[str, ..
65
66
  return False
66
67
 
67
68
  # Use any to check if any slice matches the sequence
68
- return any(sequence[i:i + len_sub] == subsequence for i in range(len_seq - len_sub + 1))
69
+ return any(sequence[i : i + len_sub] == subsequence for i in range(len_seq - len_sub + 1))
69
70
 
70
71
 
71
- def generate_candidates_from_previous(
72
- prev_patterns: Dict[Tuple[str, ...], int]
73
- ) -> List[Tuple[str, ...]]:
72
+ def generate_candidates_from_previous(prev_patterns: Dict[Tuple[str, ...], int]) -> List[Tuple[str, ...]]:
74
73
  """
75
74
  Generate joined candidates from the previous level's frequent patterns.
76
75
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gsppy
3
- Version: 2.3.0
3
+ Version: 3.0.0
4
4
  Summary: GSP (Generalized Sequence Pattern) algorithm in Python
5
5
  Project-URL: Homepage, https://github.com/jacksonpradolima/gsp-py
6
6
  Author-email: Jackson Antonio do Prado Lima <jacksonpradolima@gmail.com>
@@ -32,31 +32,34 @@ Classifier: Intended Audience :: Science/Research
32
32
  Classifier: License :: OSI Approved :: MIT License
33
33
  Classifier: Natural Language :: English
34
34
  Classifier: Operating System :: OS Independent
35
- Classifier: Programming Language :: Python :: 3.8
36
- Classifier: Programming Language :: Python :: 3.9
37
35
  Classifier: Programming Language :: Python :: 3.10
38
36
  Classifier: Programming Language :: Python :: 3.11
39
37
  Classifier: Programming Language :: Python :: 3.12
40
38
  Classifier: Programming Language :: Python :: 3.13
41
39
  Classifier: Topic :: Scientific/Engineering :: Information Analysis
42
40
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
43
- Requires-Python: >=3.8
41
+ Requires-Python: >=3.10
42
+ Requires-Dist: click>=8.0.0
44
43
  Provides-Extra: dev
45
- Requires-Dist: cython==3.0.11; extra == 'dev'
44
+ Requires-Dist: cython==3.1.3; extra == 'dev'
46
45
  Requires-Dist: hatch==1.14.0; extra == 'dev'
47
46
  Requires-Dist: hatchling==1.27.0; extra == 'dev'
48
- Requires-Dist: mypy==1.14.1; extra == 'dev'
49
- Requires-Dist: pylint==3.3.3; extra == 'dev'
50
- Requires-Dist: pyright==1.1.391; extra == 'dev'
47
+ Requires-Dist: mypy==1.18.1; extra == 'dev'
48
+ Requires-Dist: pylint==3.2.7; extra == 'dev'
49
+ Requires-Dist: pyright==1.1.405; extra == 'dev'
51
50
  Requires-Dist: pytest-benchmark==5.1.0; extra == 'dev'
52
- Requires-Dist: pytest-cov==6.0.0; extra == 'dev'
51
+ Requires-Dist: pytest-cov==5.0.0; extra == 'dev'
53
52
  Requires-Dist: pytest==8.3.4; extra == 'dev'
54
- Requires-Dist: ruff==0.8.5; extra == 'dev'
55
- Requires-Dist: tox==4.23.2; extra == 'dev'
53
+ Requires-Dist: ruff==0.13.0; extra == 'dev'
54
+ Requires-Dist: tox==4.30.2; extra == 'dev'
55
+ Provides-Extra: gpu
56
+ Requires-Dist: cupy<14,>=11; extra == 'gpu'
57
+ Provides-Extra: rust
58
+ Requires-Dist: maturin==1.6.0; extra == 'rust'
56
59
  Description-Content-Type: text/markdown
57
60
 
58
61
  [![PyPI License](https://img.shields.io/pypi/l/gsppy.svg?style=flat-square)]()
59
- ![](https://img.shields.io/badge/python-3.8+-blue.svg)
62
+ ![](https://img.shields.io/badge/python-3.10+-blue.svg)
60
63
  [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3333987.svg)](https://doi.org/10.5281/zenodo.3333987)
61
64
 
62
65
  [![PyPI Downloads](https://img.shields.io/pypi/dm/gsppy.svg?style=flat-square)](https://pypi.org/project/gsppy/)
@@ -72,7 +75,7 @@ Description-Content-Type: text/markdown
72
75
  Sequence Pattern (GSP)** algorithm. Ideal for market basket analysis, temporal mining, and user journey discovery.
73
76
 
74
77
  > [!IMPORTANT]
75
- > GSP-Py is compatible with Python 3.8 and later versions!
78
+ > GSP-Py is compatible with Python 3.10 and later versions!
76
79
 
77
80
  ---
78
81
 
@@ -137,11 +140,7 @@ git clone https://github.com/jacksonpradolima/gsp-py.git
137
140
  cd gsp-py
138
141
  ```
139
142
 
140
- Refer to the [Developer Installation](#developer-installation) section and run:
141
-
142
- ```bash
143
- rye sync
144
- ```
143
+ Refer to the [Developer Installation](#developer-installation) section and run the setup with uv.
145
144
 
146
145
  ### Option 2: Install via `pip`
147
146
 
@@ -155,52 +154,228 @@ pip install gsppy
155
154
 
156
155
  ## 🛠️ Developer Installation
157
156
 
158
- This project uses [Rye](https://github.com/mitsuhiko/rye) for managing dependencies, running scripts, and setting up the environment. Follow these steps to install and set up Rye for this project:
157
+ This project now uses [uv](https://github.com/astral-sh/uv) for dependency management and virtual environments.
158
+
159
+ #### 1. Install uv
160
+ ```bash
161
+ curl -Ls https://astral.sh/uv/install.sh | bash
162
+ ```
163
+
164
+ Make sure uv is on your PATH (for most Linux setups):
165
+ ```bash
166
+ export PATH="$HOME/.local/bin:$PATH"
167
+ ```
168
+
169
+ #### 2. Set up the project environment
170
+ Create a local virtual environment and install dependencies from uv.lock (single source of truth):
159
171
 
160
- #### 1. Install Rye
161
- Run the following command to install Rye:
172
+ ```bash
173
+ uv venv .venv
174
+ uv sync --frozen --extra dev # uses uv.lock
175
+ uv pip install -e .
176
+ ```
177
+
178
+ #### 3. Optional: Enable Rust acceleration
162
179
 
180
+ Rust acceleration is optional and provides faster support counting using a PyO3 extension. Python fallback remains available.
181
+
182
+ Build the extension locally:
163
183
  ```bash
164
- curl -sSf https://rye.astral.sh/get | bash
184
+ make rust-build
165
185
  ```
166
186
 
167
- If the `~/.rye/bin` directory is not in your PATH, add the following line to your shell configuration file (e.g., `~/.bashrc`, `~/.zshrc`, etc.):
187
+ Select backend at runtime (auto tries Rust, then falls back to Python):
188
+ ```bash
189
+ export GSPPY_BACKEND=rust # or python, or unset for auto
190
+ ```
168
191
 
192
+ Run benchmarks (adjust to your machine):
169
193
  ```bash
170
- export PATH="$HOME/.rye/bin:$PATH"
194
+ make bench-small
195
+ make bench-big # may use significant memory/CPU
196
+ # or customize:
197
+ GSPPY_BACKEND=auto uv run --python .venv/bin/python --no-project \
198
+ python benchmarks/bench_support.py --n_tx 1000000 --tx_len 8 --vocab 50000 --min_support 0.2 --warmup
171
199
  ```
172
200
 
173
- Reload your shell configuration file:
201
+ #### 4. Optional: Enable GPU (CuPy) acceleration
202
+
203
+ GPU acceleration is experimental and currently optimizes singleton (k=1) support counting using CuPy.
204
+ Non-singleton candidates fall back to the Rust/Python backend.
205
+
206
+ Install the optional extra (choose a CuPy build that matches your CUDA/ROCm setup if needed):
174
207
 
175
208
  ```bash
176
- source ~/.bashrc # or `source ~/.zshrc`
209
+ uv run pip install -e .[gpu]
177
210
  ```
178
211
 
179
- #### 2. Set Up the Project Environment
180
- To configure the project environment and install its dependencies, run:
212
+ Select the GPU backend at runtime:
181
213
 
182
214
  ```bash
183
- rye sync
215
+ export GSPPY_BACKEND=gpu
184
216
  ```
185
217
 
186
- #### 3. Use Rye Scripts
187
- Once the environment is set up, you can run the following commands to simplify project tasks:
218
+ If a GPU isn't available, an error will be raised when GSPPY_BACKEND=gpu is set. Otherwise, the default "auto" uses CPU.
188
219
 
189
- - Run tests (in parallel): `rye run test`
190
- - Format code: `rye run format`
191
- - Lint code: `rye run lint`
192
- - Type-check: `rye run typecheck`
193
- - Add new dependencies: `rye add <package-name>`
194
- - Add new dependency to dev dependencies: `rye add --dev <package-name>`
220
+ #### 5. Common development tasks
221
+ After the environment is ready, activate it and run tasks with standard tools:
195
222
 
196
- #### Notes
197
- - Rye automatically reads dependencies and scripts from the `pyproject.toml` file.
198
- - No need for `requirements.txt`, as Rye manages all dependencies!
223
+ ```bash
224
+ source .venv/bin/activate
225
+ pytest -n auto
226
+ ruff check .
227
+ pyright
228
+ ```
229
+
230
+ If you prefer, you can also prefix commands with uv without activating:
231
+
232
+ ```bash
233
+ uv run pytest -n auto
234
+ uv run ruff check .
235
+ uv run pyright
236
+ ```
237
+
238
+ #### 6. Makefile (shortcuts)
239
+ You can use the Makefile to automate common tasks:
240
+
241
+ ```bash
242
+ make setup # create .venv with uv and pin Python
243
+ make install # sync deps (from uv.lock) + install project (-e .)
244
+ make test # pytest -n auto
245
+ make lint # ruff check .
246
+ make format # ruff --fix
247
+ make typecheck # pyright (and mypy if configured)
248
+ make pre-commit-install # install the pre-commit hook
249
+ make pre-commit-run # run pre-commit on all files
250
+
251
+ # Rust-specific shortcuts
252
+ make rust-setup # install rustup toolchain
253
+ make rust-build # build PyO3 extension with maturin
254
+ make bench-small # run small benchmark
255
+ make bench-big # run large benchmark
256
+ ```
257
+
258
+ > [!NOTE]
259
+ > Tox in this project uses the "tox-uv" plugin. When running `make tox` or `tox`, missing Python interpreters can be provisioned automatically via uv (no need to pre-install all versions). This makes local setup faster.
199
260
 
200
261
  ## 💡 Usage
201
262
 
202
- The library is designed to be easy to use and integrate with your own projects. Below is an example of how you can
203
- configure and run GSP-Py.
263
+ The library is designed to be easy to use and integrate with your own projects. You can use GSP-Py either programmatically (Python API) or directly from the command line (CLI).
264
+
265
+ ---
266
+
267
+ ## 🚦 Using GSP-Py via CLI
268
+
269
+ GSP-Py provides a command-line interface (CLI) for running the Generalized Sequential Pattern algorithm on transactional data. This allows you to mine frequent sequential patterns from JSON or CSV files without writing any code.
270
+
271
+ ### Installation
272
+
273
+ First, install GSP-Py (if not already installed):
274
+
275
+ ```bash
276
+ pip install gsppy
277
+ ```
278
+
279
+ This will make the `gsppy` CLI command available in your environment.
280
+
281
+ ### Preparing Your Data
282
+
283
+ Your input file should be either:
284
+
285
+ - **JSON**: A list of transactions, each transaction is a list of items. Example:
286
+ ```json
287
+ [
288
+ ["Bread", "Milk"],
289
+ ["Bread", "Diaper", "Beer", "Eggs"],
290
+ ["Milk", "Diaper", "Beer", "Coke"],
291
+ ["Bread", "Milk", "Diaper", "Beer"],
292
+ ["Bread", "Milk", "Diaper", "Coke"]
293
+ ]
294
+ ```
295
+
296
+ - **CSV**: Each row is a transaction, items separated by commas. Example:
297
+ ```csv
298
+ Bread,Milk
299
+ Bread,Diaper,Beer,Eggs
300
+ Milk,Diaper,Beer,Coke
301
+ Bread,Milk,Diaper,Beer
302
+ Bread,Milk,Diaper,Coke
303
+ ```
304
+
305
+ ### Running the CLI
306
+
307
+ Use the following command to run GSP-Py on your data:
308
+
309
+ ```bash
310
+ gsppy --file path/to/transactions.json --min_support 0.3 --backend auto
311
+ ```
312
+
313
+ Or for CSV files:
314
+
315
+ ```bash
316
+ gsppy --file path/to/transactions.csv --min_support 0.3 --backend rust
317
+ ```
318
+
319
+ #### CLI Options
320
+
321
+ - `--file`: Path to your input file (JSON or CSV). **Required**.
322
+ - `--min_support`: Minimum support threshold as a fraction (e.g., `0.3` for 30%). Default is `0.2`.
323
+ - `--backend`: Backend to use for support counting. One of `auto` (default), `python`, `rust`, or `gpu`.
324
+ - `--verbose`: (Optional) Enable detailed output for debugging.
325
+
326
+ #### Example
327
+
328
+ Suppose you have a file `transactions.json` as shown above. To find patterns with at least 30% support:
329
+
330
+ ```bash
331
+ gsppy --file transactions.json --min_support 0.3
332
+ ```
333
+
334
+ Sample output:
335
+
336
+ ```
337
+ Pre-processing transactions...
338
+ Starting GSP algorithm with min_support=0.3...
339
+ Run 1: 6 candidates filtered to 5.
340
+ Run 2: 20 candidates filtered to 3.
341
+ Run 3: 2 candidates filtered to 2.
342
+ Run 4: 1 candidates filtered to 0.
343
+ GSP algorithm completed.
344
+ Frequent Patterns Found:
345
+
346
+ 1-Sequence Patterns:
347
+ Pattern: ('Bread',), Support: 4
348
+ Pattern: ('Milk',), Support: 4
349
+ Pattern: ('Diaper',), Support: 4
350
+ Pattern: ('Beer',), Support: 3
351
+ Pattern: ('Coke',), Support: 2
352
+
353
+ 2-Sequence Patterns:
354
+ Pattern: ('Bread', 'Milk'), Support: 3
355
+ Pattern: ('Milk', 'Diaper'), Support: 3
356
+ Pattern: ('Diaper', 'Beer'), Support: 3
357
+
358
+ 3-Sequence Patterns:
359
+ Pattern: ('Bread', 'Milk', 'Diaper'), Support: 2
360
+ Pattern: ('Milk', 'Diaper', 'Beer'), Support: 2
361
+ ```
362
+
363
+ #### Error Handling
364
+
365
+ - If the file does not exist or is in an unsupported format, a clear error message will be shown.
366
+ - The `min_support` value must be between 0.0 and 1.0 (exclusive of 0.0, inclusive of 1.0).
367
+
368
+ #### Advanced: Verbose Output
369
+
370
+ To see detailed logs for debugging, add the `--verbose` flag:
371
+
372
+ ```bash
373
+ gsppy --file transactions.json --min_support 0.3 --verbose
374
+ ```
375
+
376
+ ---
377
+
378
+ The following example shows how to use GSP-Py programmatically in Python:
204
379
 
205
380
  ### Example Input Data
206
381
 
@@ -301,20 +476,20 @@ improvement? [Open a discussion or issue!](https://github.com/jacksonpradolima/g
301
476
  We welcome contributions from the community! If you'd like to help improve GSP-Py, read
302
477
  our [CONTRIBUTING.md](CONTRIBUTING.md) guide to get started.
303
478
 
304
- Development dependencies (e.g., testing and linting tools) are automatically managed using Rye. To install
305
- these dependencies and set up the environment, run:
479
+ Development dependencies (e.g., testing and linting tools) are handled via uv.
480
+ To set up and run the main tasks:
306
481
 
307
482
  ```bash
308
- rye sync
483
+ uv venv .venv
484
+ uv sync --frozen --extra dev
485
+ uv pip install -e .
486
+
487
+ # Run tasks
488
+ uv run pytest -n auto
489
+ uv run ruff check .
490
+ uv run pyright
309
491
  ```
310
492
 
311
- After syncing, you can run the following scripts using Rye for development tasks:
312
-
313
- - Run tests (in parallel): `rye run test`
314
- - Lint code: `rye run lint`
315
- - Type-check: `rye run typecheck`
316
- - Format code: `rye run format`
317
-
318
493
  ### General Steps:
319
494
 
320
495
  1. Fork the repository.
@@ -0,0 +1,10 @@
1
+ gsppy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ gsppy/accelerate.py,sha256=YO3YQFzo2VAC6IXOTnQnOajkZO7SabkieGb1IPgWdSI,10407
3
+ gsppy/cli.py,sha256=wsGoc_utxpRfgCF9vPOAyLDTOJZ8NaiwiUny5VyIYvQ,6567
4
+ gsppy/gsp.py,sha256=GCHFhOu-DyHEPsse_OXzf9IaZoigF8ouRqgn_OsZBvA,14855
5
+ gsppy/utils.py,sha256=YlV0F64lnd2Xymf6XnYr6mMLYWV2f2yjaHkZbAS1Qs0,3362
6
+ gsppy-3.0.0.dist-info/METADATA,sha256=5Q6iWC2tabQyDFjEztrgK4nsOWzz4z21oSXmFvQ0wU8,17670
7
+ gsppy-3.0.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
8
+ gsppy-3.0.0.dist-info/entry_points.txt,sha256=smvmcIWk424ARIGKOC_BM42hpT_SptKPcIeqs-8u8lM,41
9
+ gsppy-3.0.0.dist-info/licenses/LICENSE,sha256=AlXanKSqFzo_o-87gp3Qw3XzbmnfxYy7O0xJOcQGWJo,1086
10
+ gsppy-3.0.0.dist-info/RECORD,,
@@ -1,9 +0,0 @@
1
- gsppy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- gsppy/cli.py,sha256=YxBL341LJzb6EN-RBkhW3o4ZCexOGiQXq_aRovKccA8,6790
3
- gsppy/gsp.py,sha256=CUCC1W5GGlGbWkC_td0qDfnSJiuzbWoMapR0qciejw8,13800
4
- gsppy/utils.py,sha256=gOT3USxmC0MrBnSHOQ8avxghWmjQe59hS4jNQ3eiENQ,3363
5
- gsppy-2.3.0.dist-info/METADATA,sha256=bgEnT2H2FGQx_ha4Unqz40qVVu-IICaCkLJ0ppOwUgs,12941
6
- gsppy-2.3.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
7
- gsppy-2.3.0.dist-info/entry_points.txt,sha256=smvmcIWk424ARIGKOC_BM42hpT_SptKPcIeqs-8u8lM,41
8
- gsppy-2.3.0.dist-info/licenses/LICENSE,sha256=AlXanKSqFzo_o-87gp3Qw3XzbmnfxYy7O0xJOcQGWJo,1086
9
- gsppy-2.3.0.dist-info/RECORD,,
File without changes