gsppy 2.3.0__py3-none-any.whl → 3.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gsppy/accelerate.py ADDED
@@ -0,0 +1,269 @@
1
+ """
2
+ Optional acceleration layer for GSP support counting.
3
+
4
+ This module attempts to use a Rust extension for the hot loop
5
+ (support counting via contiguous subsequence search). If the Rust
6
+ module is unavailable, it gracefully falls back to the pure-Python
7
+ implementation.
8
+
9
+ Control backend via env var:
10
+ - GSPPY_BACKEND=rust -> require Rust extension (raise if missing)
11
+ - GSPPY_BACKEND=python -> force Python implementation
12
+ - unset/other -> try Rust first, then fallback to Python
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import os
18
+ from typing import Any, Dict, List, Tuple, Optional, cast
19
+
20
+ from .utils import split_into_batches, is_subsequence_in_list
21
+
22
# Optional GPU (CuPy) support: probe at import time whether CuPy can be
# imported and at least one CUDA device is visible.
_gpu_available = False
try:  # pragma: no cover - optional dependency path
    import cupy as _cp_mod  # type: ignore[import-not-found]
except Exception:  # pragma: no cover - optional dependency path
    cp = None  # type: ignore[assignment]
else:
    cp = cast(Any, _cp_mod)
    try:
        _gpu_available = cp.cuda.runtime.getDeviceCount() > 0  # type: ignore[attr-defined]
    except Exception:
        _gpu_available = False
36
+
37
# Per-process cache of encoded transactions, keyed by id() of the transactions
# list. Besides the encoding we store the transaction count and the first
# transaction: CPython may reuse an id once the original object is collected,
# so a bare id() key alone could serve a stale encoding for a different list.
_ENCODED_CACHE: Dict[
    int, Tuple[List[List[int]], Dict[int, str], Dict[str, int], int, Optional[Tuple[str, ...]]]
] = {}


def _get_encoded_transactions(
    transactions: List[Tuple[str, ...]],
) -> Tuple[List[List[int]], Dict[int, str], Dict[str, int]]:
    """Return encoded transactions using a small in-memory cache.

    Cache key is the id() of the transactions list. A cached entry is trusted
    only when the stored transaction count and first transaction still match,
    which guards against id() reuse after the original list was garbage
    collected. This assumes transactions aren't mutated after GSP is
    constructed (which is the common case).

    Parameters:
        transactions: List of transactions where each transaction is a tuple of strings.

    Returns:
        A tuple of (encoded transactions, id->string mapping, string->id mapping).
    """
    key = id(transactions)
    cached = _ENCODED_CACHE.get(key)
    if cached is not None:
        enc_tx, inv_vocab, vocab, n_tx, first = cached
        # Cheap staleness check: same length and same first transaction.
        if n_tx == len(transactions) and (n_tx == 0 or transactions[0] == first):
            return enc_tx, inv_vocab, vocab
    enc_tx, inv_vocab, vocab = _encode_transactions(transactions)
    _ENCODED_CACHE[key] = (
        enc_tx,
        inv_vocab,
        vocab,
        len(transactions),
        transactions[0] if transactions else None,
    )
    return enc_tx, inv_vocab, vocab
59
+
60
+
61
# Probe for the optional Rust extension that provides the fast support-counting path.
_rust_available = False
_compute_supports_rust: Any = None
try:
    from _gsppy_rust import compute_supports_py as _compute_supports_rust  # type: ignore
except Exception:
    _compute_supports_rust = None
    _rust_available = False
else:
    _rust_available = True
71
+
72
+
73
def _env_backend() -> str:
    """Return the backend name from the GSPPY_BACKEND env var, lowercased ('auto' if unset)."""
    value = os.environ.get("GSPPY_BACKEND", "auto")
    return value.lower()
75
+
76
+
77
def _encode_transactions(transactions: List[Tuple[str, ...]]) -> Tuple[List[List[int]], Dict[int, str], Dict[str, int]]:
    """Encode transactions of strings into integer IDs.

    Items receive consecutive integer ids in first-seen order.

    Parameters:
        transactions: List of transactions where each transaction is a tuple of strings.

    Returns:
        A tuple of:
        - enc_tx: List[List[int]] encoded transactions
        - inv_vocab: Dict[int, str] mapping back from id to original string
        - vocab: Dict[str, int] mapping from original string to integer id
    """
    vocab: Dict[str, int] = {}
    enc_tx: List[List[int]] = []
    for transaction in transactions:
        # setdefault assigns the next free id (len(vocab)) on first sight.
        enc_tx.append([vocab.setdefault(item, len(vocab)) for item in transaction])
    inv_vocab = {idx: item for item, idx in vocab.items()}
    return enc_tx, inv_vocab, vocab
100
+
101
+
102
def _encode_candidates(candidates: List[Tuple[str, ...]], vocab: Dict[str, int]) -> List[List[int]]:
    """Encode candidate patterns using a provided vocabulary mapping.

    Raises KeyError if a candidate contains an item absent from ``vocab``.
    """
    encoded: List[List[int]] = []
    for candidate in candidates:
        encoded.append([vocab[item] for item in candidate])
    return encoded
105
+
106
+
107
def _support_counts_gpu_singletons(
    enc_tx: List[List[int]],
    cand_ids: List[int],
    min_support_abs: int,
    vocab_size: int,
) -> List[Tuple[List[int], int]]:
    """GPU-accelerated support counts for singleton candidates using CuPy.

    Counts, for each candidate item id, how many transactions contain it.
    Items are deduplicated per transaction on the CPU so that each transaction
    contributes at most once, then a single bincount runs on the GPU.

    Parameters:
        enc_tx: Integer-encoded transactions.
        cand_ids: Candidate singleton item ids.
        min_support_abs: Absolute support threshold; candidates below it are dropped.
        vocab_size: Size of the id space (used as bincount minlength).

    Returns:
        List of ([item_id], frequency) pairs meeting the threshold.
    """
    # One contribution per transaction, regardless of repeated items.
    deduped_rows: List[List[int]] = [list(set(row)) for row in enc_tx]
    if not deduped_rows:
        return []

    # Flatten to a single 1D id list before transferring to the device.
    flat_ids: List[int] = [item_id for row in deduped_rows for item_id in row]
    if not flat_ids:
        return []

    device_ids = cp.asarray(flat_ids, dtype=cp.int32)  # type: ignore[name-defined]
    device_counts = cp.bincount(device_ids, minlength=vocab_size)  # type: ignore[attr-defined]
    host_counts: Any = device_counts.get()  # back to host as a NumPy array

    kept: List[Tuple[List[int], int]] = []
    for item_id in cand_ids:
        count = int(host_counts[item_id])
        if count >= min_support_abs:
            kept.append(([item_id], count))
    return kept
139
+
140
+
141
def support_counts_python(
    transactions: List[Tuple[str, ...]],
    candidates: List[Tuple[str, ...]],
    min_support_abs: int,
    batch_size: int = 100,
) -> Dict[Tuple[str, ...], int]:
    """Pure-Python fallback for support counting (single-process).

    Evaluates each candidate pattern's frequency across all transactions
    using the same contiguous-subsequence semantics as the Rust backend.

    Note: This implementation is single-process and optimized for simplicity.
    Heavy workloads may benefit from the Rust backend.
    """
    # Deliberately avoids multiprocessing to prevent import cycles.
    supports: Dict[Tuple[str, ...], int] = {}
    for chunk in split_into_batches(candidates, batch_size):
        for candidate in chunk:
            count = 0
            for transaction in transactions:
                if is_subsequence_in_list(candidate, transaction):
                    count += 1
            if count >= min_support_abs:
                supports[candidate] = count
    return supports
163
+
164
+
165
def support_counts(
    transactions: List[Tuple[str, ...]],
    candidates: List[Tuple[str, ...]],
    min_support_abs: int,
    batch_size: int = 100,
    backend: Optional[str] = None,
) -> Dict[Tuple[str, ...], int]:
    """Choose the best available backend for support counting.

    Backend selection is controlled by the `backend` argument when provided,
    otherwise by the env var GSPPY_BACKEND:
    - "rust": require Rust extension (raise if missing)
    - "gpu": try GPU path when available (currently singletons optimized),
      fall back to CPU for the rest
    - "python": force pure-Python fallback
    - otherwise: try Rust first and fall back to Python
    """
    choice = (backend or _env_backend()).lower()

    if choice == "gpu":
        if not _gpu_available:
            raise RuntimeError("GSPPY_BACKEND=gpu but CuPy GPU is not available")
        # Encode once, then split candidates by length: singletons go through
        # a single GPU bincount, everything else takes the CPU path.
        enc_tx, inv_vocab, vocab = _get_encoded_transactions(transactions)
        enc_cands = _encode_candidates(candidates, vocab)
        assert len(candidates) == len(enc_cands), "Encoded candidates length mismatch"

        singleton_pairs: List[Tuple[int, Tuple[str, ...]]] = []
        longer_pairs: List[Tuple[List[int], Tuple[str, ...]]] = []
        for original, encoded in zip(candidates, enc_cands):  # noqa: B905 - lengths checked above
            if len(encoded) == 1:
                singleton_pairs.append((encoded[0], original))
            else:
                longer_pairs.append((encoded, original))

        merged: Dict[Tuple[str, ...], int] = {}

        # GPU path for singletons.
        if singleton_pairs:
            vocab_size = max(vocab.values()) + 1 if vocab else 0
            gpu_hits = _support_counts_gpu_singletons(
                enc_tx=enc_tx,
                cand_ids=[item_id for item_id, _ in singleton_pairs],
                min_support_abs=min_support_abs,
                vocab_size=vocab_size,
            )
            # Map integer ids back to the original string candidates.
            id_to_candidate: Dict[int, Tuple[str, ...]] = dict(singleton_pairs)
            for encoded, freq in gpu_hits:
                merged[id_to_candidate[encoded[0]]] = int(freq)

        # Longer candidates: prefer Rust, otherwise pure Python.
        if longer_pairs:
            handled = False
            if _rust_available:
                try:
                    rust_res = cast(
                        List[Tuple[List[int], int]],
                        _compute_supports_rust(enc_tx, [enc for enc, _ in longer_pairs], int(min_support_abs)),
                    )
                    for encoded, freq in rust_res:
                        merged[tuple(inv_vocab[i] for i in encoded)] = int(freq)
                    handled = True
                except Exception:
                    handled = False
            if not handled:
                merged.update(
                    support_counts_python(
                        transactions, [orig for _, orig in longer_pairs], min_support_abs, batch_size
                    )
                )

        return merged

    if choice == "python":
        return support_counts_python(transactions, candidates, min_support_abs, batch_size)

    if choice == "rust":
        if not _rust_available:
            raise RuntimeError("GSPPY_BACKEND=rust but Rust extension _gsppy_rust is not available")
        enc_tx, inv_vocab, vocab = _get_encoded_transactions(transactions)
        enc_cands = _encode_candidates(candidates, vocab)
        rust_out = cast(List[Tuple[List[int], int]], _compute_supports_rust(enc_tx, enc_cands, int(min_support_abs)))
        return {tuple(inv_vocab[i] for i in encoded): int(freq) for encoded, freq in rust_out}

    # auto: try Rust first, silently fall back to Python on any failure.
    if _rust_available:
        enc_tx, inv_vocab, vocab = _get_encoded_transactions(transactions)
        enc_cands = _encode_candidates(candidates, vocab)
        try:
            auto_out = cast(
                List[Tuple[List[int], int]], _compute_supports_rust(enc_tx, enc_cands, int(min_support_abs))
            )
            return {tuple(inv_vocab[i] for i in encoded): int(freq) for encoded, freq in auto_out}
        except Exception:
            pass

    return support_counts_python(transactions, candidates, min_support_abs, batch_size)
gsppy/cli.py CHANGED
@@ -27,14 +27,16 @@ Key Features:
27
27
  This CLI empowers users to perform sequential pattern mining on transactional data efficiently through
28
28
  a simple command-line interface.
29
29
  """
30
+
30
31
  import os
31
32
  import csv
32
33
  import sys
33
34
  import json
34
35
  import logging
35
- import argparse
36
36
  from typing import Dict, List, Tuple
37
37
 
38
+ import click
39
+
38
40
  from gsppy.gsp import GSP
39
41
 
40
42
  # Configure logging
@@ -71,7 +73,7 @@ def read_transactions_from_json(file_path: str) -> List[List[str]]:
71
73
  ValueError: If the file cannot be read or does not contain valid JSON.
72
74
  """
73
75
  try:
74
- with open(file_path, 'r', encoding='utf-8') as f:
76
+ with open(file_path, "r", encoding="utf-8") as f:
75
77
  transactions: List[List[str]] = json.load(f)
76
78
  return transactions
77
79
  except Exception as e:
@@ -95,7 +97,7 @@ def read_transactions_from_csv(file_path: str) -> List[List[str]]:
95
97
  """
96
98
  try:
97
99
  transactions: List[List[str]] = []
98
- with open(file_path, newline='', encoding='utf-8') as csvfile:
100
+ with open(file_path, newline="", encoding="utf-8") as csvfile:
99
101
  reader = csv.reader(csvfile)
100
102
  for row in reader:
101
103
  # Check if the row is empty
@@ -138,65 +140,56 @@ def detect_and_read_file(file_path: str) -> List[List[str]]:
138
140
  raise ValueError("Unsupported file format. Please provide a JSON or CSV file.")
139
141
 
140
142
 
141
- def main() -> None:
143
+ # Click-based CLI
144
+ @click.command()
145
+ @click.option(
146
+ "--file",
147
+ "file_path",
148
+ required=True,
149
+ type=click.Path(exists=True),
150
+ help="Path to a JSON or CSV file containing transactions.",
151
+ )
152
+ @click.option(
153
+ "--min_support",
154
+ default=0.2,
155
+ show_default=True,
156
+ type=float,
157
+ help="Minimum support threshold as a fraction of total transactions.",
158
+ )
159
+ @click.option(
160
+ "--backend",
161
+ type=click.Choice(["auto", "python", "rust", "gpu"], case_sensitive=False),
162
+ default="auto",
163
+ show_default=True,
164
+ help="Backend to use for support counting.",
165
+ )
166
+ @click.option("--verbose", is_flag=True, help="Enable verbose output for debugging purposes.")
167
+ def main(file_path: str, min_support: float, backend: str, verbose: bool) -> None:
142
168
  """
143
- Main function to handle CLI input and run the GSP algorithm.
144
-
145
- Arguments:
146
- - `--file` (str): Path to a JSON or CSV file containing transactions.
147
- - `--min_support` (float): Minimum support threshold (default: 0.2).
169
+ Run the GSP algorithm on transactional data from a file.
148
170
  """
149
- parser = argparse.ArgumentParser(
150
- description="GSP (Generalized Sequential Pattern) Algorithm - "
151
- "Find frequent sequential patterns in transactional data."
152
- )
153
-
154
- # Single file argument
155
- parser.add_argument(
156
- '--file',
157
- type=str,
158
- required=True,
159
- help='Path to a JSON or CSV file containing transactions (e.g., [["A", "B"], ["B", "C"]] '
160
- 'or CSV rows per transaction)'
161
- )
162
-
163
- # Minimum support argument
164
- parser.add_argument(
165
- '--min_support',
166
- type=float,
167
- default=0.2,
168
- help="Minimum support threshold as a fraction of total transactions (default: 0.2)"
169
- )
170
-
171
- # Verbose output argument
172
- parser.add_argument(
173
- '--verbose',
174
- action='store_true',
175
- help='Enable verbose output for debugging purposes.'
176
- )
177
-
178
- # Parse arguments
179
- args = parser.parse_args()
180
-
181
- # Setup logging verbosity
182
- setup_logging(args.verbose)
171
+ setup_logging(verbose)
183
172
 
184
173
  # Automatically detect and load transactions
185
174
  try:
186
- transactions = detect_and_read_file(args.file)
175
+ transactions = detect_and_read_file(file_path)
187
176
  except ValueError as e:
188
177
  logger.error(f"Error: {e}")
189
- return
178
+ sys.exit(1)
190
179
 
191
180
  # Check min_support
192
- if args.min_support <= 0.0 or args.min_support > 1.0:
181
+ if min_support <= 0.0 or min_support > 1.0:
193
182
  logger.error("Error: min_support must be in the range (0.0, 1.0].")
194
- return
183
+ sys.exit(1)
184
+
185
+ # Select backend for acceleration layer
186
+ if backend and backend.lower() != "auto":
187
+ os.environ["GSPPY_BACKEND"] = backend.lower()
195
188
 
196
189
  # Initialize and run GSP algorithm
197
190
  try:
198
191
  gsp = GSP(transactions)
199
- patterns: List[Dict[Tuple[str, ...], int]] = gsp.search(min_support=args.min_support)
192
+ patterns: List[Dict[Tuple[str, ...], int]] = gsp.search(min_support=min_support)
200
193
  logger.info("Frequent Patterns Found:")
201
194
  for i, level in enumerate(patterns, start=1):
202
195
  logger.info(f"\n{i}-Sequence Patterns:")
@@ -204,7 +197,8 @@ def main() -> None:
204
197
  logger.info(f"Pattern: {pattern}, Support: {support}")
205
198
  except Exception as e:
206
199
  logger.error(f"Error executing GSP algorithm: {e}")
200
+ sys.exit(1)
207
201
 
208
202
 
209
- if __name__ == '__main__':
203
+ if __name__ == "__main__":
210
204
  main()
gsppy/gsp.py CHANGED
@@ -34,11 +34,11 @@ Example Usage:
34
34
  ```python
35
35
  # Define the transactional dataset
36
36
  transactions = [
37
- ['Bread', 'Milk'],
38
- ['Bread', 'Diaper', 'Beer', 'Eggs'],
39
- ['Milk', 'Diaper', 'Beer', 'Coke'],
40
- ['Bread', 'Milk', 'Diaper', 'Beer'],
41
- ['Bread', 'Milk', 'Diaper', 'Coke']
37
+ ["Bread", "Milk"],
38
+ ["Bread", "Diaper", "Beer", "Eggs"],
39
+ ["Milk", "Diaper", "Beer", "Coke"],
40
+ ["Bread", "Milk", "Diaper", "Beer"],
41
+ ["Bread", "Milk", "Diaper", "Coke"],
42
42
  ]
43
43
 
44
44
  # Initialize GSP with the transactional dataset
@@ -84,13 +84,16 @@ Version:
84
84
  --------
85
85
  - Current Version: 2.0
86
86
  """
87
+
88
+ import math
87
89
  import logging
88
90
  import multiprocessing as mp
89
- from typing import Any, Dict, List, Tuple
91
+ from typing import Dict, List, Tuple, Optional
90
92
  from itertools import chain
91
93
  from collections import Counter
92
94
 
93
95
  from gsppy.utils import split_into_batches, is_subsequence_in_list, generate_candidates_from_previous
96
+ from gsppy.accelerate import support_counts as support_counts_accel
94
97
 
95
98
  logger = logging.getLogger(__name__)
96
99
 
@@ -171,14 +174,13 @@ class GSP:
171
174
  self.max_size = max(len(item) for item in raw_transactions)
172
175
  self.transactions: List[Tuple[str, ...]] = [tuple(transaction) for transaction in raw_transactions]
173
176
  counts: Counter[str] = Counter(chain.from_iterable(raw_transactions))
174
- self.unique_candidates: list[tuple[str, Any]] = [(item,) for item in counts.keys()]
177
+ # Start with singleton candidates (1-sequences)
178
+ self.unique_candidates: List[Tuple[str, ...]] = [(item,) for item in counts.keys()]
175
179
  logger.debug("Unique candidates: %s", self.unique_candidates)
176
180
 
177
181
  @staticmethod
178
182
  def _worker_batch(
179
- batch: List[Tuple[str, ...]],
180
- transactions: List[Tuple[str, ...]],
181
- min_support: int
183
+ batch: List[Tuple[str, ...]], transactions: List[Tuple[str, ...]], min_support: int
182
184
  ) -> List[Tuple[Tuple[str, ...], int]]:
183
185
  """
184
186
  Evaluate a batch of candidate sequences to compute their support.
@@ -204,20 +206,15 @@ class GSP:
204
206
  results.append((item, frequency))
205
207
  return results
206
208
 
207
- def _support(
208
- self,
209
- items: List[Tuple[str, ...]], min_support: float = 0, batch_size: int = 100
209
+ def _support_python(
210
+ self, items: List[Tuple[str, ...]], min_support: int = 0, batch_size: int = 100
210
211
  ) -> Dict[Tuple[str, ...], int]:
211
212
  """
212
- Calculate support counts for candidate sequences, using parallel processing.
213
-
214
- To improve efficiency, candidate sequences are processed in parallel batches using the
215
- `multiprocessing` module. Each sequence is checked against transactions, and its support
216
- count is calculated.
213
+ Calculate support counts for candidate sequences using Python multiprocessing.
217
214
 
218
215
  Parameters:
219
216
  items (List[Tuple]): Candidate sequences to evaluate.
220
- min_support (float): Absolute minimum support count required for a sequence to be considered frequent.
217
+ min_support (int): Absolute minimum support count required for a sequence to be considered frequent.
221
218
  batch_size (int): Maximum number of candidates to process per batch.
222
219
 
223
220
  Returns:
@@ -231,12 +228,30 @@ class GSP:
231
228
  with mp.Pool(processes=mp.cpu_count()) as pool:
232
229
  batch_results = pool.starmap(
233
230
  self._worker_batch, # Process a batch at a time
234
- [(batch, self.transactions, min_support) for batch in batches]
231
+ [(batch, self.transactions, min_support) for batch in batches],
235
232
  )
236
233
 
237
234
  # Flatten the list of results and convert to a dictionary
238
235
  return {item: freq for batch in batch_results for item, freq in batch}
239
236
 
237
+ def _support(
238
+ self,
239
+ items: List[Tuple[str, ...]],
240
+ min_support: int = 0,
241
+ batch_size: int = 100,
242
+ backend: Optional[str] = None,
243
+ ) -> Dict[Tuple[str, ...], int]:
244
+ """
245
+ Calculate support counts for candidate sequences using the fastest available backend.
246
+ This will try the Rust extension if available (and configured), otherwise fall back to
247
+ the Python multiprocessing implementation.
248
+ """
249
+ try:
250
+ return support_counts_accel(self.transactions, items, min_support, batch_size, backend=backend)
251
+ except Exception:
252
+ # Fallback to Python implementation on any acceleration failure
253
+ return self._support_python(items, min_support, batch_size)
254
+
240
255
  def _print_status(self, run: int, candidates: List[Tuple[str, ...]]) -> None:
241
256
  """
242
257
  Log progress information for the current GSP iteration.
@@ -248,10 +263,14 @@ class GSP:
248
263
  run (int): Current k-sequence generation level (e.g., 1 for 1-item sequences).
249
264
  candidates (List[Tuple]): Candidate sequences generated at this level.
250
265
  """
251
- logger.info("Run %d: %d candidates filtered to %d.",
252
- run, len(candidates), len(self.freq_patterns[run - 1]))
266
+ logger.info("Run %d: %d candidates filtered to %d.", run, len(candidates), len(self.freq_patterns[run - 1]))
253
267
 
254
- def search(self, min_support: float = 0.2) -> List[Dict[Tuple[str, ...], int]]:
268
+ def search(
269
+ self,
270
+ min_support: float = 0.2,
271
+ max_k: Optional[int] = None,
272
+ backend: Optional[str] = None,
273
+ ) -> List[Dict[Tuple[str, ...], int]]:
255
274
  """
256
275
  Execute the Generalized Sequential Pattern (GSP) mining algorithm.
257
276
 
@@ -280,9 +299,10 @@ class GSP:
280
299
  if not 0.0 < min_support <= 1.0:
281
300
  raise ValueError("Minimum support must be in the range (0.0, 1.0]")
282
301
 
283
- min_support = len(self.transactions) * min_support
302
+ logger.info(f"Starting GSP algorithm with min_support={min_support}...")
284
303
 
285
- logger.info("Starting GSP algorithm with min_support=%.2f...", min_support)
304
+ # Convert fractional support to absolute count (ceil to preserve threshold semantics)
305
+ abs_min_support = int(math.ceil(len(self.transactions) * float(min_support)))
286
306
 
287
307
  # the set of frequent 1-sequence: all singleton sequences
288
308
  # (k-itemsets/k-sequence = 1) - Initially, every item in DB is a
@@ -291,7 +311,7 @@ class GSP:
291
311
 
292
312
  # scan transactions to collect support count for each candidate
293
313
  # sequence & filter
294
- self.freq_patterns.append(self._support(candidates, min_support))
314
+ self.freq_patterns.append(self._support(candidates, abs_min_support, backend=backend))
295
315
 
296
316
  # (k-itemsets/k-sequence = 1)
297
317
  k_items = 1
@@ -299,7 +319,10 @@ class GSP:
299
319
  self._print_status(k_items, candidates)
300
320
 
301
321
  # repeat until no frequent sequence or no candidate can be found
302
- while self.freq_patterns[k_items - 1] and k_items + 1 <= self.max_size:
322
+ # If max_k is provided, stop generating candidates beyond that length
323
+ while (
324
+ self.freq_patterns[k_items - 1] and k_items + 1 <= self.max_size and (max_k is None or k_items + 1 <= max_k)
325
+ ):
303
326
  k_items += 1
304
327
 
305
328
  # Generate candidate sets Ck (set of candidate k-sequences) -
@@ -309,7 +332,7 @@ class GSP:
309
332
 
310
333
  # candidate pruning - eliminates candidates who are not potentially
311
334
  # frequent (using support as threshold)
312
- self.freq_patterns.append(self._support(candidates, min_support))
335
+ self.freq_patterns.append(self._support(candidates, abs_min_support, backend=backend))
313
336
 
314
337
  self._print_status(k_items, candidates)
315
338
  logger.info("GSP algorithm completed.")
gsppy/utils.py CHANGED
@@ -20,6 +20,7 @@ Main functionalities:
20
20
  These utilities are designed to support sequence processing tasks and can be
21
21
  adapted to various domains, such as data mining, recommendation systems, and sequence analysis.
22
22
  """
23
+
23
24
  from typing import Dict, List, Tuple, Sequence, Generator
24
25
  from functools import lru_cache
25
26
  from itertools import product
@@ -39,7 +40,7 @@ def split_into_batches(
39
40
  Generator[Sequence[Tuple], None, None]: A generator yielding batches of items.
40
41
  """
41
42
  for i in range(0, len(items), batch_size):
42
- yield items[i:i + batch_size]
43
+ yield items[i : i + batch_size]
43
44
 
44
45
 
45
46
  @lru_cache(maxsize=None)
@@ -65,12 +66,10 @@ def is_subsequence_in_list(subsequence: Tuple[str, ...], sequence: Tuple[str, ..
65
66
  return False
66
67
 
67
68
  # Use any to check if any slice matches the sequence
68
- return any(sequence[i:i + len_sub] == subsequence for i in range(len_seq - len_sub + 1))
69
+ return any(sequence[i : i + len_sub] == subsequence for i in range(len_seq - len_sub + 1))
69
70
 
70
71
 
71
- def generate_candidates_from_previous(
72
- prev_patterns: Dict[Tuple[str, ...], int]
73
- ) -> List[Tuple[str, ...]]:
72
+ def generate_candidates_from_previous(prev_patterns: Dict[Tuple[str, ...], int]) -> List[Tuple[str, ...]]:
74
73
  """
75
74
  Generate joined candidates from the previous level's frequent patterns.
76
75
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gsppy
3
- Version: 2.3.0
3
+ Version: 3.0.0
4
4
  Summary: GSP (Generalized Sequence Pattern) algorithm in Python
5
5
  Project-URL: Homepage, https://github.com/jacksonpradolima/gsp-py
6
6
  Author-email: Jackson Antonio do Prado Lima <jacksonpradolima@gmail.com>
@@ -32,31 +32,34 @@ Classifier: Intended Audience :: Science/Research
32
32
  Classifier: License :: OSI Approved :: MIT License
33
33
  Classifier: Natural Language :: English
34
34
  Classifier: Operating System :: OS Independent
35
- Classifier: Programming Language :: Python :: 3.8
36
- Classifier: Programming Language :: Python :: 3.9
37
35
  Classifier: Programming Language :: Python :: 3.10
38
36
  Classifier: Programming Language :: Python :: 3.11
39
37
  Classifier: Programming Language :: Python :: 3.12
40
38
  Classifier: Programming Language :: Python :: 3.13
41
39
  Classifier: Topic :: Scientific/Engineering :: Information Analysis
42
40
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
43
- Requires-Python: >=3.8
41
+ Requires-Python: >=3.10
42
+ Requires-Dist: click>=8.0.0
44
43
  Provides-Extra: dev
45
- Requires-Dist: cython==3.0.11; extra == 'dev'
44
+ Requires-Dist: cython==3.1.3; extra == 'dev'
46
45
  Requires-Dist: hatch==1.14.0; extra == 'dev'
47
46
  Requires-Dist: hatchling==1.27.0; extra == 'dev'
48
- Requires-Dist: mypy==1.14.1; extra == 'dev'
49
- Requires-Dist: pylint==3.3.3; extra == 'dev'
50
- Requires-Dist: pyright==1.1.391; extra == 'dev'
47
+ Requires-Dist: mypy==1.18.1; extra == 'dev'
48
+ Requires-Dist: pylint==3.2.7; extra == 'dev'
49
+ Requires-Dist: pyright==1.1.405; extra == 'dev'
51
50
  Requires-Dist: pytest-benchmark==5.1.0; extra == 'dev'
52
- Requires-Dist: pytest-cov==6.0.0; extra == 'dev'
51
+ Requires-Dist: pytest-cov==5.0.0; extra == 'dev'
53
52
  Requires-Dist: pytest==8.3.4; extra == 'dev'
54
- Requires-Dist: ruff==0.8.5; extra == 'dev'
55
- Requires-Dist: tox==4.23.2; extra == 'dev'
53
+ Requires-Dist: ruff==0.13.0; extra == 'dev'
54
+ Requires-Dist: tox==4.30.2; extra == 'dev'
55
+ Provides-Extra: gpu
56
+ Requires-Dist: cupy<14,>=11; extra == 'gpu'
57
+ Provides-Extra: rust
58
+ Requires-Dist: maturin==1.6.0; extra == 'rust'
56
59
  Description-Content-Type: text/markdown
57
60
 
58
61
  [![PyPI License](https://img.shields.io/pypi/l/gsppy.svg?style=flat-square)]()
59
- ![](https://img.shields.io/badge/python-3.8+-blue.svg)
62
+ ![](https://img.shields.io/badge/python-3.10+-blue.svg)
60
63
  [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3333987.svg)](https://doi.org/10.5281/zenodo.3333987)
61
64
 
62
65
  [![PyPI Downloads](https://img.shields.io/pypi/dm/gsppy.svg?style=flat-square)](https://pypi.org/project/gsppy/)
@@ -72,7 +75,7 @@ Description-Content-Type: text/markdown
72
75
  Sequence Pattern (GSP)** algorithm. Ideal for market basket analysis, temporal mining, and user journey discovery.
73
76
 
74
77
  > [!IMPORTANT]
75
- > GSP-Py is compatible with Python 3.8 and later versions!
78
+ > GSP-Py is compatible with Python 3.10 and later versions!
76
79
 
77
80
  ---
78
81
 
@@ -137,11 +140,7 @@ git clone https://github.com/jacksonpradolima/gsp-py.git
137
140
  cd gsp-py
138
141
  ```
139
142
 
140
- Refer to the [Developer Installation](#developer-installation) section and run:
141
-
142
- ```bash
143
- rye sync
144
- ```
143
+ Refer to the [Developer Installation](#developer-installation) section and run the setup with uv.
145
144
 
146
145
  ### Option 2: Install via `pip`
147
146
 
@@ -155,52 +154,228 @@ pip install gsppy
155
154
 
156
155
  ## 🛠️ Developer Installation
157
156
 
158
- This project uses [Rye](https://github.com/mitsuhiko/rye) for managing dependencies, running scripts, and setting up the environment. Follow these steps to install and set up Rye for this project:
157
+ This project now uses [uv](https://github.com/astral-sh/uv) for dependency management and virtual environments.
158
+
159
+ #### 1. Install uv
160
+ ```bash
161
+ curl -Ls https://astral.sh/uv/install.sh | bash
162
+ ```
163
+
164
+ Make sure uv is on your PATH (for most Linux setups):
165
+ ```bash
166
+ export PATH="$HOME/.local/bin:$PATH"
167
+ ```
168
+
169
+ #### 2. Set up the project environment
170
+ Create a local virtual environment and install dependencies from uv.lock (single source of truth):
159
171
 
160
- #### 1. Install Rye
161
- Run the following command to install Rye:
172
+ ```bash
173
+ uv venv .venv
174
+ uv sync --frozen --extra dev # uses uv.lock
175
+ uv pip install -e .
176
+ ```
177
+
178
+ #### 3. Optional: Enable Rust acceleration
162
179
 
180
+ Rust acceleration is optional and provides faster support counting using a PyO3 extension. Python fallback remains available.
181
+
182
+ Build the extension locally:
163
183
  ```bash
164
- curl -sSf https://rye.astral.sh/get | bash
184
+ make rust-build
165
185
  ```
166
186
 
167
- If the `~/.rye/bin` directory is not in your PATH, add the following line to your shell configuration file (e.g., `~/.bashrc`, `~/.zshrc`, etc.):
187
+ Select backend at runtime (auto tries Rust, then falls back to Python):
188
+ ```bash
189
+ export GSPPY_BACKEND=rust # or python, or unset for auto
190
+ ```
168
191
 
192
+ Run benchmarks (adjust to your machine):
169
193
  ```bash
170
- export PATH="$HOME/.rye/bin:$PATH"
194
+ make bench-small
195
+ make bench-big # may use significant memory/CPU
196
+ # or customize:
197
+ GSPPY_BACKEND=auto uv run --python .venv/bin/python --no-project \
198
+ python benchmarks/bench_support.py --n_tx 1000000 --tx_len 8 --vocab 50000 --min_support 0.2 --warmup
171
199
  ```
172
200
 
173
- Reload your shell configuration file:
201
+ #### 4. Optional: Enable GPU (CuPy) acceleration
202
+
203
+ GPU acceleration is experimental and currently optimizes singleton (k=1) support counting using CuPy.
204
+ Non-singleton candidates fall back to the Rust/Python backend.
205
+
206
+ Install the optional extra (choose a CuPy build that matches your CUDA/ROCm setup if needed):
174
207
 
175
208
  ```bash
176
- source ~/.bashrc # or `source ~/.zshrc`
209
+ uv run pip install -e .[gpu]
177
210
  ```
178
211
 
179
- #### 2. Set Up the Project Environment
180
- To configure the project environment and install its dependencies, run:
212
+ Select the GPU backend at runtime:
181
213
 
182
214
  ```bash
183
- rye sync
215
+ export GSPPY_BACKEND=gpu
184
216
  ```
185
217
 
186
- #### 3. Use Rye Scripts
187
- Once the environment is set up, you can run the following commands to simplify project tasks:
218
+ If a GPU isn't available, an error will be raised when GSPPY_BACKEND=gpu is set. Otherwise, the default "auto" uses CPU.
188
219
 
189
- - Run tests (in parallel): `rye run test`
190
- - Format code: `rye run format`
191
- - Lint code: `rye run lint`
192
- - Type-check: `rye run typecheck`
193
- - Add new dependencies: `rye add <package-name>`
194
- - Add new dependency to dev dependencies: `rye add --dev <package-name>`
220
+ #### 5. Common development tasks
221
+ After the environment is ready, activate it and run tasks with standard tools:
195
222
 
196
- #### Notes
197
- - Rye automatically reads dependencies and scripts from the `pyproject.toml` file.
198
- - No need for `requirements.txt`, as Rye manages all dependencies!
223
+ ```bash
224
+ source .venv/bin/activate
225
+ pytest -n auto
226
+ ruff check .
227
+ pyright
228
+ ```
229
+
230
+ If you prefer, you can also prefix commands with uv without activating:
231
+
232
+ ```bash
233
+ uv run pytest -n auto
234
+ uv run ruff check .
235
+ uv run pyright
236
+ ```
237
+
238
+ #### 6. Makefile (shortcuts)
239
+ You can use the Makefile to automate common tasks:
240
+
241
+ ```bash
242
+ make setup # create .venv with uv and pin Python
243
+ make install # sync deps (from uv.lock) + install project (-e .)
244
+ make test # pytest -n auto
245
+ make lint # ruff check .
246
+ make format # ruff --fix
247
+ make typecheck # pyright (and mypy if configured)
248
+ make pre-commit-install # install the pre-commit hook
249
+ make pre-commit-run # run pre-commit on all files
250
+
251
+ # Rust-specific shortcuts
252
+ make rust-setup # install rustup toolchain
253
+ make rust-build # build PyO3 extension with maturin
254
+ make bench-small # run small benchmark
255
+ make bench-big # run large benchmark
256
+ ```
257
+
258
+ > [!NOTE]
259
+ > Tox in this project uses the "tox-uv" plugin. When running `make tox` or `tox`, missing Python interpreters can be provisioned automatically via uv (no need to pre-install all versions). This makes local setup faster.
199
260
 
200
261
  ## 💡 Usage
201
262
 
202
- The library is designed to be easy to use and integrate with your own projects. Below is an example of how you can
203
- configure and run GSP-Py.
263
+ The library is designed to be easy to use and integrate with your own projects. You can use GSP-Py either programmatically (Python API) or directly from the command line (CLI).
264
+
265
+ ---
266
+
267
+ ## 🚦 Using GSP-Py via CLI
268
+
269
+ GSP-Py provides a command-line interface (CLI) for running the Generalized Sequential Pattern algorithm on transactional data. This allows you to mine frequent sequential patterns from JSON or CSV files without writing any code.
270
+
271
+ ### Installation
272
+
273
+ First, install GSP-Py (if not already installed):
274
+
275
+ ```bash
276
+ pip install gsppy
277
+ ```
278
+
279
+ This will make the `gsppy` CLI command available in your environment.
280
+
281
+ ### Preparing Your Data
282
+
283
+ Your input file should be either:
284
+
285
+ - **JSON**: A list of transactions, each transaction is a list of items. Example:
286
+ ```json
287
+ [
288
+ ["Bread", "Milk"],
289
+ ["Bread", "Diaper", "Beer", "Eggs"],
290
+ ["Milk", "Diaper", "Beer", "Coke"],
291
+ ["Bread", "Milk", "Diaper", "Beer"],
292
+ ["Bread", "Milk", "Diaper", "Coke"]
293
+ ]
294
+ ```
295
+
296
+ - **CSV**: Each row is a transaction, items separated by commas. Example:
297
+ ```csv
298
+ Bread,Milk
299
+ Bread,Diaper,Beer,Eggs
300
+ Milk,Diaper,Beer,Coke
301
+ Bread,Milk,Diaper,Beer
302
+ Bread,Milk,Diaper,Coke
303
+ ```
304
+
305
+ ### Running the CLI
306
+
307
+ Use the following command to run GSP-Py on your data:
308
+
309
+ ```bash
310
+ gsppy --file path/to/transactions.json --min_support 0.3 --backend auto
311
+ ```
312
+
313
+ Or for CSV files:
314
+
315
+ ```bash
316
+ gsppy --file path/to/transactions.csv --min_support 0.3 --backend rust
317
+ ```
318
+
319
+ #### CLI Options
320
+
321
+ - `--file`: Path to your input file (JSON or CSV). **Required**.
322
+ - `--min_support`: Minimum support threshold as a fraction (e.g., `0.3` for 30%). Default is `0.2`.
323
+ - `--backend`: Backend to use for support counting. One of `auto` (default), `python`, `rust`, or `gpu`.
324
+ - `--verbose`: (Optional) Enable detailed output for debugging.
325
+
326
+ #### Example
327
+
328
+ Suppose you have a file `transactions.json` as shown above. To find patterns with at least 30% support:
329
+
330
+ ```bash
331
+ gsppy --file transactions.json --min_support 0.3
332
+ ```
333
+
334
+ Sample output:
335
+
336
+ ```
337
+ Pre-processing transactions...
338
+ Starting GSP algorithm with min_support=0.3...
339
+ Run 1: 6 candidates filtered to 5.
340
+ Run 2: 20 candidates filtered to 3.
341
+ Run 3: 2 candidates filtered to 2.
342
+ Run 4: 1 candidates filtered to 0.
343
+ GSP algorithm completed.
344
+ Frequent Patterns Found:
345
+
346
+ 1-Sequence Patterns:
347
+ Pattern: ('Bread',), Support: 4
348
+ Pattern: ('Milk',), Support: 4
349
+ Pattern: ('Diaper',), Support: 4
350
+ Pattern: ('Beer',), Support: 3
351
+ Pattern: ('Coke',), Support: 2
352
+
353
+ 2-Sequence Patterns:
354
+ Pattern: ('Bread', 'Milk'), Support: 3
355
+ Pattern: ('Milk', 'Diaper'), Support: 3
356
+ Pattern: ('Diaper', 'Beer'), Support: 3
357
+
358
+ 3-Sequence Patterns:
359
+ Pattern: ('Bread', 'Milk', 'Diaper'), Support: 2
360
+ Pattern: ('Milk', 'Diaper', 'Beer'), Support: 2
361
+ ```
362
+
363
+ #### Error Handling
364
+
365
+ - If the file does not exist or is in an unsupported format, a clear error message will be shown.
366
+ - The `min_support` value must be between 0.0 and 1.0 (exclusive of 0.0, inclusive of 1.0).
367
+
368
+ #### Advanced: Verbose Output
369
+
370
+ To see detailed logs for debugging, add the `--verbose` flag:
371
+
372
+ ```bash
373
+ gsppy --file transactions.json --min_support 0.3 --verbose
374
+ ```
375
+
376
+ ---
377
+
378
+ The following example shows how to use GSP-Py programmatically in Python:
204
379
 
205
380
  ### Example Input Data
206
381
 
@@ -301,20 +476,20 @@ improvement? [Open a discussion or issue!](https://github.com/jacksonpradolima/g
301
476
  We welcome contributions from the community! If you'd like to help improve GSP-Py, read
302
477
  our [CONTRIBUTING.md](CONTRIBUTING.md) guide to get started.
303
478
 
304
- Development dependencies (e.g., testing and linting tools) are automatically managed using Rye. To install
305
- these dependencies and set up the environment, run:
479
+ Development dependencies (e.g., testing and linting tools) are handled via uv.
480
+ To set up and run the main tasks:
306
481
 
307
482
  ```bash
308
- rye sync
483
+ uv venv .venv
484
+ uv sync --frozen --extra dev
485
+ uv pip install -e .
486
+
487
+ # Run tasks
488
+ uv run pytest -n auto
489
+ uv run ruff check .
490
+ uv run pyright
309
491
  ```
310
492
 
311
- After syncing, you can run the following scripts using Rye for development tasks:
312
-
313
- - Run tests (in parallel): `rye run test`
314
- - Lint code: `rye run lint`
315
- - Type-check: `rye run typecheck`
316
- - Format code: `rye run format`
317
-
318
493
  ### General Steps:
319
494
 
320
495
  1. Fork the repository.
@@ -0,0 +1,10 @@
1
+ gsppy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ gsppy/accelerate.py,sha256=YO3YQFzo2VAC6IXOTnQnOajkZO7SabkieGb1IPgWdSI,10407
3
+ gsppy/cli.py,sha256=wsGoc_utxpRfgCF9vPOAyLDTOJZ8NaiwiUny5VyIYvQ,6567
4
+ gsppy/gsp.py,sha256=GCHFhOu-DyHEPsse_OXzf9IaZoigF8ouRqgn_OsZBvA,14855
5
+ gsppy/utils.py,sha256=YlV0F64lnd2Xymf6XnYr6mMLYWV2f2yjaHkZbAS1Qs0,3362
6
+ gsppy-3.0.0.dist-info/METADATA,sha256=5Q6iWC2tabQyDFjEztrgK4nsOWzz4z21oSXmFvQ0wU8,17670
7
+ gsppy-3.0.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
8
+ gsppy-3.0.0.dist-info/entry_points.txt,sha256=smvmcIWk424ARIGKOC_BM42hpT_SptKPcIeqs-8u8lM,41
9
+ gsppy-3.0.0.dist-info/licenses/LICENSE,sha256=AlXanKSqFzo_o-87gp3Qw3XzbmnfxYy7O0xJOcQGWJo,1086
10
+ gsppy-3.0.0.dist-info/RECORD,,
@@ -1,9 +0,0 @@
1
- gsppy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- gsppy/cli.py,sha256=YxBL341LJzb6EN-RBkhW3o4ZCexOGiQXq_aRovKccA8,6790
3
- gsppy/gsp.py,sha256=CUCC1W5GGlGbWkC_td0qDfnSJiuzbWoMapR0qciejw8,13800
4
- gsppy/utils.py,sha256=gOT3USxmC0MrBnSHOQ8avxghWmjQe59hS4jNQ3eiENQ,3363
5
- gsppy-2.3.0.dist-info/METADATA,sha256=bgEnT2H2FGQx_ha4Unqz40qVVu-IICaCkLJ0ppOwUgs,12941
6
- gsppy-2.3.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
7
- gsppy-2.3.0.dist-info/entry_points.txt,sha256=smvmcIWk424ARIGKOC_BM42hpT_SptKPcIeqs-8u8lM,41
8
- gsppy-2.3.0.dist-info/licenses/LICENSE,sha256=AlXanKSqFzo_o-87gp3Qw3XzbmnfxYy7O0xJOcQGWJo,1086
9
- gsppy-2.3.0.dist-info/RECORD,,
File without changes