PyPI - gsppy - Versions diffs - 3.6.0__py3-none-any.whl → 4.0.0__py3-none-any.whl - Mend

gsppy 3.6.0__py3-none-any.whl → 4.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13)

gsppy/__init__.py +37 -2
gsppy/cli.py +314 -11
gsppy/dataframe_adapters.py +458 -0
gsppy/enums.py +49 -0
gsppy/gsp.py +150 -9
gsppy/token_mapper.py +99 -0
gsppy/utils.py +120 -0
{gsppy-3.6.0.dist-info → gsppy-4.0.0.dist-info}/METADATA +329 -9
gsppy-4.0.0.dist-info/RECORD +15 -0
gsppy-3.6.0.dist-info/RECORD +0 -12
{gsppy-3.6.0.dist-info → gsppy-4.0.0.dist-info}/WHEEL +0 -0
{gsppy-3.6.0.dist-info → gsppy-4.0.0.dist-info}/entry_points.txt +0 -0
{gsppy-3.6.0.dist-info → gsppy-4.0.0.dist-info}/licenses/LICENSE +0 -0

gsppy/__init__.py CHANGED Viewed

@@ -10,17 +10,37 @@ from gsppy.cli import (
     setup_logging,
     detect_and_read_file,
     read_transactions_from_csv,
+    read_transactions_from_spm,
     read_transactions_from_json,
+    read_transactions_from_arrow,
+    read_transactions_from_parquet,
 )
 from gsppy.gsp import GSP
 from gsppy.pruning import (
+    CombinedPruning,
     PruningStrategy,
     SupportBasedPruning,
-    FrequencyBasedPruning,
     TemporalAwarePruning,
-    CombinedPruning,
+    FrequencyBasedPruning,
     create_default_pruning_strategy,
 )
+from gsppy.token_mapper import TokenMapper
+# DataFrame adapters are optional - import only if dependencies are available
+try:
+    from gsppy.dataframe_adapters import (
+        DataFrameAdapterError,
+        pandas_to_transactions,
+        polars_to_transactions,
+        dataframe_to_transactions,
+    )
+except ImportError:
+    DataFrameAdapterError = None  # type: ignore
+    pandas_to_transactions = None  # type: ignore
+    polars_to_transactions = None  # type: ignore
+    dataframe_to_transactions = None  # type: ignore
+_DATAFRAME_AVAILABLE = DataFrameAdapterError is not None
 try:
     __version__ = importlib_metadata.version("gsppy")
@@ -32,6 +52,9 @@ __all__ = [
     "detect_and_read_file",
     "read_transactions_from_csv",
     "read_transactions_from_json",
+    "read_transactions_from_parquet",
+    "read_transactions_from_arrow",
+    "read_transactions_from_spm",
     "setup_logging",
     "__version__",
     "PruningStrategy",
@@ -40,4 +63,16 @@ __all__ = [
     "TemporalAwarePruning",
     "CombinedPruning",
     "create_default_pruning_strategy",
+    "TokenMapper",
 ]
+# Add DataFrame adapters to __all__ if available
+if _DATAFRAME_AVAILABLE:
+    __all__.extend(
+        [
+            "dataframe_to_transactions",
+            "polars_to_transactions",
+            "pandas_to_transactions",
+            "DataFrameAdapterError",
+        ]
+    )

gsppy/cli.py CHANGED Viewed

@@ -28,6 +28,8 @@ This CLI empowers users to perform sequential pattern mining on transactional da
 a simple command-line interface.
 """
+from __future__ import annotations
 import os
 import csv
 import sys
@@ -38,21 +40,29 @@ from typing import Any, Dict, List, Tuple, Union, Optional, cast
 import click
 from gsppy.gsp import GSP
+from gsppy.enums import (
+    ARROW_EXTENSIONS,
+    PARQUET_EXTENSIONS,
+    DATAFRAME_EXTENSIONS,
+    SUPPORTED_EXTENSIONS_MESSAGE,
+    FileFormat,
+    FileExtension,
+)
 from gsppy.utils import has_timestamps
 def setup_logging(verbose: bool) -> None:
     """
     Configure logging with standardized format based on verbosity level.
     When verbose is enabled, provides detailed structured logging with:
     - Timestamps (ISO 8601 format)
     - Log levels
     - Process ID for traceability
     - Module context
     When verbose is disabled, uses simple format with just the message.
     Parameters:
         verbose: Whether to enable verbose logging with detailed formatting.
     """
@@ -60,7 +70,7 @@ def setup_logging(verbose: bool) -> None:
     root_logger = logging.getLogger()
     for handler in root_logger.handlers[:]:
         root_logger.removeHandler(handler)
     if verbose:
         # Detailed format with timestamps, levels, PID, and context for verbose mode
         log_format = "%(asctime)s | %(levelname)-8s | PID:%(process)d | %(name)s | %(message)s"
@@ -71,7 +81,7 @@ def setup_logging(verbose: bool) -> None:
         log_format = "%(message)s"
         date_format = None
         log_level = logging.INFO
     # Configure logging with the appropriate format
     logging.basicConfig(
         level=log_level,
@@ -186,9 +196,39 @@ def read_transactions_from_csv(file_path: str) -> List[List[str]]:
         raise ValueError(msg) from e
+def read_transactions_from_spm(file_path: str) -> List[List[str]]:
+    """
+    Read transactions from an SPM/GSP format file.
+    The SPM/GSP format uses delimiters:
+    - `-1`: End of element (item set)
+    - `-2`: End of sequence (transaction)
+    Parameters:
+        file_path (str): Path to the file containing transactions.
+    Returns:
+        List[List[str]]: Parsed transactions from the file.
+    Raises:
+        ValueError: If the file cannot be read or contains invalid data.
+    """
+    try:
+        from gsppy.utils import read_transactions_from_spm as read_spm
+        return cast(List[List[str]], read_spm(file_path, return_mappings=False))
+    except Exception as e:
+        msg = f"Error reading transaction data from SPM file '{file_path}': {e}"
+        logging.error(msg)
+        raise ValueError(msg) from e
 def detect_and_read_file(file_path: str) -> Union[List[List[str]], List[List[Tuple[str, float]]]]:
     """
-    Detect file format (CSV or JSON) and read transactions.
+    Detect file format (CSV, JSON, Parquet, Arrow) and read transactions.
+    Supports traditional formats (CSV, JSON) and modern DataFrame formats (Parquet, Arrow).
+    For DataFrame formats, requires 'gsppy[dataframe]' to be installed.
     Parameters:
         file_path (str): Path to the file containing transactions.
@@ -206,13 +246,200 @@ def detect_and_read_file(file_path: str) -> Union[List[List[str]], List[List[Tup
     _, file_extension = os.path.splitext(file_path)
     file_extension = file_extension.lower()
-    if file_extension == ".json":
+    if file_extension == FileExtension.JSON.value:
         return read_transactions_from_json(file_path)
-    if file_extension == ".csv":
+    if file_extension == FileExtension.CSV.value:
         return read_transactions_from_csv(file_path)
-    raise ValueError("Unsupported file format. Please provide a JSON or CSV file.")
+    if file_extension in PARQUET_EXTENSIONS:
+        return read_transactions_from_parquet(file_path)
+    if file_extension in ARROW_EXTENSIONS:
+        return read_transactions_from_arrow(file_path)
+    raise ValueError(SUPPORTED_EXTENSIONS_MESSAGE.format(extension=file_extension))
+def read_transactions_from_parquet(
+    file_path: str,
+    transaction_col: Optional[str] = None,
+    item_col: Optional[str] = None,
+    timestamp_col: Optional[str] = None,
+    sequence_col: Optional[str] = None,
+) -> Union[List[List[str]], List[List[Tuple[str, float]]]]:
+    """
+    Read transactions from a Parquet file using Polars.
+    Parameters:
+        file_path (str): Path to the Parquet file.
+        transaction_col (Optional[str]): Column name for transaction IDs (grouped format).
+        item_col (Optional[str]): Column name for items (grouped format).
+        timestamp_col (Optional[str]): Column name for timestamps.
+        sequence_col (Optional[str]): Column name containing sequences (sequence format).
+    Returns:
+        Union[List[List[str]], List[List[Tuple[str, float]]]]:
+            Parsed transactions from the file.
+    Raises:
+        ValueError: If the file cannot be read or Polars is not installed.
+    """
+    try:
+        import polars as pl
+        from gsppy.dataframe_adapters import polars_to_transactions
+    except ImportError as e:
+        raise ValueError("Parquet support requires Polars. Install with: pip install 'gsppy[dataframe]'") from e
+    try:
+        df: Any = pl.read_parquet(file_path)
+        return polars_to_transactions(
+            df,
+            transaction_col=transaction_col,
+            item_col=item_col,
+            timestamp_col=timestamp_col,
+            sequence_col=sequence_col,
+        )
+    except Exception as e:
+        msg = f"Error reading transaction data from Parquet file '{file_path}': {e}"
+        logging.error(msg)
+        raise ValueError(msg) from e
+def read_transactions_from_arrow(
+    file_path: str,
+    transaction_col: Optional[str] = None,
+    item_col: Optional[str] = None,
+    timestamp_col: Optional[str] = None,
+    sequence_col: Optional[str] = None,
+) -> Union[List[List[str]], List[List[Tuple[str, float]]]]:
+    """
+    Read transactions from an Arrow/Feather file using Polars.
+    Parameters:
+        file_path (str): Path to the Arrow/Feather file.
+        transaction_col (Optional[str]): Column name for transaction IDs (grouped format).
+        item_col (Optional[str]): Column name for items (grouped format).
+        timestamp_col (Optional[str]): Column name for timestamps.
+        sequence_col (Optional[str]): Column name containing sequences (sequence format).
+    Returns:
+        Union[List[List[str]], List[List[Tuple[str, float]]]]:
+            Parsed transactions from the file.
+    Raises:
+        ValueError: If the file cannot be read or Polars is not installed.
+    """
+    try:
+        import polars as pl
+        from gsppy.dataframe_adapters import polars_to_transactions
+    except ImportError as e:
+        raise ValueError("Arrow/Feather support requires Polars. Install with: pip install 'gsppy[dataframe]'") from e
+    try:
+        df: Any = pl.read_ipc(file_path)
+        return polars_to_transactions(
+            df,
+            transaction_col=transaction_col,
+            item_col=item_col,
+            timestamp_col=timestamp_col,
+            sequence_col=sequence_col,
+        )
+    except Exception as e:
+        msg = f"Error reading transaction data from Arrow file '{file_path}': {e}"
+        logging.error(msg)
+        raise ValueError(msg) from e
+def _load_dataframe_format(
+    file_path: str,
+    file_extension: str,
+    transaction_col: Optional[str],
+    item_col: Optional[str],
+    timestamp_col: Optional[str],
+    sequence_col: Optional[str],
+) -> Union[List[List[str]], List[List[Tuple[str, float]]]]:
+    """
+    Load transactions from DataFrame formats (Parquet/Arrow).
+    Parameters:
+        file_path: Path to the file
+        file_extension: File extension (lowercase)
+        transaction_col: Transaction ID column name
+        item_col: Item column name
+        timestamp_col: Timestamp column name
+        sequence_col: Sequence column name
+    Returns:
+        Loaded transactions
+    """
+    if file_extension in PARQUET_EXTENSIONS:
+        return read_transactions_from_parquet(
+            file_path,
+            transaction_col=transaction_col,
+            item_col=item_col,
+            timestamp_col=timestamp_col,
+            sequence_col=sequence_col,
+        )
+    else:  # Arrow/Feather
+        return read_transactions_from_arrow(
+            file_path,
+            transaction_col=transaction_col,
+            item_col=item_col,
+            timestamp_col=timestamp_col,
+            sequence_col=sequence_col,
+        )
+def _load_transactions_by_format(
+    file_path: str,
+    file_format: str,
+    file_extension: str,
+    is_dataframe_format: bool,
+    transaction_col: Optional[str],
+    item_col: Optional[str],
+    timestamp_col: Optional[str],
+    sequence_col: Optional[str],
+) -> Union[List[List[str]], List[List[Tuple[str, float]]]]:
+    """
+    Load transactions based on specified format.
+    Parameters:
+        file_path: Path to the file
+        file_format: Format string (lowercase)
+        file_extension: File extension (lowercase)
+        is_dataframe_format: Whether file is a DataFrame format
+        transaction_col: Transaction ID column name
+        item_col: Item column name
+        timestamp_col: Timestamp column name
+        sequence_col: Sequence column name
+    Returns:
+        Loaded transactions
+    Raises:
+        ValueError: If format is unknown
+    """
+    if file_format == FileFormat.SPM.value:
+        return read_transactions_from_spm(file_path)
+    elif file_format == FileFormat.JSON.value:
+        return read_transactions_from_json(file_path)
+    elif file_format == FileFormat.CSV.value:
+        return read_transactions_from_csv(file_path)
+    elif file_format in (FileFormat.PARQUET.value, FileFormat.ARROW.value):
+        return _load_dataframe_format(file_path, file_extension, transaction_col, item_col, timestamp_col, sequence_col)
+    elif file_format == FileFormat.AUTO.value:
+        # Auto-detect format
+        if is_dataframe_format:
+            return _load_dataframe_format(
+                file_path, file_extension, transaction_col, item_col, timestamp_col, sequence_col
+            )
+        else:
+            return detect_and_read_file(file_path)
+    else:
+        raise ValueError(f"Unknown format: {file_format}")
 # Click-based CLI
@@ -222,7 +449,7 @@ def detect_and_read_file(file_path: str) -> Union[List[List[str]], List[List[Tup
     "file_path",
     required=True,
     type=click.Path(exists=True),
-    help="Path to a JSON or CSV file containing transactions.",
+    help="Path to a transaction file (JSON, CSV, SPM, Parquet, or Arrow format).",
 )
 @click.option(
     "--min_support",
@@ -256,6 +483,37 @@ def detect_and_read_file(file_path: str) -> Union[List[List[str]], List[List[Tup
     default=None,
     help="Maximum time span from first to last item in patterns (requires timestamped transactions).",
 )
+@click.option(
+    "--transaction-col",
+    type=str,
+    default=None,
+    help="DataFrame: column name for transaction IDs (grouped format).",
+)
+@click.option(
+    "--item-col",
+    type=str,
+    default=None,
+    help="DataFrame: column name for items (grouped format).",
+)
+@click.option(
+    "--timestamp-col",
+    type=str,
+    default=None,
+    help="DataFrame: column name for timestamps.",
+)
+@click.option(
+    "--sequence-col",
+    type=str,
+    default=None,
+    help="DataFrame: column name containing sequences (sequence format).",
+)
+@click.option(
+    "--format",
+    type=click.Choice([fmt.value for fmt in FileFormat], case_sensitive=False),
+    default=FileFormat.AUTO.value,
+    show_default=True,
+    help="File format to use. 'auto' detects format from file extension.",
+)
 @click.option("--verbose", is_flag=True, help="Enable verbose output for debugging purposes.")
 def main(
     file_path: str,
@@ -264,11 +522,21 @@ def main(
     mingap: Optional[float],
     maxgap: Optional[float],
     maxspan: Optional[float],
+    transaction_col: Optional[str],
+    item_col: Optional[str],
+    timestamp_col: Optional[str],
+    sequence_col: Optional[str],
+    format: str,  # noqa: A002
     verbose: bool,
 ) -> None:
     """
     Run the GSP algorithm on transactional data from a file.
+    Supports multiple file formats:
+    - JSON/CSV/SPM: Traditional transaction formats
+    - Parquet/Arrow: Modern DataFrame formats (requires 'gsppy[dataframe]')
+    - Polars/Pandas DataFrames: Can be passed directly (requires 'gsppy[dataframe]')
     Supports both simple transactions (items only) and timestamped transactions
     (item-timestamp pairs) for temporal pattern mining.
@@ -285,12 +553,47 @@ def main(
         gsppy --file temporal_data.json --min_support 0.3 --maxgap 10
         gsppy --file events.json --min_support 0.5 --mingap 2 --maxgap 10 --maxspan 20
         ```
+        With Parquet files (grouped format):
+        ```bash
+        gsppy --file data.parquet --min_support 0.3 \
+              --transaction-col txn_id --item-col product
+        ```
+        With Arrow files (sequence format):
+        ```bash
+        gsppy --file sequences.arrow --min_support 0.3 \
+              --sequence-col items
+        ```
+        With SPM format files:
+        ```bash
+        gsppy --file data.txt --format spm --min_support 0.3
+        ```
     """
     setup_logging(verbose)
+    # Detect file extension to determine if DataFrame column params are needed
+    _, file_extension = os.path.splitext(file_path)
+    file_extension = file_extension.lower()
+    is_dataframe_format = file_extension in DATAFRAME_EXTENSIONS
     # Automatically detect and load transactions
     try:
-        transactions = detect_and_read_file(file_path)
+        file_format = format.lower()
+        transactions = _load_transactions_by_format(
+            file_path,
+            file_format,
+            file_extension,
+            is_dataframe_format,
+            transaction_col,
+            item_col,
+            timestamp_col,
+            sequence_col,
+        )
     except ValueError as e:
         logger.error(f"Error: {e}")
         sys.exit(1)

gsppy 3.6.0__py3-none-any.whl → 4.0.0__py3-none-any.whl

gsppy 3.6.0__py3-none-any.whl → 4.0.0__py3-none-any.whl