gsppy 3.6.0__py3-none-any.whl → 4.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gsppy/gsp.py CHANGED
@@ -85,10 +85,12 @@ Version:
85
85
  - Current Version: 2.0
86
86
  """
87
87
 
88
+ from __future__ import annotations
89
+
88
90
  import math
89
91
  import logging
90
92
  import multiprocessing as mp
91
- from typing import Dict, List, Tuple, Union, Optional, cast
93
+ from typing import TYPE_CHECKING, Dict, List, Tuple, Union, Literal, Optional, cast, overload
92
94
  from itertools import chain
93
95
  from collections import Counter
94
96
 
@@ -100,8 +102,13 @@ from gsppy.utils import (
100
102
  is_subsequence_in_list_with_time_constraints,
101
103
  )
102
104
  from gsppy.pruning import PruningStrategy, create_default_pruning_strategy
105
+ from gsppy.sequence import Sequence, dict_to_sequences
103
106
  from gsppy.accelerate import support_counts as support_counts_accel
104
107
 
108
+ if TYPE_CHECKING:
109
+ import pandas as pd
110
+ import polars as pl
111
+
105
112
  logger: logging.Logger = logging.getLogger(__name__)
106
113
 
107
114
 
@@ -126,21 +133,37 @@ class GSP:
126
133
 
127
134
  def __init__(
128
135
  self,
129
- raw_transactions: Union[List[List[str]], List[List[Tuple[str, float]]]],
136
+ raw_transactions: Union[
137
+ List[List[str]],
138
+ List[List[Tuple[str, float]]],
139
+ "pl.DataFrame",
140
+ "pl.LazyFrame",
141
+ "pd.DataFrame",
142
+ ],
130
143
  mingap: Optional[float] = None,
131
144
  maxgap: Optional[float] = None,
132
145
  maxspan: Optional[float] = None,
133
146
  verbose: bool = False,
134
147
  pruning_strategy: Optional[PruningStrategy] = None,
148
+ transaction_col: Optional[str] = None,
149
+ item_col: Optional[str] = None,
150
+ timestamp_col: Optional[str] = None,
151
+ sequence_col: Optional[str] = None,
135
152
  ):
136
153
  """
137
154
  Initialize the GSP algorithm with raw transactional data.
138
155
 
139
156
  Parameters:
140
- raw_transactions (Union[List[List[str]], List[List[Tuple[str, float]]]]):
141
- Input transaction dataset where each transaction is either:
142
- - A list of items (e.g., [['A', 'B'], ['B', 'C', 'D']])
143
- - A list of (item, timestamp) tuples (e.g., [[('A', 1.0), ('B', 2.0)]])
157
+ raw_transactions (Union[List[List[str]], List[List[Tuple[str, float]]], DataFrame]):
158
+ Input transaction dataset. Accepts:
159
+ - A list of transactions where each transaction is a list of items (e.g., [['A', 'B'], ['B', 'C', 'D']])
160
+ - A list of transactions with timestamps (e.g., [[('A', 1.0), ('B', 2.0)]])
161
+ - A Polars DataFrame/LazyFrame or a Pandas DataFrame (requires 'gsppy[dataframe]' installation)
162
+
163
+ When using DataFrames, you must specify either:
164
+ - `sequence_col`: Column containing complete sequences (list format)
165
+ - `transaction_col` and `item_col`: Columns for grouped format
166
+
144
167
  mingap (Optional[float]): Minimum time gap required between consecutive items in patterns.
145
168
  maxgap (Optional[float]): Maximum time gap allowed between consecutive items in patterns.
146
169
  maxspan (Optional[float]): Maximum time span from first to last item in patterns.
@@ -149,6 +172,10 @@ class GSP:
149
172
  pruning_strategy (Optional[PruningStrategy]): Custom pruning strategy for candidate filtering.
150
173
  If None, a default strategy is created based on
151
174
  temporal constraints.
175
+ transaction_col (Optional[str]): DataFrame only - column name for transaction IDs (grouped format).
176
+ item_col (Optional[str]): DataFrame only - column name for items (grouped format).
177
+ timestamp_col (Optional[str]): DataFrame only - column name for timestamps.
178
+ sequence_col (Optional[str]): DataFrame only - column name containing sequences (sequence format).
152
179
 
153
180
  Attributes Initialized:
154
181
  - Processes the input raw transaction dataset.
@@ -161,6 +188,44 @@ class GSP:
161
188
  ValueError: If the input transaction dataset is empty, contains
162
189
  fewer than two transactions, or is not properly formatted.
163
190
  Also raised if temporal constraints are invalid.
191
+
192
+ Examples:
193
+ Basic usage with lists:
194
+
195
+ ```python
196
+ from gsppy.gsp import GSP
197
+
198
+ transactions = [["A", "B"], ["B", "C", "D"]]
199
+ gsp = GSP(transactions)
200
+ patterns = gsp.search(min_support=0.5)
201
+ ```
202
+
203
+ Using Polars DataFrame (grouped format):
204
+
205
+ ```python
206
+ import polars as pl
207
+ from gsppy.gsp import GSP
208
+
209
+ df = pl.DataFrame(
210
+ {
211
+ "transaction_id": [1, 1, 2, 2, 2],
212
+ "item": ["A", "B", "A", "C", "D"],
213
+ }
214
+ )
215
+ gsp = GSP(df, transaction_col="transaction_id", item_col="item")
216
+ patterns = gsp.search(min_support=0.5)
217
+ ```
218
+
219
+ Using Pandas DataFrame (sequence format):
220
+
221
+ ```python
222
+ import pandas as pd
223
+ from gsppy.gsp import GSP
224
+
225
+ df = pd.DataFrame({"sequence": [["A", "B"], ["A", "C", "D"]]})
226
+ gsp = GSP(df, sequence_col="sequence")
227
+ patterns = gsp.search(min_support=0.5)
228
+ ```
164
229
  """
165
230
  self.freq_patterns: List[Dict[Tuple[str, ...], int]] = []
166
231
  self.mingap = mingap
@@ -170,7 +235,13 @@ class GSP:
170
235
  self.pruning_strategy: PruningStrategy
171
236
  self._configure_logging()
172
237
  self._validate_temporal_constraints()
173
- self._pre_processing(raw_transactions)
238
+
239
+ # Convert DataFrame to transaction list if necessary
240
+ transactions_to_process = self._convert_input_data(
241
+ raw_transactions, transaction_col, item_col, timestamp_col, sequence_col
242
+ )
243
+
244
+ self._pre_processing(transactions_to_process)
174
245
  # Initialize default pruning strategy if none provided
175
246
  if pruning_strategy is None:
176
247
  self.pruning_strategy = create_default_pruning_strategy(
@@ -180,6 +251,78 @@ class GSP:
180
251
  else:
181
252
  self.pruning_strategy = pruning_strategy
182
253
 
254
+ def _convert_input_data(
255
+ self,
256
+ raw_transactions: Union[
257
+ List[List[str]],
258
+ List[List[Tuple[str, float]]],
259
+ "pl.DataFrame",
260
+ "pl.LazyFrame",
261
+ "pd.DataFrame",
262
+ ],
263
+ transaction_col: Optional[str],
264
+ item_col: Optional[str],
265
+ timestamp_col: Optional[str],
266
+ sequence_col: Optional[str],
267
+ ) -> Union[List[List[str]], List[List[Tuple[str, float]]]]:
268
+ """
269
+ Convert input data to the expected transaction list format.
270
+
271
+ This method handles both traditional list inputs and DataFrame inputs
272
+ (Polars or Pandas). DataFrames are converted using the dataframe_adapters module.
273
+
274
+ Parameters:
275
+ raw_transactions: Input data (list or DataFrame)
276
+ transaction_col: Column name for transaction IDs (DataFrame grouped format)
277
+ item_col: Column name for items (DataFrame grouped format)
278
+ timestamp_col: Column name for timestamps (DataFrame)
279
+ sequence_col: Column name for sequences (DataFrame sequence format)
280
+
281
+ Returns:
282
+ Transaction list in the expected format
283
+
284
+ Raises:
285
+ ValueError: If DataFrame parameters are specified for non-DataFrame input
286
+ or if DataFrame conversion fails
287
+ """
288
+ # Check if any DataFrame-specific parameters are provided
289
+ df_params_provided = any([transaction_col, item_col, timestamp_col, sequence_col])
290
+
291
+ # If it's a list, validate that no DataFrame parameters were provided
292
+ if isinstance(raw_transactions, list):
293
+ if df_params_provided:
294
+ raise ValueError(
295
+ "DataFrame parameters (transaction_col, item_col, timestamp_col, sequence_col) "
296
+ "cannot be used with list input"
297
+ )
298
+ return cast(Union[List[List[str]], List[List[Tuple[str, float]]]], raw_transactions) # pyright: ignore[reportUnnecessaryCast]
299
+
300
+ # Otherwise, try to convert as DataFrame
301
+ from gsppy.dataframe_adapters import DataFrameAdapterError, dataframe_to_transactions
302
+
303
+ try:
304
+ logger.debug("Converting DataFrame input to transaction list")
305
+ transactions = dataframe_to_transactions(
306
+ raw_transactions,
307
+ transaction_col=transaction_col,
308
+ item_col=item_col,
309
+ timestamp_col=timestamp_col,
310
+ sequence_col=sequence_col,
311
+ )
312
+ logger.debug("Successfully converted DataFrame with %d transactions", len(transactions))
313
+ return transactions
314
+ except DataFrameAdapterError as e:
315
+ msg = f"Failed to convert DataFrame input: {e}"
316
+ logger.error(msg)
317
+ raise ValueError(msg) from e
318
+ except ImportError as e:
319
+ msg = (
320
+ "DataFrame input detected but dataframe_adapters module failed to import. "
321
+ "Install DataFrame support with: pip install 'gsppy[dataframe]'"
322
+ )
323
+ logger.error(msg)
324
+ raise ValueError(msg) from e
325
+
183
326
  def _configure_logging(self) -> None:
184
327
  """
185
328
  Configure logging for the GSP instance based on verbosity setting.
@@ -262,8 +405,7 @@ class GSP:
262
405
  # Validate temporal constraints are only used with timestamps
263
406
  if (self.mingap is not None or self.maxgap is not None or self.maxspan is not None) and not self.has_timestamps:
264
407
  logger.warning(
265
- "Temporal constraints specified but transactions do not have timestamps. "
266
- "Constraints will be ignored."
408
+ "Temporal constraints specified but transactions do not have timestamps. Constraints will be ignored."
267
409
  )
268
410
  # Clear temporal constraints since they cannot be applied
269
411
  self.mingap = None
@@ -449,13 +591,37 @@ class GSP:
449
591
  """
450
592
  logger.info("Run %d: %d candidates filtered to %d.", run, len(candidates), len(self.freq_patterns[run - 1]))
451
593
 
594
+ @overload
452
595
  def search(
453
596
  self,
454
597
  min_support: float = 0.2,
455
598
  max_k: Optional[int] = None,
456
599
  backend: Optional[str] = None,
457
600
  verbose: Optional[bool] = None,
458
- ) -> List[Dict[Tuple[str, ...], int]]:
601
+ *,
602
+ return_sequences: Literal[False] = False,
603
+ ) -> List[Dict[Tuple[str, ...], int]]: ...
604
+
605
+ @overload
606
+ def search(
607
+ self,
608
+ min_support: float = 0.2,
609
+ max_k: Optional[int] = None,
610
+ backend: Optional[str] = None,
611
+ verbose: Optional[bool] = None,
612
+ *,
613
+ return_sequences: Literal[True],
614
+ ) -> List[List[Sequence]]: ...
615
+
616
+ def search(
617
+ self,
618
+ min_support: float = 0.2,
619
+ max_k: Optional[int] = None,
620
+ backend: Optional[str] = None,
621
+ verbose: Optional[bool] = None,
622
+ *,
623
+ return_sequences: bool = False,
624
+ ) -> Union[List[Dict[Tuple[str, ...], int]], List[List[Sequence]]]:
459
625
  """
460
626
  Execute the Generalized Sequential Pattern (GSP) mining algorithm.
461
627
 
@@ -476,11 +642,20 @@ class GSP:
476
642
  Note: temporal constraints always use Python backend.
477
643
  verbose (Optional[bool]): Override instance verbosity setting for this search.
478
644
  If None, uses the instance's verbose setting.
645
+ return_sequences (bool): If True, returns patterns as Sequence objects instead of
646
+ Dict[Tuple[str, ...], int]. Defaults to False for backward
647
+ compatibility. When True, returns List[List[Sequence]] where
648
+ each Sequence contains items, support count, and can be extended
649
+ with additional metadata.
479
650
 
480
651
  Returns:
481
- List[Dict[Tuple[str, ...], int]]: A list of dictionaries containing frequent patterns
482
- at each k-sequence level, with patterns as keys
483
- and their support counts as values.
652
+ Union[List[Dict[Tuple[str, ...], int]], List[List[Sequence]]]:
653
+ If return_sequences is False (default):
654
+ A list of dictionaries containing frequent patterns at each k-sequence level,
655
+ with patterns as keys and their support counts as values.
656
+ If return_sequences is True:
657
+ A list of lists containing Sequence objects at each k-sequence level,
658
+ where each Sequence encapsulates the pattern items and support count.
484
659
 
485
660
  Raises:
486
661
  ValueError: If the minimum support threshold is not in the range `(0.0, 1.0]`.
@@ -491,7 +666,7 @@ class GSP:
491
666
  - Status updates for each iteration until the algorithm terminates.
492
667
 
493
668
  Examples:
494
- Basic usage without temporal constraints:
669
+ Basic usage without temporal constraints (default tuple-based):
495
670
 
496
671
  ```python
497
672
  from gsppy.gsp import GSP
@@ -504,6 +679,28 @@ class GSP:
504
679
 
505
680
  gsp = GSP(transactions)
506
681
  patterns = gsp.search(min_support=0.3)
682
+ # Returns: [{('Bread',): 4, ('Milk',): 4, ...}, {('Bread', 'Milk'): 3, ...}, ...]
683
+ ```
684
+
685
+ Using Sequence objects for richer pattern representation:
686
+
687
+ ```python
688
+ from gsppy.gsp import GSP
689
+
690
+ transactions = [
691
+ ["Bread", "Milk"],
692
+ ["Bread", "Diaper", "Beer", "Eggs"],
693
+ ["Milk", "Diaper", "Beer", "Coke"],
694
+ ]
695
+
696
+ gsp = GSP(transactions)
697
+ patterns = gsp.search(min_support=0.3, return_sequences=True)
698
+ # Returns: [[Sequence(('Bread',), support=2), Sequence(('Milk',), support=2), ...], ...]
699
+
700
+ # Access pattern details
701
+ for level_patterns in patterns:
702
+ for seq in level_patterns:
703
+ print(f"Pattern: {seq.items}, Support: {seq.support}")
507
704
  ```
508
705
 
509
706
  Usage with temporal constraints (requires timestamped transactions):
@@ -541,6 +738,9 @@ class GSP:
541
738
  f"Using temporal constraints: mingap={self.mingap}, maxgap={self.maxgap}, maxspan={self.maxspan}"
542
739
  )
543
740
 
741
+ # Clear freq_patterns for this search (allow reusing the GSP instance)
742
+ self.freq_patterns = []
743
+
544
744
  # Convert fractional support to absolute count (ceil to preserve threshold semantics)
545
745
  abs_min_support = int(math.ceil(len(self.transactions) * float(min_support)))
546
746
 
@@ -588,4 +788,9 @@ class GSP:
588
788
  self.verbose = original_verbose
589
789
  self._configure_logging()
590
790
 
591
- return self.freq_patterns[:-1]
791
+ # Return results in the requested format
792
+ result = self.freq_patterns[:-1]
793
+ if return_sequences:
794
+ # Convert Dict[Tuple[str, ...], int] to List[Sequence] for each level
795
+ return [dict_to_sequences(level_patterns) for level_patterns in result]
796
+ return result