gsppy 3.6.0__py3-none-any.whl → 4.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gsppy/__init__.py +47 -2
- gsppy/cli.py +316 -13
- gsppy/dataframe_adapters.py +458 -0
- gsppy/enums.py +49 -0
- gsppy/gsp.py +220 -15
- gsppy/sequence.py +371 -0
- gsppy/token_mapper.py +99 -0
- gsppy/utils.py +120 -0
- {gsppy-3.6.0.dist-info → gsppy-4.1.0.dist-info}/METADATA +405 -9
- gsppy-4.1.0.dist-info/RECORD +16 -0
- gsppy-3.6.0.dist-info/RECORD +0 -12
- {gsppy-3.6.0.dist-info → gsppy-4.1.0.dist-info}/WHEEL +0 -0
- {gsppy-3.6.0.dist-info → gsppy-4.1.0.dist-info}/entry_points.txt +0 -0
- {gsppy-3.6.0.dist-info → gsppy-4.1.0.dist-info}/licenses/LICENSE +0 -0
gsppy/gsp.py
CHANGED
|
@@ -85,10 +85,12 @@ Version:
|
|
|
85
85
|
- Current Version: 2.0
|
|
86
86
|
"""
|
|
87
87
|
|
|
88
|
+
from __future__ import annotations
|
|
89
|
+
|
|
88
90
|
import math
|
|
89
91
|
import logging
|
|
90
92
|
import multiprocessing as mp
|
|
91
|
-
from typing import Dict, List, Tuple, Union, Optional, cast
|
|
93
|
+
from typing import TYPE_CHECKING, Dict, List, Tuple, Union, Literal, Optional, cast, overload
|
|
92
94
|
from itertools import chain
|
|
93
95
|
from collections import Counter
|
|
94
96
|
|
|
@@ -100,8 +102,13 @@ from gsppy.utils import (
|
|
|
100
102
|
is_subsequence_in_list_with_time_constraints,
|
|
101
103
|
)
|
|
102
104
|
from gsppy.pruning import PruningStrategy, create_default_pruning_strategy
|
|
105
|
+
from gsppy.sequence import Sequence, dict_to_sequences
|
|
103
106
|
from gsppy.accelerate import support_counts as support_counts_accel
|
|
104
107
|
|
|
108
|
+
if TYPE_CHECKING:
|
|
109
|
+
import pandas as pd
|
|
110
|
+
import polars as pl
|
|
111
|
+
|
|
105
112
|
logger: logging.Logger = logging.getLogger(__name__)
|
|
106
113
|
|
|
107
114
|
|
|
@@ -126,21 +133,37 @@ class GSP:
|
|
|
126
133
|
|
|
127
134
|
def __init__(
|
|
128
135
|
self,
|
|
129
|
-
raw_transactions: Union[
|
|
136
|
+
raw_transactions: Union[
|
|
137
|
+
List[List[str]],
|
|
138
|
+
List[List[Tuple[str, float]]],
|
|
139
|
+
"pl.DataFrame",
|
|
140
|
+
"pl.LazyFrame",
|
|
141
|
+
"pd.DataFrame",
|
|
142
|
+
],
|
|
130
143
|
mingap: Optional[float] = None,
|
|
131
144
|
maxgap: Optional[float] = None,
|
|
132
145
|
maxspan: Optional[float] = None,
|
|
133
146
|
verbose: bool = False,
|
|
134
147
|
pruning_strategy: Optional[PruningStrategy] = None,
|
|
148
|
+
transaction_col: Optional[str] = None,
|
|
149
|
+
item_col: Optional[str] = None,
|
|
150
|
+
timestamp_col: Optional[str] = None,
|
|
151
|
+
sequence_col: Optional[str] = None,
|
|
135
152
|
):
|
|
136
153
|
"""
|
|
137
154
|
Initialize the GSP algorithm with raw transactional data.
|
|
138
155
|
|
|
139
156
|
Parameters:
|
|
140
|
-
raw_transactions (Union[List[List[str]], List[List[Tuple[str, float]]]]):
|
|
141
|
-
Input transaction dataset
|
|
142
|
-
- A list of items (e.g., [['A', 'B'], ['B', 'C', 'D']])
|
|
143
|
-
- A list of
|
|
157
|
+
raw_transactions (Union[List[List[str]], List[List[Tuple[str, float]]], DataFrame]):
|
|
158
|
+
Input transaction dataset. Accepts:
|
|
159
|
+
- A list of transactions where each transaction is a list of items (e.g., [['A', 'B'], ['B', 'C', 'D']])
|
|
160
|
+
- A list of transactions with timestamps (e.g., [[('A', 1.0), ('B', 2.0)]])
|
|
161
|
+
- A Polars DataFrame/LazyFrame or a Pandas DataFrame (requires 'gsppy[dataframe]' installation)
|
|
162
|
+
|
|
163
|
+
When using DataFrames, you must specify either:
|
|
164
|
+
- `sequence_col`: Column containing complete sequences (list format)
|
|
165
|
+
- `transaction_col` and `item_col`: Columns for grouped format
|
|
166
|
+
|
|
144
167
|
mingap (Optional[float]): Minimum time gap required between consecutive items in patterns.
|
|
145
168
|
maxgap (Optional[float]): Maximum time gap allowed between consecutive items in patterns.
|
|
146
169
|
maxspan (Optional[float]): Maximum time span from first to last item in patterns.
|
|
@@ -149,6 +172,10 @@ class GSP:
|
|
|
149
172
|
pruning_strategy (Optional[PruningStrategy]): Custom pruning strategy for candidate filtering.
|
|
150
173
|
If None, a default strategy is created based on
|
|
151
174
|
temporal constraints.
|
|
175
|
+
transaction_col (Optional[str]): DataFrame only - column name for transaction IDs (grouped format).
|
|
176
|
+
item_col (Optional[str]): DataFrame only - column name for items (grouped format).
|
|
177
|
+
timestamp_col (Optional[str]): DataFrame only - column name for timestamps.
|
|
178
|
+
sequence_col (Optional[str]): DataFrame only - column name containing sequences (sequence format).
|
|
152
179
|
|
|
153
180
|
Attributes Initialized:
|
|
154
181
|
- Processes the input raw transaction dataset.
|
|
@@ -161,6 +188,44 @@ class GSP:
|
|
|
161
188
|
ValueError: If the input transaction dataset is empty, contains
|
|
162
189
|
fewer than two transactions, or is not properly formatted.
|
|
163
190
|
Also raised if temporal constraints are invalid.
|
|
191
|
+
|
|
192
|
+
Examples:
|
|
193
|
+
Basic usage with lists:
|
|
194
|
+
|
|
195
|
+
```python
|
|
196
|
+
from gsppy.gsp import GSP
|
|
197
|
+
|
|
198
|
+
transactions = [["A", "B"], ["B", "C", "D"]]
|
|
199
|
+
gsp = GSP(transactions)
|
|
200
|
+
patterns = gsp.search(min_support=0.5)
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
Using Polars DataFrame (grouped format):
|
|
204
|
+
|
|
205
|
+
```python
|
|
206
|
+
import polars as pl
|
|
207
|
+
from gsppy.gsp import GSP
|
|
208
|
+
|
|
209
|
+
df = pl.DataFrame(
|
|
210
|
+
{
|
|
211
|
+
"transaction_id": [1, 1, 2, 2, 2],
|
|
212
|
+
"item": ["A", "B", "A", "C", "D"],
|
|
213
|
+
}
|
|
214
|
+
)
|
|
215
|
+
gsp = GSP(df, transaction_col="transaction_id", item_col="item")
|
|
216
|
+
patterns = gsp.search(min_support=0.5)
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
Using Pandas DataFrame (sequence format):
|
|
220
|
+
|
|
221
|
+
```python
|
|
222
|
+
import pandas as pd
|
|
223
|
+
from gsppy.gsp import GSP
|
|
224
|
+
|
|
225
|
+
df = pd.DataFrame({"sequence": [["A", "B"], ["A", "C", "D"]]})
|
|
226
|
+
gsp = GSP(df, sequence_col="sequence")
|
|
227
|
+
patterns = gsp.search(min_support=0.5)
|
|
228
|
+
```
|
|
164
229
|
"""
|
|
165
230
|
self.freq_patterns: List[Dict[Tuple[str, ...], int]] = []
|
|
166
231
|
self.mingap = mingap
|
|
@@ -170,7 +235,13 @@ class GSP:
|
|
|
170
235
|
self.pruning_strategy: PruningStrategy
|
|
171
236
|
self._configure_logging()
|
|
172
237
|
self._validate_temporal_constraints()
|
|
173
|
-
|
|
238
|
+
|
|
239
|
+
# Convert DataFrame to transaction list if necessary
|
|
240
|
+
transactions_to_process = self._convert_input_data(
|
|
241
|
+
raw_transactions, transaction_col, item_col, timestamp_col, sequence_col
|
|
242
|
+
)
|
|
243
|
+
|
|
244
|
+
self._pre_processing(transactions_to_process)
|
|
174
245
|
# Initialize default pruning strategy if none provided
|
|
175
246
|
if pruning_strategy is None:
|
|
176
247
|
self.pruning_strategy = create_default_pruning_strategy(
|
|
@@ -180,6 +251,78 @@ class GSP:
|
|
|
180
251
|
else:
|
|
181
252
|
self.pruning_strategy = pruning_strategy
|
|
182
253
|
|
|
254
|
+
def _convert_input_data(
|
|
255
|
+
self,
|
|
256
|
+
raw_transactions: Union[
|
|
257
|
+
List[List[str]],
|
|
258
|
+
List[List[Tuple[str, float]]],
|
|
259
|
+
"pl.DataFrame",
|
|
260
|
+
"pl.LazyFrame",
|
|
261
|
+
"pd.DataFrame",
|
|
262
|
+
],
|
|
263
|
+
transaction_col: Optional[str],
|
|
264
|
+
item_col: Optional[str],
|
|
265
|
+
timestamp_col: Optional[str],
|
|
266
|
+
sequence_col: Optional[str],
|
|
267
|
+
) -> Union[List[List[str]], List[List[Tuple[str, float]]]]:
|
|
268
|
+
"""
|
|
269
|
+
Convert input data to the expected transaction list format.
|
|
270
|
+
|
|
271
|
+
This method handles both traditional list inputs and DataFrame inputs
|
|
272
|
+
(Polars or Pandas). DataFrames are converted using the dataframe_adapters module.
|
|
273
|
+
|
|
274
|
+
Parameters:
|
|
275
|
+
raw_transactions: Input data (list or DataFrame)
|
|
276
|
+
transaction_col: Column name for transaction IDs (DataFrame grouped format)
|
|
277
|
+
item_col: Column name for items (DataFrame grouped format)
|
|
278
|
+
timestamp_col: Column name for timestamps (DataFrame)
|
|
279
|
+
sequence_col: Column name for sequences (DataFrame sequence format)
|
|
280
|
+
|
|
281
|
+
Returns:
|
|
282
|
+
Transaction list in the expected format
|
|
283
|
+
|
|
284
|
+
Raises:
|
|
285
|
+
ValueError: If DataFrame parameters are specified for non-DataFrame input
|
|
286
|
+
or if DataFrame conversion fails
|
|
287
|
+
"""
|
|
288
|
+
# Check if any DataFrame-specific parameters are provided
|
|
289
|
+
df_params_provided = any([transaction_col, item_col, timestamp_col, sequence_col])
|
|
290
|
+
|
|
291
|
+
# If it's a list, validate that no DataFrame parameters were provided
|
|
292
|
+
if isinstance(raw_transactions, list):
|
|
293
|
+
if df_params_provided:
|
|
294
|
+
raise ValueError(
|
|
295
|
+
"DataFrame parameters (transaction_col, item_col, timestamp_col, sequence_col) "
|
|
296
|
+
"cannot be used with list input"
|
|
297
|
+
)
|
|
298
|
+
return cast(Union[List[List[str]], List[List[Tuple[str, float]]]], raw_transactions) # pyright: ignore[reportUnnecessaryCast]
|
|
299
|
+
|
|
300
|
+
# Otherwise, try to convert as DataFrame
|
|
301
|
+
from gsppy.dataframe_adapters import DataFrameAdapterError, dataframe_to_transactions
|
|
302
|
+
|
|
303
|
+
try:
|
|
304
|
+
logger.debug("Converting DataFrame input to transaction list")
|
|
305
|
+
transactions = dataframe_to_transactions(
|
|
306
|
+
raw_transactions,
|
|
307
|
+
transaction_col=transaction_col,
|
|
308
|
+
item_col=item_col,
|
|
309
|
+
timestamp_col=timestamp_col,
|
|
310
|
+
sequence_col=sequence_col,
|
|
311
|
+
)
|
|
312
|
+
logger.debug("Successfully converted DataFrame with %d transactions", len(transactions))
|
|
313
|
+
return transactions
|
|
314
|
+
except DataFrameAdapterError as e:
|
|
315
|
+
msg = f"Failed to convert DataFrame input: {e}"
|
|
316
|
+
logger.error(msg)
|
|
317
|
+
raise ValueError(msg) from e
|
|
318
|
+
except ImportError as e:
|
|
319
|
+
msg = (
|
|
320
|
+
"DataFrame input detected but dataframe_adapters module failed to import. "
|
|
321
|
+
"Install DataFrame support with: pip install 'gsppy[dataframe]'"
|
|
322
|
+
)
|
|
323
|
+
logger.error(msg)
|
|
324
|
+
raise ValueError(msg) from e
|
|
325
|
+
|
|
183
326
|
def _configure_logging(self) -> None:
|
|
184
327
|
"""
|
|
185
328
|
Configure logging for the GSP instance based on verbosity setting.
|
|
@@ -262,8 +405,7 @@ class GSP:
|
|
|
262
405
|
# Validate temporal constraints are only used with timestamps
|
|
263
406
|
if (self.mingap is not None or self.maxgap is not None or self.maxspan is not None) and not self.has_timestamps:
|
|
264
407
|
logger.warning(
|
|
265
|
-
"Temporal constraints specified but transactions do not have timestamps. "
|
|
266
|
-
"Constraints will be ignored."
|
|
408
|
+
"Temporal constraints specified but transactions do not have timestamps. Constraints will be ignored."
|
|
267
409
|
)
|
|
268
410
|
# Clear temporal constraints since they cannot be applied
|
|
269
411
|
self.mingap = None
|
|
@@ -449,13 +591,37 @@ class GSP:
|
|
|
449
591
|
"""
|
|
450
592
|
logger.info("Run %d: %d candidates filtered to %d.", run, len(candidates), len(self.freq_patterns[run - 1]))
|
|
451
593
|
|
|
594
|
+
@overload
|
|
452
595
|
def search(
|
|
453
596
|
self,
|
|
454
597
|
min_support: float = 0.2,
|
|
455
598
|
max_k: Optional[int] = None,
|
|
456
599
|
backend: Optional[str] = None,
|
|
457
600
|
verbose: Optional[bool] = None,
|
|
458
|
-
|
|
601
|
+
*,
|
|
602
|
+
return_sequences: Literal[False] = False,
|
|
603
|
+
) -> List[Dict[Tuple[str, ...], int]]: ...
|
|
604
|
+
|
|
605
|
+
@overload
|
|
606
|
+
def search(
|
|
607
|
+
self,
|
|
608
|
+
min_support: float = 0.2,
|
|
609
|
+
max_k: Optional[int] = None,
|
|
610
|
+
backend: Optional[str] = None,
|
|
611
|
+
verbose: Optional[bool] = None,
|
|
612
|
+
*,
|
|
613
|
+
return_sequences: Literal[True],
|
|
614
|
+
) -> List[List[Sequence]]: ...
|
|
615
|
+
|
|
616
|
+
def search(
|
|
617
|
+
self,
|
|
618
|
+
min_support: float = 0.2,
|
|
619
|
+
max_k: Optional[int] = None,
|
|
620
|
+
backend: Optional[str] = None,
|
|
621
|
+
verbose: Optional[bool] = None,
|
|
622
|
+
*,
|
|
623
|
+
return_sequences: bool = False,
|
|
624
|
+
) -> Union[List[Dict[Tuple[str, ...], int]], List[List[Sequence]]]:
|
|
459
625
|
"""
|
|
460
626
|
Execute the Generalized Sequential Pattern (GSP) mining algorithm.
|
|
461
627
|
|
|
@@ -476,11 +642,20 @@ class GSP:
|
|
|
476
642
|
Note: temporal constraints always use Python backend.
|
|
477
643
|
verbose (Optional[bool]): Override instance verbosity setting for this search.
|
|
478
644
|
If None, uses the instance's verbose setting.
|
|
645
|
+
return_sequences (bool): If True, returns patterns as Sequence objects instead of
|
|
646
|
+
Dict[Tuple[str, ...], int]. Defaults to False for backward
|
|
647
|
+
compatibility. When True, returns List[List[Sequence]] where
|
|
648
|
+
each Sequence contains items, support count, and can be extended
|
|
649
|
+
with additional metadata.
|
|
479
650
|
|
|
480
651
|
Returns:
|
|
481
|
-
List[Dict[Tuple[str, ...], int]]:
|
|
482
|
-
|
|
483
|
-
|
|
652
|
+
Union[List[Dict[Tuple[str, ...], int]], List[List[Sequence]]]:
|
|
653
|
+
If return_sequences is False (default):
|
|
654
|
+
A list of dictionaries containing frequent patterns at each k-sequence level,
|
|
655
|
+
with patterns as keys and their support counts as values.
|
|
656
|
+
If return_sequences is True:
|
|
657
|
+
A list of lists containing Sequence objects at each k-sequence level,
|
|
658
|
+
where each Sequence encapsulates the pattern items and support count.
|
|
484
659
|
|
|
485
660
|
Raises:
|
|
486
661
|
ValueError: If the minimum support threshold is not in the range `(0.0, 1.0]`.
|
|
@@ -491,7 +666,7 @@ class GSP:
|
|
|
491
666
|
- Status updates for each iteration until the algorithm terminates.
|
|
492
667
|
|
|
493
668
|
Examples:
|
|
494
|
-
Basic usage without temporal constraints:
|
|
669
|
+
Basic usage without temporal constraints (default tuple-based):
|
|
495
670
|
|
|
496
671
|
```python
|
|
497
672
|
from gsppy.gsp import GSP
|
|
@@ -504,6 +679,28 @@ class GSP:
|
|
|
504
679
|
|
|
505
680
|
gsp = GSP(transactions)
|
|
506
681
|
patterns = gsp.search(min_support=0.3)
|
|
682
|
+
# Returns: [{('Bread',): 4, ('Milk',): 4, ...}, {('Bread', 'Milk'): 3, ...}, ...]
|
|
683
|
+
```
|
|
684
|
+
|
|
685
|
+
Using Sequence objects for richer pattern representation:
|
|
686
|
+
|
|
687
|
+
```python
|
|
688
|
+
from gsppy.gsp import GSP
|
|
689
|
+
|
|
690
|
+
transactions = [
|
|
691
|
+
["Bread", "Milk"],
|
|
692
|
+
["Bread", "Diaper", "Beer", "Eggs"],
|
|
693
|
+
["Milk", "Diaper", "Beer", "Coke"],
|
|
694
|
+
]
|
|
695
|
+
|
|
696
|
+
gsp = GSP(transactions)
|
|
697
|
+
patterns = gsp.search(min_support=0.3, return_sequences=True)
|
|
698
|
+
# Returns: [[Sequence(('Bread',), support=2), Sequence(('Milk',), support=2), ...], ...]
|
|
699
|
+
|
|
700
|
+
# Access pattern details
|
|
701
|
+
for level_patterns in patterns:
|
|
702
|
+
for seq in level_patterns:
|
|
703
|
+
print(f"Pattern: {seq.items}, Support: {seq.support}")
|
|
507
704
|
```
|
|
508
705
|
|
|
509
706
|
Usage with temporal constraints (requires timestamped transactions):
|
|
@@ -541,6 +738,9 @@ class GSP:
|
|
|
541
738
|
f"Using temporal constraints: mingap={self.mingap}, maxgap={self.maxgap}, maxspan={self.maxspan}"
|
|
542
739
|
)
|
|
543
740
|
|
|
741
|
+
# Clear freq_patterns for this search (allow reusing the GSP instance)
|
|
742
|
+
self.freq_patterns = []
|
|
743
|
+
|
|
544
744
|
# Convert fractional support to absolute count (ceil to preserve threshold semantics)
|
|
545
745
|
abs_min_support = int(math.ceil(len(self.transactions) * float(min_support)))
|
|
546
746
|
|
|
@@ -588,4 +788,9 @@ class GSP:
|
|
|
588
788
|
self.verbose = original_verbose
|
|
589
789
|
self._configure_logging()
|
|
590
790
|
|
|
591
|
-
|
|
791
|
+
# Return results in the requested format
|
|
792
|
+
result = self.freq_patterns[:-1]
|
|
793
|
+
if return_sequences:
|
|
794
|
+
# Convert Dict[Tuple[str, ...], int] to List[Sequence] for each level
|
|
795
|
+
return [dict_to_sequences(level_patterns) for level_patterns in result]
|
|
796
|
+
return result
|