gsppy 3.3.0__py3-none-any.whl → 3.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gsppy/__init__.py +6 -1
- gsppy/accelerate.py +5 -2
- gsppy/cli.py +179 -25
- gsppy/gsp.py +200 -17
- gsppy/utils.py +285 -5
- {gsppy-3.3.0.dist-info → gsppy-3.5.0.dist-info}/METADATA +199 -16
- gsppy-3.5.0.dist-info/RECORD +11 -0
- gsppy-3.3.0.dist-info/RECORD +0 -11
- {gsppy-3.3.0.dist-info → gsppy-3.5.0.dist-info}/WHEEL +0 -0
- {gsppy-3.3.0.dist-info → gsppy-3.5.0.dist-info}/entry_points.txt +0 -0
- {gsppy-3.3.0.dist-info → gsppy-3.5.0.dist-info}/licenses/LICENSE +0 -0
gsppy/__init__.py
CHANGED
|
@@ -6,7 +6,12 @@ implementation, CLI helpers for loading transactional data, and the package vers
|
|
|
6
6
|
|
|
7
7
|
from importlib import metadata as importlib_metadata
|
|
8
8
|
|
|
9
|
-
from gsppy.cli import
|
|
9
|
+
from gsppy.cli import (
|
|
10
|
+
setup_logging,
|
|
11
|
+
detect_and_read_file,
|
|
12
|
+
read_transactions_from_csv,
|
|
13
|
+
read_transactions_from_json,
|
|
14
|
+
)
|
|
10
15
|
from gsppy.gsp import GSP
|
|
11
16
|
|
|
12
17
|
try:
|
gsppy/accelerate.py
CHANGED
|
@@ -28,11 +28,14 @@ try: # pragma: no cover - optional dependency path
|
|
|
28
28
|
cp = cast(Any, _cp_mod)
|
|
29
29
|
|
|
30
30
|
try:
|
|
31
|
-
|
|
31
|
+
if cp is not None:
|
|
32
|
+
_gpu_available = cp.cuda.runtime.getDeviceCount() > 0
|
|
33
|
+
else:
|
|
34
|
+
_gpu_available = False
|
|
32
35
|
except Exception:
|
|
33
36
|
_gpu_available = False
|
|
34
37
|
except Exception: # pragma: no cover - optional dependency path
|
|
35
|
-
cp = None
|
|
38
|
+
cp = None
|
|
36
39
|
_gpu_available = False
|
|
37
40
|
|
|
38
41
|
# Simple per-process cache for encoded transactions keyed by the list object's id
|
gsppy/cli.py
CHANGED
|
@@ -33,49 +33,123 @@ import csv
|
|
|
33
33
|
import sys
|
|
34
34
|
import json
|
|
35
35
|
import logging
|
|
36
|
-
from typing import Dict, List, Tuple
|
|
36
|
+
from typing import Any, Dict, List, Tuple, Union, Optional, cast
|
|
37
37
|
|
|
38
38
|
import click
|
|
39
39
|
|
|
40
40
|
from gsppy.gsp import GSP
|
|
41
|
-
|
|
42
|
-
# Configure logging
|
|
43
|
-
logging.basicConfig(
|
|
44
|
-
level=logging.INFO,
|
|
45
|
-
format="%(message)s", # Simplified to keep CLI output clean
|
|
46
|
-
handlers=[logging.StreamHandler(sys.stdout)],
|
|
47
|
-
)
|
|
48
|
-
logger: logging.Logger = logging.getLogger(__name__)
|
|
41
|
+
from gsppy.utils import has_timestamps
|
|
49
42
|
|
|
50
43
|
|
|
51
44
|
def setup_logging(verbose: bool) -> None:
    """
    Configure root logging with a standardized format based on verbosity level.

    When verbose is enabled, records are emitted at DEBUG level with:
    - Timestamps (ISO 8601 format)
    - Log levels
    - Process ID for traceability
    - Module context

    When verbose is disabled, records are emitted at INFO level using a simple
    format with just the message.

    Handlers already attached to the root logger are removed *and closed*
    before the new stdout handler is installed, so repeated calls neither
    accumulate handlers nor leak the resources held by the old ones.

    Parameters:
        verbose: Whether to enable verbose logging with detailed formatting.
    """
    if verbose:
        # Detailed format with timestamps, levels, PID, and context for verbose mode
        log_format = "%(asctime)s | %(levelname)-8s | PID:%(process)d | %(name)s | %(message)s"
        date_format = "%Y-%m-%dT%H:%M:%S"
        log_level = logging.DEBUG
    else:
        # Simple format for default mode - just the message
        log_format = "%(message)s"
        date_format = None
        log_level = logging.INFO

    # force=True makes basicConfig remove and close every existing root handler
    # itself. Detaching them by hand first would skip the close() step, because
    # basicConfig would then find nothing left to clean up.
    logging.basicConfig(
        level=log_level,
        format=log_format,
        datefmt=date_format,
        handlers=[logging.StreamHandler(sys.stdout)],
        force=True,
    )
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
logger: logging.Logger = logging.getLogger(__name__)
|
|
60
86
|
|
|
61
87
|
|
|
62
|
-
def read_transactions_from_json(file_path: str) -> List[List[str]]:
|
|
88
|
+
def read_transactions_from_json(file_path: str) -> Union[List[List[str]], List[List[Tuple[str, float]]]]:
|
|
63
89
|
"""
|
|
64
90
|
Read transactions from a JSON file.
|
|
65
91
|
|
|
92
|
+
Supports both simple transactions and timestamped transactions:
|
|
93
|
+
- Simple: [["A", "B", "C"], ["D", "E"]]
|
|
94
|
+
- Timestamped: [[["A", 1], ["B", 3]], [["D", 2], ["E", 5]]]
|
|
95
|
+
where the first element is the item and the second element is the timestamp
|
|
96
|
+
|
|
66
97
|
Parameters:
|
|
67
98
|
file_path (str): Path to the file containing transactions.
|
|
68
99
|
|
|
69
100
|
Returns:
|
|
70
|
-
List[List]
|
|
101
|
+
Union[List[List[str]], List[List[Tuple[str, float]]]]:
|
|
102
|
+
Parsed transactions from the file. For timestamped data,
|
|
103
|
+
inner lists are converted to tuples (item, timestamp).
|
|
71
104
|
|
|
72
105
|
Raises:
|
|
73
106
|
ValueError: If the file cannot be read or does not contain valid JSON.
|
|
74
107
|
"""
|
|
75
108
|
try:
|
|
76
109
|
with open(file_path, "r", encoding="utf-8") as f:
|
|
77
|
-
|
|
78
|
-
|
|
110
|
+
raw_data: Any = json.load(f)
|
|
111
|
+
|
|
112
|
+
if not isinstance(raw_data, list):
|
|
113
|
+
raise ValueError("JSON must contain a top-level list of transactions.")
|
|
114
|
+
|
|
115
|
+
raw_transactions: List[List[Union[str, Tuple[str, float]]]] = cast(
|
|
116
|
+
List[List[Union[str, Tuple[str, float]]]], raw_data
|
|
117
|
+
)
|
|
118
|
+
|
|
119
|
+
# Check if this is timestamped data using the helper function.
|
|
120
|
+
# Use defensive checks to avoid errors on malformed data:
|
|
121
|
+
# - Find the first non-empty transaction instead of assuming index 0 is non-empty.
|
|
122
|
+
# - Normalize inner list pairs (from json.load) to tuples before calling has_timestamps.
|
|
123
|
+
first_non_empty_transaction: Optional[List[Union[str, Tuple[str, float]]]] = next(
|
|
124
|
+
(transaction for transaction in raw_transactions if transaction),
|
|
125
|
+
None,
|
|
126
|
+
)
|
|
127
|
+
|
|
128
|
+
is_timestamped = False
|
|
129
|
+
if first_non_empty_transaction is not None:
|
|
130
|
+
# Normalize to the exact input type expected by has_timestamps
|
|
131
|
+
normalized_first: List[Union[str, Tuple[str, float]]] = []
|
|
132
|
+
for item in first_non_empty_transaction:
|
|
133
|
+
if isinstance(item, list) and len(item) == 2:
|
|
134
|
+
normalized_first.append((str(item[0]), float(item[1])))
|
|
135
|
+
elif isinstance(item, tuple):
|
|
136
|
+
normalized_first.append(cast(Tuple[str, float], item))
|
|
137
|
+
else:
|
|
138
|
+
normalized_first.append(str(item))
|
|
139
|
+
|
|
140
|
+
is_timestamped = has_timestamps(normalized_first)
|
|
141
|
+
|
|
142
|
+
if is_timestamped:
|
|
143
|
+
# Convert timestamped data: [[["A", 1], ["B", 2]]] -> [[("A", 1), ("B", 2)]]
|
|
144
|
+
transactions: List[List[Tuple[str, float]]] = [
|
|
145
|
+
[cast(Tuple[str, float], tuple(item) if isinstance(item, list) else item) for item in transaction]
|
|
146
|
+
for transaction in raw_transactions
|
|
147
|
+
]
|
|
148
|
+
return transactions
|
|
149
|
+
|
|
150
|
+
# Simple transactions remain as-is (or invalid data passed through for GSP to validate)
|
|
151
|
+
simple_transactions: List[List[str]] = [[str(item) for item in transaction] for transaction in raw_transactions]
|
|
152
|
+
return simple_transactions
|
|
79
153
|
except Exception as e:
|
|
80
154
|
msg = f"Error reading transaction data from JSON file '{file_path}': {e}"
|
|
81
155
|
logging.error(msg)
|
|
@@ -112,7 +186,7 @@ def read_transactions_from_csv(file_path: str) -> List[List[str]]:
|
|
|
112
186
|
raise ValueError(msg) from e
|
|
113
187
|
|
|
114
188
|
|
|
115
|
-
def detect_and_read_file(file_path: str) -> List[List[str]]:
|
|
189
|
+
def detect_and_read_file(file_path: str) -> Union[List[List[str]], List[List[Tuple[str, float]]]]:
|
|
116
190
|
"""
|
|
117
191
|
Detect file format (CSV or JSON) and read transactions.
|
|
118
192
|
|
|
@@ -120,7 +194,8 @@ def detect_and_read_file(file_path: str) -> List[List[str]]:
|
|
|
120
194
|
file_path (str): Path to the file containing transactions.
|
|
121
195
|
|
|
122
196
|
Returns:
|
|
123
|
-
List[List]
|
|
197
|
+
Union[List[List[str]], List[List[Tuple[str, float]]]]:
|
|
198
|
+
Parsed transactions from the file.
|
|
124
199
|
|
|
125
200
|
Raises:
|
|
126
201
|
ValueError: If the file format is unsupported or reading fails.
|
|
@@ -163,10 +238,53 @@ def detect_and_read_file(file_path: str) -> List[List[str]]:
|
|
|
163
238
|
show_default=True,
|
|
164
239
|
help="Backend to use for support counting.",
|
|
165
240
|
)
|
|
241
|
+
@click.option(
|
|
242
|
+
"--mingap",
|
|
243
|
+
type=float,
|
|
244
|
+
default=None,
|
|
245
|
+
help="Minimum time gap required between consecutive items in patterns (requires timestamped transactions).",
|
|
246
|
+
)
|
|
247
|
+
@click.option(
|
|
248
|
+
"--maxgap",
|
|
249
|
+
type=float,
|
|
250
|
+
default=None,
|
|
251
|
+
help="Maximum time gap allowed between consecutive items in patterns (requires timestamped transactions).",
|
|
252
|
+
)
|
|
253
|
+
@click.option(
|
|
254
|
+
"--maxspan",
|
|
255
|
+
type=float,
|
|
256
|
+
default=None,
|
|
257
|
+
help="Maximum time span from first to last item in patterns (requires timestamped transactions).",
|
|
258
|
+
)
|
|
166
259
|
@click.option("--verbose", is_flag=True, help="Enable verbose output for debugging purposes.")
|
|
167
|
-
def main(
|
|
260
|
+
def main(
|
|
261
|
+
file_path: str,
|
|
262
|
+
min_support: float,
|
|
263
|
+
backend: str,
|
|
264
|
+
mingap: Optional[float],
|
|
265
|
+
maxgap: Optional[float],
|
|
266
|
+
maxspan: Optional[float],
|
|
267
|
+
verbose: bool,
|
|
268
|
+
) -> None:
|
|
168
269
|
"""
|
|
169
270
|
Run the GSP algorithm on transactional data from a file.
|
|
271
|
+
|
|
272
|
+
Supports both simple transactions (items only) and timestamped transactions
|
|
273
|
+
(item-timestamp pairs) for temporal pattern mining.
|
|
274
|
+
|
|
275
|
+
Examples:
|
|
276
|
+
Basic usage without temporal constraints:
|
|
277
|
+
|
|
278
|
+
```bash
|
|
279
|
+
gsppy --file transactions.json --min_support 0.3
|
|
280
|
+
```
|
|
281
|
+
|
|
282
|
+
With temporal constraints:
|
|
283
|
+
|
|
284
|
+
```bash
|
|
285
|
+
gsppy --file temporal_data.json --min_support 0.3 --maxgap 10
|
|
286
|
+
gsppy --file events.json --min_support 0.5 --mingap 2 --maxgap 10 --maxspan 20
|
|
287
|
+
```
|
|
170
288
|
"""
|
|
171
289
|
setup_logging(verbose)
|
|
172
290
|
|
|
@@ -177,10 +295,8 @@ def main(file_path: str, min_support: float, backend: str, verbose: bool) -> Non
|
|
|
177
295
|
logger.error(f"Error: {e}")
|
|
178
296
|
sys.exit(1)
|
|
179
297
|
|
|
180
|
-
#
|
|
181
|
-
|
|
182
|
-
logger.error("Error: min_support must be in the range (0.0, 1.0].")
|
|
183
|
-
sys.exit(1)
|
|
298
|
+
# Validate parameters
|
|
299
|
+
_validate_parameters(min_support, mingap, maxgap, maxspan)
|
|
184
300
|
|
|
185
301
|
# Select backend for acceleration layer
|
|
186
302
|
if backend and backend.lower() != "auto":
|
|
@@ -188,7 +304,7 @@ def main(file_path: str, min_support: float, backend: str, verbose: bool) -> Non
|
|
|
188
304
|
|
|
189
305
|
# Initialize and run GSP algorithm
|
|
190
306
|
try:
|
|
191
|
-
gsp = GSP(transactions)
|
|
307
|
+
gsp = GSP(transactions, mingap=mingap, maxgap=maxgap, maxspan=maxspan, verbose=verbose)
|
|
192
308
|
patterns: List[Dict[Tuple[str, ...], int]] = gsp.search(min_support=min_support)
|
|
193
309
|
logger.info("Frequent Patterns Found:")
|
|
194
310
|
for i, level in enumerate(patterns, start=1):
|
|
@@ -200,5 +316,43 @@ def main(file_path: str, min_support: float, backend: str, verbose: bool) -> Non
|
|
|
200
316
|
sys.exit(1)
|
|
201
317
|
|
|
202
318
|
|
|
319
|
+
def _validate_parameters(
    min_support: float,
    mingap: Optional[float],
    maxgap: Optional[float],
    maxspan: Optional[float],
) -> None:
    """
    Validate CLI input parameters for the GSP algorithm.

    Each failed check logs an error message through the module logger and
    terminates the process with exit status 1.

    Parameters:
        min_support: Minimum support threshold; must lie in (0.0, 1.0].
        mingap: Minimum time gap constraint; non-negative when given.
        maxgap: Maximum time gap constraint; non-negative when given.
        maxspan: Maximum time span constraint; non-negative when given.

    Raises:
        SystemExit: If validation fails.
    """
    # The chained comparison is False for NaN as well, so `--min_support nan`
    # is rejected here instead of slipping past two separate range tests.
    if not 0.0 < min_support <= 1.0:
        logger.error("Error: min_support must be in the range (0.0, 1.0].")
        sys.exit(1)

    # Validate temporal constraints
    if mingap is not None and mingap < 0:
        logger.error("Error: mingap must be non-negative.")
        sys.exit(1)
    if maxgap is not None and maxgap < 0:
        logger.error("Error: maxgap must be non-negative.")
        sys.exit(1)
    if maxspan is not None and maxspan < 0:
        logger.error("Error: maxspan must be non-negative.")
        sys.exit(1)
    if mingap is not None and maxgap is not None and mingap > maxgap:
        logger.error("Error: mingap cannot be greater than maxgap.")
        sys.exit(1)
|
|
355
|
+
|
|
356
|
+
|
|
203
357
|
if __name__ == "__main__":
|
|
204
358
|
main()
|
gsppy/gsp.py
CHANGED
|
@@ -88,11 +88,17 @@ Version:
|
|
|
88
88
|
import math
|
|
89
89
|
import logging
|
|
90
90
|
import multiprocessing as mp
|
|
91
|
-
from typing import Dict, List, Tuple, Optional
|
|
91
|
+
from typing import Dict, List, Tuple, Union, Optional, cast
|
|
92
92
|
from itertools import chain
|
|
93
93
|
from collections import Counter
|
|
94
94
|
|
|
95
|
-
from gsppy.utils import
|
|
95
|
+
from gsppy.utils import (
|
|
96
|
+
has_timestamps,
|
|
97
|
+
split_into_batches,
|
|
98
|
+
is_subsequence_in_list,
|
|
99
|
+
generate_candidates_from_previous,
|
|
100
|
+
is_subsequence_in_list_with_time_constraints,
|
|
101
|
+
)
|
|
96
102
|
from gsppy.accelerate import support_counts as support_counts_accel
|
|
97
103
|
|
|
98
104
|
logger: logging.Logger = logging.getLogger(__name__)
|
|
@@ -117,41 +123,98 @@ class GSP:
|
|
|
117
123
|
k-sequence for pattern generation.
|
|
118
124
|
"""
|
|
119
125
|
|
|
120
|
-
def __init__(
|
|
126
|
+
def __init__(
    self,
    raw_transactions: Union[List[List[str]], List[List[Tuple[str, float]]]],
    mingap: Optional[float] = None,
    maxgap: Optional[float] = None,
    maxspan: Optional[float] = None,
    verbose: bool = False,
) -> None:
    """
    Initialize the GSP algorithm with raw transactional data.

    Parameters:
        raw_transactions (Union[List[List[str]], List[List[Tuple[str, float]]]]):
            Input transaction dataset where each transaction is either:
            - A list of items (e.g., [['A', 'B'], ['B', 'C', 'D']])
            - A list of (item, timestamp) tuples (e.g., [[('A', 1.0), ('B', 2.0)]])
        mingap (Optional[float]): Minimum time gap required between consecutive items in patterns.
        maxgap (Optional[float]): Maximum time gap allowed between consecutive items in patterns.
        maxspan (Optional[float]): Maximum time span from first to last item in patterns.
        verbose (bool): Enable verbose logging output with detailed progress information.
            Default is False (minimal output).

    Attributes Initialized:
        - Processes the input raw transaction dataset.
        - Computes unique singleton candidates (`unique_candidates`).
        - Extracts the maximum transaction size (`max_size`) from the dataset for limiting
          the search space.
        - Stores temporal constraints for use during pattern mining.

    Side Effects:
        - Adjusts the level of this module's logger according to `verbose`
          (DEBUG when True, WARNING otherwise).

    Raises:
        ValueError: If the input transaction dataset is empty, contains
            fewer than two transactions, or is not properly formatted.
            Also raised if temporal constraints are invalid (a negative
            value, or mingap greater than maxgap).
    """
    self.freq_patterns: List[Dict[Tuple[str, ...], int]] = []
    self.mingap = mingap
    self.maxgap = maxgap
    self.maxspan = maxspan
    self.verbose = verbose
    # Logging is configured before validation and pre-processing so their
    # log calls already honour the requested verbosity.
    self._configure_logging()
    # Reject invalid constraints before processing the dataset.
    self._validate_temporal_constraints()
    self._pre_processing(raw_transactions)
|
|
140
168
|
|
|
141
|
-
def
|
|
169
|
+
def _configure_logging(self) -> None:
    """
    Apply the instance's verbosity setting to this module's logger.

    A verbose instance switches the module logger to DEBUG for detailed
    output; otherwise the logger is set to WARNING for minimal output.

    Only the module-level logger is touched. The root logger is left alone
    on purpose, to avoid global logging side effects (notably in
    multiprocessing environments).
    """
    level = logging.DEBUG if self.verbose else logging.WARNING
    logger.setLevel(level)
|
|
184
|
+
|
|
185
|
+
def _validate_temporal_constraints(self) -> None:
|
|
186
|
+
"""
|
|
187
|
+
Validate temporal constraint parameters.
|
|
188
|
+
|
|
189
|
+
Raises:
|
|
190
|
+
ValueError: If any temporal constraint is negative or if mingap > maxgap.
|
|
191
|
+
"""
|
|
192
|
+
if self.mingap is not None and self.mingap < 0:
|
|
193
|
+
raise ValueError("mingap must be non-negative")
|
|
194
|
+
if self.maxgap is not None and self.maxgap < 0:
|
|
195
|
+
raise ValueError("maxgap must be non-negative")
|
|
196
|
+
if self.maxspan is not None and self.maxspan < 0:
|
|
197
|
+
raise ValueError("maxspan must be non-negative")
|
|
198
|
+
if self.mingap is not None and self.maxgap is not None and self.mingap > self.maxgap:
|
|
199
|
+
raise ValueError("mingap cannot be greater than maxgap")
|
|
200
|
+
|
|
201
|
+
def _pre_processing(self, raw_transactions: Union[List[List[str]], List[List[Tuple[str, float]]]]) -> None:
|
|
142
202
|
"""
|
|
143
203
|
Validate and preprocess the input transactional dataset.
|
|
144
204
|
|
|
145
205
|
This method ensures that the dataset is formatted correctly and converts the transactions
|
|
146
206
|
into tuples while counting unique singleton candidates for initial support computation steps.
|
|
207
|
+
It handles both simple transactions (items only) and timestamped transactions.
|
|
147
208
|
|
|
148
209
|
Parameters:
|
|
149
|
-
raw_transactions (List[List]):
|
|
210
|
+
raw_transactions (Union[List[List[str]], List[List[Tuple[str, float]]]]):
|
|
211
|
+
Input transactional data (with or without timestamps).
|
|
150
212
|
|
|
151
213
|
Attributes Set:
|
|
152
214
|
- `transactions`: The preprocessed transactions converted to tuples.
|
|
153
215
|
- `unique_candidates`: A list of unique singleton candidates derived from the dataset.
|
|
154
216
|
- `max_size`: The length of the largest transaction in the data.
|
|
217
|
+
- `has_timestamps`: Boolean indicating if transactions include timestamps.
|
|
155
218
|
|
|
156
219
|
Raises:
|
|
157
220
|
ValueError: If the dataset is empty, improperly formatted, or contains fewer than 2 transactions.
|
|
@@ -171,28 +234,71 @@ class GSP:
|
|
|
171
234
|
raise ValueError(msg)
|
|
172
235
|
|
|
173
236
|
logger.info("Pre-processing transactions...")
|
|
237
|
+
|
|
238
|
+
# Detect if transactions have timestamps by checking non-empty transactions
|
|
239
|
+
self.has_timestamps = False
|
|
240
|
+
for tx in raw_transactions:
|
|
241
|
+
if tx: # Check non-empty transactions
|
|
242
|
+
tx_sequence = cast(List[Union[str, Tuple[str, float]]], tx)
|
|
243
|
+
self.has_timestamps = has_timestamps(tx_sequence)
|
|
244
|
+
if self.has_timestamps:
|
|
245
|
+
logger.debug("Detected timestamped transactions")
|
|
246
|
+
break
|
|
247
|
+
|
|
248
|
+
# Validate temporal constraints are only used with timestamps
|
|
249
|
+
if (self.mingap is not None or self.maxgap is not None or self.maxspan is not None) and not self.has_timestamps:
|
|
250
|
+
logger.warning(
|
|
251
|
+
"Temporal constraints specified but transactions do not have timestamps. "
|
|
252
|
+
"Constraints will be ignored."
|
|
253
|
+
)
|
|
254
|
+
# Clear temporal constraints since they cannot be applied
|
|
255
|
+
self.mingap = None
|
|
256
|
+
self.maxgap = None
|
|
257
|
+
self.maxspan = None
|
|
258
|
+
|
|
174
259
|
self.max_size: int = max(len(item) for item in raw_transactions)
|
|
175
|
-
|
|
176
|
-
|
|
260
|
+
|
|
261
|
+
if self.has_timestamps:
|
|
262
|
+
# For timestamped transactions, convert to tuples and extract items for counting
|
|
263
|
+
timestamped_txs = cast(List[List[Tuple[str, float]]], raw_transactions)
|
|
264
|
+
self.transactions = [tuple(transaction) for transaction in timestamped_txs]
|
|
265
|
+
# Extract just the items for counting unique candidates
|
|
266
|
+
all_items = chain.from_iterable([[item for item, _ in tx] for tx in timestamped_txs])
|
|
267
|
+
counts: Counter[str] = Counter(all_items)
|
|
268
|
+
else:
|
|
269
|
+
# For non-timestamped transactions, process as before
|
|
270
|
+
simple_txs = cast(List[List[str]], raw_transactions)
|
|
271
|
+
self.transactions = [tuple(transaction) for transaction in simple_txs]
|
|
272
|
+
counts: Counter[str] = Counter(chain.from_iterable(simple_txs))
|
|
273
|
+
|
|
177
274
|
# Start with singleton candidates (1-sequences)
|
|
178
275
|
self.unique_candidates: List[Tuple[str, ...]] = [(item,) for item in counts.keys()]
|
|
179
276
|
logger.debug("Unique candidates: %s", self.unique_candidates)
|
|
180
277
|
|
|
181
278
|
@staticmethod
|
|
182
279
|
def _worker_batch(
|
|
183
|
-
batch: List[Tuple[str, ...]],
|
|
280
|
+
batch: List[Tuple[str, ...]],
|
|
281
|
+
transactions: List[Union[Tuple[str, ...], Tuple[Tuple[str, float], ...]]],
|
|
282
|
+
min_support: int,
|
|
283
|
+
mingap: Optional[float] = None,
|
|
284
|
+
maxgap: Optional[float] = None,
|
|
285
|
+
maxspan: Optional[float] = None,
|
|
184
286
|
) -> List[Tuple[Tuple[str, ...], int]]:
|
|
185
287
|
"""
|
|
186
288
|
Evaluate a batch of candidate sequences to compute their support.
|
|
187
289
|
|
|
188
290
|
This method iterates over the candidates in the given batch and checks their frequency
|
|
189
291
|
of appearance across all transactions. Candidates meeting the user-defined minimum
|
|
190
|
-
support threshold are returned.
|
|
292
|
+
support threshold are returned. Supports temporal constraints when timestamps are present.
|
|
191
293
|
|
|
192
294
|
Parameters:
|
|
193
295
|
batch (List[Tuple]): A batch of candidate sequences, where each sequence is represented as a tuple.
|
|
194
|
-
transactions (List[Tuple]
|
|
296
|
+
transactions (List[Union[Tuple[str, ...], Tuple[Tuple[str, float], ...]]]):
|
|
297
|
+
Preprocessed transactions as tuples (with or without timestamps).
|
|
195
298
|
min_support (int): Absolute minimum support count required for a candidate to be considered frequent.
|
|
299
|
+
mingap (Optional[float]): Minimum time gap between consecutive items.
|
|
300
|
+
maxgap (Optional[float]): Maximum time gap between consecutive items.
|
|
301
|
+
maxspan (Optional[float]): Maximum time span from first to last item.
|
|
196
302
|
|
|
197
303
|
Returns:
|
|
198
304
|
List[Tuple[Tuple, int]]: A list of tuples where each tuple contains:
|
|
@@ -200,8 +306,27 @@ class GSP:
|
|
|
200
306
|
- The candidate's support count.
|
|
201
307
|
"""
|
|
202
308
|
results: List[Tuple[Tuple[str, ...], int]] = []
|
|
309
|
+
has_temporal = mingap is not None or maxgap is not None or maxspan is not None
|
|
310
|
+
|
|
311
|
+
# Detect if transactions have timestamps using the helper function,
|
|
312
|
+
# based on the first non-empty transaction in the batch.
|
|
313
|
+
first_non_empty_tx = next((t for t in transactions if t), None)
|
|
314
|
+
has_timestamps_flag = bool(first_non_empty_tx and has_timestamps(first_non_empty_tx))
|
|
315
|
+
|
|
203
316
|
for item in batch:
|
|
204
|
-
|
|
317
|
+
if has_timestamps_flag or has_temporal:
|
|
318
|
+
# Use temporal-aware checking for timestamped transactions
|
|
319
|
+
frequency = sum(
|
|
320
|
+
1
|
|
321
|
+
for t in transactions
|
|
322
|
+
if is_subsequence_in_list_with_time_constraints(
|
|
323
|
+
item, t, mingap=mingap, maxgap=maxgap, maxspan=maxspan
|
|
324
|
+
)
|
|
325
|
+
)
|
|
326
|
+
else:
|
|
327
|
+
# Use standard non-temporal checking for simple transactions
|
|
328
|
+
frequency = sum(1 for t in transactions if is_subsequence_in_list(item, t))
|
|
329
|
+
|
|
205
330
|
if frequency >= min_support:
|
|
206
331
|
results.append((item, frequency))
|
|
207
332
|
return results
|
|
@@ -228,7 +353,7 @@ class GSP:
|
|
|
228
353
|
with mp.Pool(processes=mp.cpu_count()) as pool:
|
|
229
354
|
batch_results = pool.starmap(
|
|
230
355
|
self._worker_batch, # Process a batch at a time
|
|
231
|
-
[(batch, self.transactions, min_support) for batch in batches],
|
|
356
|
+
[(batch, self.transactions, min_support, self.mingap, self.maxgap, self.maxspan) for batch in batches],
|
|
232
357
|
)
|
|
233
358
|
|
|
234
359
|
# Flatten the list of results and convert to a dictionary
|
|
@@ -245,9 +370,21 @@ class GSP:
|
|
|
245
370
|
Calculate support counts for candidate sequences using the fastest available backend.
|
|
246
371
|
This will try the Rust extension if available (and configured), otherwise fall back to
|
|
247
372
|
the Python multiprocessing implementation.
|
|
373
|
+
|
|
374
|
+
Note: When temporal constraints are active or transactions have timestamps,
|
|
375
|
+
the Python implementation is always used as the accelerated backends do not yet
|
|
376
|
+
support temporal constraints or timestamped transactions.
|
|
248
377
|
"""
|
|
378
|
+
# Use Python implementation when temporal constraints are active or timestamps present
|
|
379
|
+
has_temporal = self.mingap is not None or self.maxgap is not None or self.maxspan is not None
|
|
380
|
+
if has_temporal or self.has_timestamps:
|
|
381
|
+
return self._support_python(items, min_support, batch_size)
|
|
382
|
+
|
|
383
|
+
# For non-timestamped transactions, we can use accelerated support counting
|
|
384
|
+
# Cast is safe here because we've confirmed no timestamps above
|
|
385
|
+
non_timestamped_transactions = cast(List[Tuple[str, ...]], self.transactions)
|
|
249
386
|
try:
|
|
250
|
-
return support_counts_accel(
|
|
387
|
+
return support_counts_accel(non_timestamped_transactions, items, min_support, batch_size, backend=backend)
|
|
251
388
|
except Exception:
|
|
252
389
|
# Fallback to Python implementation on any acceleration failure
|
|
253
390
|
return self._support_python(items, min_support, batch_size)
|
|
@@ -270,6 +407,7 @@ class GSP:
|
|
|
270
407
|
min_support: float = 0.2,
|
|
271
408
|
max_k: Optional[int] = None,
|
|
272
409
|
backend: Optional[str] = None,
|
|
410
|
+
verbose: Optional[bool] = None,
|
|
273
411
|
) -> List[Dict[Tuple[str, ...], int]]:
|
|
274
412
|
"""
|
|
275
413
|
Execute the Generalized Sequential Pattern (GSP) mining algorithm.
|
|
@@ -278,10 +416,19 @@ class GSP:
|
|
|
278
416
|
in the input transaction dataset. Patterns are extracted iteratively at each k-sequence level,
|
|
279
417
|
starting from singleton sequences, until no further frequent patterns can be found.
|
|
280
418
|
|
|
419
|
+
When temporal constraints (mingap, maxgap, maxspan) are specified during initialization,
|
|
420
|
+
the algorithm enforces these constraints during pattern matching, allowing for time-aware
|
|
421
|
+
sequential pattern mining.
|
|
422
|
+
|
|
281
423
|
Parameters:
|
|
282
424
|
min_support (float): Minimum support threshold as a fraction of total transactions.
|
|
283
425
|
For example, `0.3` means that a sequence is frequent if it
|
|
284
426
|
appears in at least 30% of all transactions.
|
|
427
|
+
max_k (Optional[int]): Maximum length of patterns to mine. If None, mines up to max transaction length.
|
|
428
|
+
backend (Optional[str]): Backend to use for support counting ('auto', 'python', 'rust', 'gpu').
|
|
429
|
+
Note: temporal constraints always use Python backend.
|
|
430
|
+
verbose (Optional[bool]): Override instance verbosity setting for this search.
|
|
431
|
+
If None, uses the instance's verbose setting.
|
|
285
432
|
|
|
286
433
|
Returns:
|
|
287
434
|
List[Dict[Tuple[str, ...], int]]: A list of dictionaries containing frequent patterns
|
|
@@ -296,8 +443,8 @@ class GSP:
|
|
|
296
443
|
and completion.
|
|
297
444
|
- Status updates for each iteration until the algorithm terminates.
|
|
298
445
|
|
|
299
|
-
|
|
300
|
-
Basic usage
|
|
446
|
+
Examples:
|
|
447
|
+
Basic usage without temporal constraints:
|
|
301
448
|
|
|
302
449
|
```python
|
|
303
450
|
from gsppy.gsp import GSP
|
|
@@ -311,11 +458,41 @@ class GSP:
|
|
|
311
458
|
gsp = GSP(transactions)
|
|
312
459
|
patterns = gsp.search(min_support=0.3)
|
|
313
460
|
```
|
|
461
|
+
|
|
462
|
+
Usage with temporal constraints (requires timestamped transactions):
|
|
463
|
+
|
|
464
|
+
```python
|
|
465
|
+
from gsppy.gsp import GSP
|
|
466
|
+
|
|
467
|
+
# Transactions with timestamps (item, timestamp) pairs
|
|
468
|
+
# where timestamps can be in any unit (seconds, minutes, hours, days, etc.)
|
|
469
|
+
timestamped_transactions = [
|
|
470
|
+
[("A", 1), ("B", 3), ("C", 5)], # timestamps: 1, 3, 5
|
|
471
|
+
[("A", 2), ("B", 10), ("C", 12)], # timestamps: 2, 10, 12
|
|
472
|
+
[("A", 1), ("C", 4)], # timestamps: 1, 4
|
|
473
|
+
]
|
|
474
|
+
|
|
475
|
+
# Find patterns with maxgap of 5 time units between consecutive items
|
|
476
|
+
gsp = GSP(timestamped_transactions, maxgap=5)
|
|
477
|
+
patterns = gsp.search(min_support=0.5)
|
|
478
|
+
# Pattern ("A", "B", "C") won't be found in transaction 2
|
|
479
|
+
# because gap between A and B is 8 (exceeds maxgap=5)
|
|
480
|
+
```
|
|
314
481
|
"""
|
|
482
|
+
# Override verbosity if specified for this search
|
|
483
|
+
original_verbose = self.verbose
|
|
484
|
+
if verbose is not None:
|
|
485
|
+
self.verbose = verbose
|
|
486
|
+
self._configure_logging()
|
|
487
|
+
|
|
315
488
|
if not 0.0 < min_support <= 1.0:
|
|
316
489
|
raise ValueError("Minimum support must be in the range (0.0, 1.0]")
|
|
317
490
|
|
|
318
491
|
logger.info(f"Starting GSP algorithm with min_support={min_support}...")
|
|
492
|
+
if self.mingap is not None or self.maxgap is not None or self.maxspan is not None:
|
|
493
|
+
logger.info(
|
|
494
|
+
f"Using temporal constraints: mingap={self.mingap}, maxgap={self.maxgap}, maxspan={self.maxspan}"
|
|
495
|
+
)
|
|
319
496
|
|
|
320
497
|
# Convert fractional support to absolute count (ceil to preserve threshold semantics)
|
|
321
498
|
abs_min_support = int(math.ceil(len(self.transactions) * float(min_support)))
|
|
@@ -352,4 +529,10 @@ class GSP:
|
|
|
352
529
|
|
|
353
530
|
self._print_status(k_items, candidates)
|
|
354
531
|
logger.info("GSP algorithm completed.")
|
|
532
|
+
|
|
533
|
+
# Restore original verbosity if it was overridden
|
|
534
|
+
if verbose is not None:
|
|
535
|
+
self.verbose = original_verbose
|
|
536
|
+
self._configure_logging()
|
|
537
|
+
|
|
355
538
|
return self.freq_patterns[:-1]
|
gsppy/utils.py
CHANGED
|
@@ -21,11 +21,48 @@ These utilities are designed to support sequence processing tasks and can be
|
|
|
21
21
|
adapted to various domains, such as data mining, recommendation systems, and sequence analysis.
|
|
22
22
|
"""
|
|
23
23
|
|
|
24
|
-
from typing import Dict, List, Tuple, Sequence, Generator
|
|
24
|
+
from typing import Dict, List, Tuple, Union, Optional, Sequence, Generator, cast
|
|
25
25
|
from functools import lru_cache
|
|
26
26
|
from itertools import product
|
|
27
27
|
|
|
28
28
|
|
|
29
|
+
def has_timestamps(
|
|
30
|
+
sequence: Union[
|
|
31
|
+
Tuple[Union[str, Tuple[str, Union[int, float]]], ...], List[Union[str, Tuple[str, Union[int, float]]]]
|
|
32
|
+
],
|
|
33
|
+
) -> bool:
|
|
34
|
+
"""
|
|
35
|
+
Check if a sequence contains timestamped data (item-timestamp pairs).
|
|
36
|
+
|
|
37
|
+
Parameters:
|
|
38
|
+
sequence: A sequence that may contain timestamped data
|
|
39
|
+
|
|
40
|
+
Returns:
|
|
41
|
+
bool: True if the sequence contains timestamped data, False otherwise
|
|
42
|
+
|
|
43
|
+
Examples:
|
|
44
|
+
>>> has_timestamps((("A", 1), ("B", 2)))
|
|
45
|
+
True
|
|
46
|
+
>>> has_timestamps(("A", "B", "C"))
|
|
47
|
+
False
|
|
48
|
+
"""
|
|
49
|
+
if not sequence or len(sequence) == 0:
|
|
50
|
+
return False
|
|
51
|
+
|
|
52
|
+
first_item = sequence[0]
|
|
53
|
+
|
|
54
|
+
# Check if first item is a tuple or list with 2 elements where second is numeric
|
|
55
|
+
if isinstance(first_item, (tuple, list)) and len(first_item) == 2:
|
|
56
|
+
try:
|
|
57
|
+
# Try to interpret second element as a number
|
|
58
|
+
float(first_item[1])
|
|
59
|
+
return True
|
|
60
|
+
except (TypeError, ValueError):
|
|
61
|
+
return False
|
|
62
|
+
|
|
63
|
+
return False
|
|
64
|
+
|
|
65
|
+
|
|
29
66
|
def split_into_batches(
|
|
30
67
|
items: Sequence[Tuple[str, ...]], batch_size: int
|
|
31
68
|
) -> Generator[Sequence[Tuple[str, ...]], None, None]:
|
|
@@ -59,11 +96,11 @@ def is_subsequence_in_list(subsequence: Tuple[str, ...], sequence: Tuple[str, ..
|
|
|
59
96
|
bool: True if the subsequence is found, False otherwise.
|
|
60
97
|
|
|
61
98
|
Examples:
|
|
62
|
-
>>> is_subsequence_in_list((
|
|
99
|
+
>>> is_subsequence_in_list(("a", "c"), ("a", "b", "c"))
|
|
63
100
|
True
|
|
64
|
-
>>> is_subsequence_in_list((
|
|
101
|
+
>>> is_subsequence_in_list(("a", "c"), ("c", "a"))
|
|
65
102
|
False
|
|
66
|
-
>>> is_subsequence_in_list((
|
|
103
|
+
>>> is_subsequence_in_list(("a", "b"), ("a", "b", "c"))
|
|
67
104
|
True
|
|
68
105
|
"""
|
|
69
106
|
# Handle the case where the subsequence is empty - it should not exist in any sequence
|
|
@@ -86,6 +123,249 @@ def is_subsequence_in_list(subsequence: Tuple[str, ...], sequence: Tuple[str, ..
|
|
|
86
123
|
return False
|
|
87
124
|
|
|
88
125
|
|
|
126
|
+
def is_subsequence_in_list_with_time_constraints(
|
|
127
|
+
subsequence: Tuple[str, ...],
|
|
128
|
+
sequence: Union[Tuple[str, ...], Tuple[Tuple[str, float], ...]],
|
|
129
|
+
mingap: Optional[float] = None,
|
|
130
|
+
maxgap: Optional[float] = None,
|
|
131
|
+
maxspan: Optional[float] = None,
|
|
132
|
+
) -> bool:
|
|
133
|
+
"""
|
|
134
|
+
Check if a subsequence exists within a sequence with optional temporal constraints.
|
|
135
|
+
|
|
136
|
+
This function extends the standard subsequence check to support temporal constraints
|
|
137
|
+
for time-constrained sequential pattern mining. It handles both simple sequences
|
|
138
|
+
(items only) and timestamped sequences (item-timestamp pairs).
|
|
139
|
+
|
|
140
|
+
Temporal Constraints:
|
|
141
|
+
- mingap: Minimum time gap required between consecutive items in the pattern.
|
|
142
|
+
- maxgap: Maximum time gap allowed between consecutive items in the pattern.
|
|
143
|
+
- maxspan: Maximum time span from the first to last item in the pattern.
|
|
144
|
+
|
|
145
|
+
Parameters:
|
|
146
|
+
subsequence (Tuple[str, ...]): The pattern to search for (items only, no timestamps).
|
|
147
|
+
sequence (Union[Tuple[str, ...], Tuple[Tuple[str, float], ...]]):
|
|
148
|
+
The sequence to search within. Can be:
|
|
149
|
+
- Simple: Tuple of items (e.g., ('A', 'B', 'C'))
|
|
150
|
+
- Timestamped: Tuple of (item, timestamp) pairs (e.g., (('A', 1.0), ('B', 3.0)))
|
|
151
|
+
mingap (Optional[float]): Minimum time between consecutive pattern elements.
|
|
152
|
+
maxgap (Optional[float]): Maximum time between consecutive pattern elements.
|
|
153
|
+
maxspan (Optional[float]): Maximum time from first to last pattern element.
|
|
154
|
+
|
|
155
|
+
Returns:
|
|
156
|
+
bool: True if the subsequence is found respecting temporal constraints, False otherwise.
|
|
157
|
+
|
|
158
|
+
Examples:
|
|
159
|
+
>>> # Without timestamps (backward compatible)
|
|
160
|
+
>>> is_subsequence_in_list_with_time_constraints(("A", "C"), ("A", "B", "C"))
|
|
161
|
+
True
|
|
162
|
+
|
|
163
|
+
>>> # With timestamps and maxgap constraint
|
|
164
|
+
>>> seq = (("A", 1), ("B", 3), ("C", 10))
|
|
165
|
+
>>> is_subsequence_in_list_with_time_constraints(("A", "C"), seq, maxgap=5)
|
|
166
|
+
False # Gap between A and C is 9, exceeds maxgap=5
|
|
167
|
+
|
|
168
|
+
>>> # With timestamps and mingap constraint
|
|
169
|
+
>>> seq = (("A", 1), ("B", 2), ("C", 3))
|
|
170
|
+
>>> is_subsequence_in_list_with_time_constraints(("A", "C"), seq, mingap=3)
|
|
171
|
+
False # Gap between A and C is 2, less than mingap=3
|
|
172
|
+
|
|
173
|
+
>>> # With timestamps and maxspan constraint
|
|
174
|
+
>>> seq = (("A", 1), ("B", 5), ("C", 12))
|
|
175
|
+
>>> is_subsequence_in_list_with_time_constraints(("A", "C"), seq, maxspan=10)
|
|
176
|
+
False # Span from A to C is 11, exceeds maxspan=10
|
|
177
|
+
"""
|
|
178
|
+
# Handle empty subsequence
|
|
179
|
+
if not subsequence:
|
|
180
|
+
return False
|
|
181
|
+
|
|
182
|
+
# Return False if the subsequence is longer than the sequence
|
|
183
|
+
if len(subsequence) > len(sequence):
|
|
184
|
+
return False
|
|
185
|
+
|
|
186
|
+
# Determine if sequence has timestamps
|
|
187
|
+
has_timestamps_flag = has_timestamps(sequence)
|
|
188
|
+
|
|
189
|
+
# If no temporal constraints and no timestamps, use the optimized cached version
|
|
190
|
+
if not has_timestamps_flag and mingap is None and maxgap is None and maxspan is None:
|
|
191
|
+
return is_subsequence_in_list(subsequence, sequence)
|
|
192
|
+
|
|
193
|
+
# Extract items and timestamps from sequence
|
|
194
|
+
seq_items, seq_times = _extract_items_and_timestamps(sequence, has_timestamps_flag)
|
|
195
|
+
|
|
196
|
+
# Try to find a match starting from each position
|
|
197
|
+
return _find_temporal_match(subsequence, seq_items, seq_times, mingap, maxgap, maxspan)
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def _extract_items_and_timestamps(
|
|
201
|
+
sequence: Union[Tuple[str, ...], Tuple[Tuple[str, float], ...]],
|
|
202
|
+
has_timestamps_flag: bool,
|
|
203
|
+
) -> Tuple[Tuple[str, ...], Optional[Tuple[float, ...]]]:
|
|
204
|
+
"""
|
|
205
|
+
Extract items and timestamps from a sequence.
|
|
206
|
+
|
|
207
|
+
Args:
|
|
208
|
+
sequence: The sequence to extract from
|
|
209
|
+
has_timestamps_flag: Whether the sequence has timestamps
|
|
210
|
+
|
|
211
|
+
Returns:
|
|
212
|
+
Tuple of (items, timestamps) where timestamps is None if not present
|
|
213
|
+
"""
|
|
214
|
+
if has_timestamps_flag:
|
|
215
|
+
# For timestamped sequences, extract items and timestamps separately
|
|
216
|
+
timestamped_seq = cast(Tuple[Tuple[str, float], ...], sequence)
|
|
217
|
+
seq_items = tuple(item for item, _ in timestamped_seq)
|
|
218
|
+
seq_times = tuple(time for _, time in timestamped_seq)
|
|
219
|
+
return seq_items, seq_times
|
|
220
|
+
else:
|
|
221
|
+
# For non-timestamped sequences, return items directly with None for timestamps
|
|
222
|
+
simple_seq = cast(Tuple[str, ...], sequence)
|
|
223
|
+
return simple_seq, None
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
def _find_temporal_match(
|
|
227
|
+
subsequence: Tuple[str, ...],
|
|
228
|
+
seq_items: Tuple[str, ...],
|
|
229
|
+
seq_times: Optional[Tuple[float, ...]],
|
|
230
|
+
mingap: Optional[float],
|
|
231
|
+
maxgap: Optional[float],
|
|
232
|
+
maxspan: Optional[float],
|
|
233
|
+
) -> bool:
|
|
234
|
+
"""
|
|
235
|
+
Find if subsequence matches with temporal constraints.
|
|
236
|
+
|
|
237
|
+
Args:
|
|
238
|
+
subsequence: Pattern to search for
|
|
239
|
+
seq_items: Items in the sequence
|
|
240
|
+
seq_times: Timestamps (None if not present)
|
|
241
|
+
mingap: Minimum gap constraint
|
|
242
|
+
maxgap: Maximum gap constraint
|
|
243
|
+
maxspan: Maximum span constraint
|
|
244
|
+
|
|
245
|
+
Returns:
|
|
246
|
+
True if match found, False otherwise
|
|
247
|
+
"""
|
|
248
|
+
len_sub = len(subsequence)
|
|
249
|
+
len_seq = len(seq_items)
|
|
250
|
+
|
|
251
|
+
# Try starting from each position
|
|
252
|
+
for start_idx in range(len_seq - len_sub + 1):
|
|
253
|
+
if _try_match_from_position(start_idx, subsequence, seq_items, seq_times, mingap, maxgap, maxspan):
|
|
254
|
+
return True
|
|
255
|
+
|
|
256
|
+
return False
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
def _try_match_from_position(
|
|
260
|
+
start_idx: int,
|
|
261
|
+
subsequence: Tuple[str, ...],
|
|
262
|
+
seq_items: Tuple[str, ...],
|
|
263
|
+
seq_times: Optional[Tuple[float, ...]],
|
|
264
|
+
mingap: Optional[float],
|
|
265
|
+
maxgap: Optional[float],
|
|
266
|
+
maxspan: Optional[float],
|
|
267
|
+
) -> bool:
|
|
268
|
+
"""
|
|
269
|
+
Try to match subsequence starting from a given position.
|
|
270
|
+
|
|
271
|
+
Args:
|
|
272
|
+
start_idx: Starting position in sequence
|
|
273
|
+
subsequence: Pattern to match
|
|
274
|
+
seq_items: Items in sequence
|
|
275
|
+
seq_times: Timestamps (None if not present)
|
|
276
|
+
mingap: Minimum gap constraint
|
|
277
|
+
maxgap: Maximum gap constraint
|
|
278
|
+
maxspan: Maximum span constraint
|
|
279
|
+
|
|
280
|
+
Returns:
|
|
281
|
+
True if match found, False otherwise
|
|
282
|
+
"""
|
|
283
|
+
sub_idx = 0
|
|
284
|
+
matched_indices: List[int] = []
|
|
285
|
+
len_sub = len(subsequence)
|
|
286
|
+
len_seq = len(seq_items)
|
|
287
|
+
|
|
288
|
+
for seq_idx in range(start_idx, len_seq):
|
|
289
|
+
if seq_items[seq_idx] == subsequence[sub_idx]:
|
|
290
|
+
# Check temporal constraints if we have timestamps and have previous matches
|
|
291
|
+
if (
|
|
292
|
+
seq_times is not None
|
|
293
|
+
and matched_indices
|
|
294
|
+
and not _check_temporal_constraints(seq_idx, matched_indices, seq_times, mingap, maxgap)
|
|
295
|
+
):
|
|
296
|
+
# Skip this occurrence and continue searching for a valid one
|
|
297
|
+
continue
|
|
298
|
+
|
|
299
|
+
matched_indices.append(seq_idx)
|
|
300
|
+
sub_idx += 1
|
|
301
|
+
|
|
302
|
+
# If we've matched the entire subsequence, check maxspan
|
|
303
|
+
if sub_idx == len_sub:
|
|
304
|
+
return _check_maxspan(matched_indices, seq_times, maxspan)
|
|
305
|
+
|
|
306
|
+
return False
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
def _check_temporal_constraints(
|
|
310
|
+
seq_idx: int,
|
|
311
|
+
matched_indices: List[int],
|
|
312
|
+
seq_times: Tuple[float, ...],
|
|
313
|
+
mingap: Optional[float],
|
|
314
|
+
maxgap: Optional[float],
|
|
315
|
+
) -> bool:
|
|
316
|
+
"""
|
|
317
|
+
Check if temporal constraints are satisfied for a new match.
|
|
318
|
+
|
|
319
|
+
Args:
|
|
320
|
+
seq_idx: Current sequence index
|
|
321
|
+
matched_indices: Previously matched indices
|
|
322
|
+
seq_times: Timestamps
|
|
323
|
+
mingap: Minimum gap constraint
|
|
324
|
+
maxgap: Maximum gap constraint
|
|
325
|
+
|
|
326
|
+
Returns:
|
|
327
|
+
True if constraints satisfied, False otherwise
|
|
328
|
+
"""
|
|
329
|
+
prev_idx = matched_indices[-1]
|
|
330
|
+
time_gap = seq_times[seq_idx] - seq_times[prev_idx]
|
|
331
|
+
|
|
332
|
+
# Check mingap constraint
|
|
333
|
+
if mingap is not None and time_gap < mingap:
|
|
334
|
+
return False
|
|
335
|
+
|
|
336
|
+
# Check maxgap constraint
|
|
337
|
+
if maxgap is not None and time_gap > maxgap:
|
|
338
|
+
return False
|
|
339
|
+
|
|
340
|
+
return True
|
|
341
|
+
|
|
342
|
+
|
|
343
|
+
def _check_maxspan(
|
|
344
|
+
matched_indices: List[int],
|
|
345
|
+
seq_times: Optional[Tuple[float, ...]],
|
|
346
|
+
maxspan: Optional[float],
|
|
347
|
+
) -> bool:
|
|
348
|
+
"""
|
|
349
|
+
Check if maxspan constraint is satisfied.
|
|
350
|
+
|
|
351
|
+
Args:
|
|
352
|
+
matched_indices: Matched sequence indices
|
|
353
|
+
seq_times: Timestamps (None if not present)
|
|
354
|
+
maxspan: Maximum span constraint
|
|
355
|
+
|
|
356
|
+
Returns:
|
|
357
|
+
True if constraint satisfied or not applicable, False otherwise
|
|
358
|
+
"""
|
|
359
|
+
if seq_times is not None and maxspan is not None:
|
|
360
|
+
first_idx = matched_indices[0]
|
|
361
|
+
last_idx = matched_indices[-1]
|
|
362
|
+
span = seq_times[last_idx] - seq_times[first_idx]
|
|
363
|
+
if span > maxspan:
|
|
364
|
+
return False
|
|
365
|
+
|
|
366
|
+
return True
|
|
367
|
+
|
|
368
|
+
|
|
89
369
|
def generate_candidates_from_previous(prev_patterns: Dict[Tuple[str, ...], int]) -> List[Tuple[str, ...]]:
|
|
90
370
|
"""
|
|
91
371
|
Generate joined candidates from the previous level's frequent patterns.
|
|
@@ -96,7 +376,7 @@ def generate_candidates_from_previous(prev_patterns: Dict[Tuple[str, ...], int])
|
|
|
96
376
|
Returns:
|
|
97
377
|
List[Tuple]: Candidate patterns for the next level.
|
|
98
378
|
"""
|
|
99
|
-
keys = list(prev_patterns.keys())
|
|
379
|
+
keys: List[Tuple[str, ...]] = list(prev_patterns.keys())
|
|
100
380
|
return [
|
|
101
381
|
pattern1 + (pattern2[-1],)
|
|
102
382
|
for pattern1, pattern2 in product(keys, repeat=2)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: gsppy
|
|
3
|
-
Version: 3.
|
|
3
|
+
Version: 3.5.0
|
|
4
4
|
Summary: GSP (Generalized Sequence Pattern) algorithm in Python
|
|
5
5
|
Project-URL: Homepage, https://github.com/jacksonpradolima/gsp-py
|
|
6
6
|
Author-email: Jackson Antonio do Prado Lima <jacksonpradolima@gmail.com>
|
|
@@ -41,27 +41,28 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
|
41
41
|
Requires-Python: >=3.10
|
|
42
42
|
Requires-Dist: click>=8.0.0
|
|
43
43
|
Provides-Extra: dev
|
|
44
|
-
Requires-Dist: cython==3.2.
|
|
45
|
-
Requires-Dist: hatch==1.16.
|
|
44
|
+
Requires-Dist: cython==3.2.4; extra == 'dev'
|
|
45
|
+
Requires-Dist: hatch==1.16.3; extra == 'dev'
|
|
46
46
|
Requires-Dist: hatchling==1.28.0; extra == 'dev'
|
|
47
|
+
Requires-Dist: hypothesis<7.0.0,>=6.0.0; extra == 'dev'
|
|
47
48
|
Requires-Dist: pylint==4.0.4; extra == 'dev'
|
|
48
|
-
Requires-Dist: pyright==1.1.
|
|
49
|
+
Requires-Dist: pyright==1.1.408; extra == 'dev'
|
|
49
50
|
Requires-Dist: pytest-benchmark==5.2.3; extra == 'dev'
|
|
50
51
|
Requires-Dist: pytest-cov==7.0.0; extra == 'dev'
|
|
51
52
|
Requires-Dist: pytest==9.0.2; extra == 'dev'
|
|
52
|
-
Requires-Dist: ruff==0.14.
|
|
53
|
-
Requires-Dist: tox==4.
|
|
54
|
-
Requires-Dist: ty==0.0.
|
|
53
|
+
Requires-Dist: ruff==0.14.13; extra == 'dev'
|
|
54
|
+
Requires-Dist: tox==4.34.1; extra == 'dev'
|
|
55
|
+
Requires-Dist: ty==0.0.12; extra == 'dev'
|
|
55
56
|
Provides-Extra: docs
|
|
56
57
|
Requires-Dist: mkdocs-gen-files<1,>=0.5; extra == 'docs'
|
|
57
58
|
Requires-Dist: mkdocs-literate-nav<1,>=0.6; extra == 'docs'
|
|
58
59
|
Requires-Dist: mkdocs-material<10,>=9.5; extra == 'docs'
|
|
59
60
|
Requires-Dist: mkdocs<2,>=1.6; extra == 'docs'
|
|
60
|
-
Requires-Dist: mkdocstrings[python]<
|
|
61
|
+
Requires-Dist: mkdocstrings[python]<1.1,>=0.26; extra == 'docs'
|
|
61
62
|
Provides-Extra: gpu
|
|
62
63
|
Requires-Dist: cupy<14,>=11; extra == 'gpu'
|
|
63
64
|
Provides-Extra: rust
|
|
64
|
-
Requires-Dist: maturin==1.
|
|
65
|
+
Requires-Dist: maturin==1.11.5; extra == 'rust'
|
|
65
66
|
Description-Content-Type: text/markdown
|
|
66
67
|
|
|
67
68
|
[](https://jacksonpradolima.github.io/gsp-py/)
|
|
@@ -104,6 +105,7 @@ Sequence Pattern (GSP)** algorithm. Ideal for market basket analysis, temporal m
|
|
|
104
105
|
6. [💡 Usage](#usage)
|
|
105
106
|
- [✅ Example: Analyzing Sales Data](#example-analyzing-sales-data)
|
|
106
107
|
- [📊 Explanation: Support and Results](#explanation-support-and-results)
|
|
108
|
+
- [⏱️ Temporal Constraints](#temporal-constraints)
|
|
107
109
|
7. [⌨️ Typing](#typing)
|
|
108
110
|
8. [🌟 Planned Features](#planned-features)
|
|
109
111
|
9. [🤝 Contributing](#contributing)
|
|
@@ -122,6 +124,7 @@ principles**. Using support thresholds, GSP identifies frequent sequences of ite
|
|
|
122
124
|
- **Ordered (non-contiguous) matching**: Detects patterns where items appear in order but not necessarily adjacent, following standard GSP semantics. For example, the pattern `('A', 'C')` is found in the sequence `['A', 'B', 'C']`.
|
|
123
125
|
- **Support-based pruning**: Only retains sequences that meet the minimum support threshold.
|
|
124
126
|
- **Candidate generation**: Iteratively generates candidate sequences of increasing length.
|
|
127
|
+
- **Temporal constraints**: Support for time-constrained pattern mining with `mingap`, `maxgap`, and `maxspan` parameters to find patterns within specific time windows.
|
|
125
128
|
- **General-purpose**: Useful in retail, web analytics, social networks, temporal sequence mining, and more.
|
|
126
129
|
|
|
127
130
|
For example:
|
|
@@ -372,7 +375,28 @@ gsppy --file path/to/transactions.csv --min_support 0.3 --backend rust
|
|
|
372
375
|
- `--file`: Path to your input file (JSON or CSV). **Required**.
|
|
373
376
|
- `--min_support`: Minimum support threshold as a fraction (e.g., `0.3` for 30%). Default is `0.2`.
|
|
374
377
|
- `--backend`: Backend to use for support counting. One of `auto` (default), `python`, `rust`, or `gpu`.
|
|
375
|
-
- `--verbose`:
|
|
378
|
+
- `--verbose`: Enable detailed logging with timestamps, log levels, and process IDs for debugging and traceability.
|
|
379
|
+
- `--mingap`, `--maxgap`, `--maxspan`: Temporal constraints for time-aware pattern mining (requires timestamped transactions).
|
|
380
|
+
|
|
381
|
+
#### Verbose Mode
|
|
382
|
+
|
|
383
|
+
For debugging or to track execution in CI/CD pipelines, use the `--verbose` flag:
|
|
384
|
+
|
|
385
|
+
```bash
|
|
386
|
+
gsppy --file transactions.json --min_support 0.3 --verbose
|
|
387
|
+
```
|
|
388
|
+
|
|
389
|
+
This produces structured logging output with timestamps, log levels, and process information:
|
|
390
|
+
|
|
391
|
+
```
|
|
392
|
+
YYYY-MM-DDTHH:MM:SS | INFO | PID:4179 | gsppy.gsp | Pre-processing transactions...
|
|
393
|
+
YYYY-MM-DDTHH:MM:SS | DEBUG | PID:4179 | gsppy.gsp | Unique candidates: [('Bread',), ('Milk',), ...]
|
|
394
|
+
YYYY-MM-DDTHH:MM:SS | INFO | PID:4179 | gsppy.gsp | Starting GSP algorithm with min_support=0.3...
|
|
395
|
+
YYYY-MM-DDTHH:MM:SS | INFO | PID:4179 | gsppy.gsp | Run 1: 6 candidates filtered to 5.
|
|
396
|
+
...
|
|
397
|
+
```
|
|
398
|
+
|
|
399
|
+
For complete logging documentation, see [docs/logging.md](docs/logging.md).
|
|
376
400
|
|
|
377
401
|
#### Example
|
|
378
402
|
|
|
@@ -469,6 +493,30 @@ result = GSP(transactions).search(min_support)
|
|
|
469
493
|
print(result)
|
|
470
494
|
```
|
|
471
495
|
|
|
496
|
+
### Verbose Mode for Debugging
|
|
497
|
+
|
|
498
|
+
Enable detailed logging to track algorithm progress and debug issues:
|
|
499
|
+
|
|
500
|
+
```python
|
|
501
|
+
from gsppy.gsp import GSP
|
|
502
|
+
|
|
503
|
+
# Enable verbose logging for the entire instance
|
|
504
|
+
gsp = GSP(transactions, verbose=True)
|
|
505
|
+
result = gsp.search(min_support=0.3)
|
|
506
|
+
|
|
507
|
+
# Or enable verbose for a specific search only
|
|
508
|
+
gsp = GSP(transactions)
|
|
509
|
+
result = gsp.search(min_support=0.3, verbose=True)
|
|
510
|
+
```
|
|
511
|
+
|
|
512
|
+
Verbose mode provides:
|
|
513
|
+
- Detailed progress information during execution
|
|
514
|
+
- Candidate generation and filtering statistics
|
|
515
|
+
- Preprocessing and validation details
|
|
516
|
+
- Useful for debugging, research, and CI/CD integration
|
|
517
|
+
|
|
518
|
+
For complete documentation on logging, see [docs/logging.md](docs/logging.md).
|
|
519
|
+
|
|
472
520
|
### Output
|
|
473
521
|
|
|
474
522
|
The algorithm will return a list of patterns with their corresponding support.
|
|
@@ -535,6 +583,128 @@ result = gsp.search(min_support=0.5) # Need at least 2/4 sequences
|
|
|
535
583
|
|
|
536
584
|
---
|
|
537
585
|
|
|
586
|
+
## ⏱️ Temporal Constraints
|
|
587
|
+
|
|
588
|
+
GSP-Py supports **time-constrained sequential pattern mining** with three powerful temporal constraints: `mingap`, `maxgap`, and `maxspan`. These constraints enable domain-specific applications such as medical event mining, retail analytics, and temporal user journey discovery.
|
|
589
|
+
|
|
590
|
+
### Temporal Constraint Parameters
|
|
591
|
+
|
|
592
|
+
- **`mingap`**: Minimum time gap required between consecutive items in a pattern
|
|
593
|
+
- **`maxgap`**: Maximum time gap allowed between consecutive items in a pattern
|
|
594
|
+
- **`maxspan`**: Maximum time span from the first to the last item in a pattern
|
|
595
|
+
|
|
596
|
+
### Using Temporal Constraints
|
|
597
|
+
|
|
598
|
+
To use temporal constraints, your transactions must include timestamps as (item, timestamp) tuples:
|
|
599
|
+
|
|
600
|
+
```python
|
|
601
|
+
from gsppy.gsp import GSP
|
|
602
|
+
|
|
603
|
+
# Transactions with timestamps (e.g., in seconds, hours, days, etc.)
|
|
604
|
+
timestamped_transactions = [
|
|
605
|
+
[("Login", 0), ("Browse", 2), ("AddToCart", 5), ("Purchase", 7)],
|
|
606
|
+
[("Login", 0), ("Browse", 1), ("AddToCart", 15), ("Purchase", 20)],
|
|
607
|
+
[("Login", 0), ("Browse", 3), ("AddToCart", 6), ("Purchase", 8)],
|
|
608
|
+
]
|
|
609
|
+
|
|
610
|
+
# Find patterns where consecutive events occur within 10 time units
|
|
611
|
+
gsp = GSP(timestamped_transactions, maxgap=10)
|
|
612
|
+
patterns = gsp.search(min_support=0.6)
|
|
613
|
+
|
|
614
|
+
# The pattern ("Browse", "AddToCart", "Purchase") will:
|
|
615
|
+
# - Be found in transaction 1: gaps are 3 and 2 (both ≤ 10) ✅
|
|
616
|
+
# - NOT be found in transaction 2: gap between Browse→AddToCart is 14 (exceeds maxgap) ❌
|
|
617
|
+
# - Be found in transaction 3: gaps are 3 and 2 (both ≤ 10) ✅
|
|
618
|
+
# Result: Support = 2/3 = 67% (above threshold of 60%)
|
|
619
|
+
```
|
|
620
|
+
|
|
621
|
+
### CLI Usage with Temporal Constraints
|
|
622
|
+
|
|
623
|
+
```bash
|
|
624
|
+
# Find patterns with maximum gap of 5 time units
|
|
625
|
+
gsppy --file temporal_data.json --min_support 0.3 --maxgap 5
|
|
626
|
+
|
|
627
|
+
# Find patterns with minimum gap of 2 time units
|
|
628
|
+
gsppy --file temporal_data.json --min_support 0.3 --mingap 2
|
|
629
|
+
|
|
630
|
+
# Find patterns that complete within 10 time units
|
|
631
|
+
gsppy --file temporal_data.json --min_support 0.3 --maxspan 10
|
|
632
|
+
|
|
633
|
+
# Combine multiple constraints
|
|
634
|
+
gsppy --file temporal_data.json --min_support 0.3 --mingap 1 --maxgap 5 --maxspan 10
|
|
635
|
+
```
|
|
636
|
+
|
|
637
|
+
### Real-World Examples
|
|
638
|
+
|
|
639
|
+
#### Medical Event Mining
|
|
640
|
+
|
|
641
|
+
```python
|
|
642
|
+
from gsppy.gsp import GSP
|
|
643
|
+
|
|
644
|
+
# Medical events with timestamps in days
|
|
645
|
+
medical_sequences = [
|
|
646
|
+
[("Symptom", 0), ("Diagnosis", 2), ("Treatment", 5), ("Recovery", 15)],
|
|
647
|
+
[("Symptom", 0), ("Diagnosis", 1), ("Treatment", 20), ("Recovery", 30)],
|
|
648
|
+
[("Symptom", 0), ("Diagnosis", 3), ("Treatment", 6), ("Recovery", 18)],
|
|
649
|
+
]
|
|
650
|
+
|
|
651
|
+
# Find patterns where treatment follows diagnosis within 10 days
|
|
652
|
+
gsp = GSP(medical_sequences, maxgap=10)
|
|
653
|
+
result = gsp.search(min_support=0.5)
|
|
654
|
+
|
|
655
|
+
# Pattern ("Diagnosis", "Treatment") found in sequences 1 & 3 only
|
|
656
|
+
# (sequence 2 has gap of 19 days, exceeding maxgap)
|
|
657
|
+
```
|
|
658
|
+
|
|
659
|
+
#### Retail Analytics
|
|
660
|
+
|
|
661
|
+
```python
|
|
662
|
+
from gsppy.gsp import GSP
|
|
663
|
+
|
|
664
|
+
# Customer purchases with timestamps in hours
|
|
665
|
+
purchase_sequences = [
|
|
666
|
+
[("Browse", 0), ("AddToCart", 0.5), ("Purchase", 1)],
|
|
667
|
+
[("Browse", 0), ("AddToCart", 1), ("Purchase", 25)], # Long delay
|
|
668
|
+
[("Browse", 0), ("AddToCart", 0.3), ("Purchase", 0.8)],
|
|
669
|
+
]
|
|
670
|
+
|
|
671
|
+
# Find purchase journeys that complete within 2 hours
|
|
672
|
+
gsp = GSP(purchase_sequences, maxspan=2)
|
|
673
|
+
result = gsp.search(min_support=0.5)
|
|
674
|
+
|
|
675
|
+
# Full sequence found in 2 out of 3 transactions
|
|
676
|
+
# (sequence 2 has span of 25 hours, exceeding maxspan)
|
|
677
|
+
```
|
|
678
|
+
|
|
679
|
+
#### User Journey Discovery
|
|
680
|
+
|
|
681
|
+
```python
|
|
682
|
+
from gsppy.gsp import GSP
|
|
683
|
+
|
|
684
|
+
# Website navigation with timestamps in seconds
|
|
685
|
+
navigation_sequences = [
|
|
686
|
+
[("Home", 0), ("Search", 5), ("Product", 10), ("Checkout", 15)],
|
|
687
|
+
[("Home", 0), ("Search", 3), ("Product", 8), ("Checkout", 180)],
|
|
688
|
+
[("Home", 0), ("Search", 4), ("Product", 9), ("Checkout", 14)],
|
|
689
|
+
]
|
|
690
|
+
|
|
691
|
+
# Find navigation patterns with:
|
|
692
|
+
# - Minimum 2 seconds between steps (mingap)
|
|
693
|
+
# - Maximum 20 seconds between steps (maxgap)
|
|
694
|
+
# - Complete within 30 seconds total (maxspan)
|
|
695
|
+
gsp = GSP(navigation_sequences, mingap=2, maxgap=20, maxspan=30)
|
|
696
|
+
result = gsp.search(min_support=0.5)
|
|
697
|
+
```
|
|
698
|
+
|
|
699
|
+
### Important Notes
|
|
700
|
+
|
|
701
|
+
- Temporal constraints require timestamped transactions (item-timestamp tuples)
|
|
702
|
+
- If temporal constraints are specified but transactions don't have timestamps, a warning is logged and constraints are ignored
|
|
703
|
+
- When using temporal constraints, the Python backend is automatically used (accelerated backends don't yet support temporal constraints)
|
|
704
|
+
- Timestamps can be in any unit (seconds, minutes, hours, days) as long as they're consistent within your dataset
|
|
705
|
+
|
|
706
|
+
---
|
|
707
|
+
|
|
538
708
|
## ⌨️ Typing
|
|
539
709
|
|
|
540
710
|
`gsppy` ships inline type information (PEP 561) via a bundled `py.typed` marker. The public API is re-exported from
|
|
@@ -554,11 +724,6 @@ We are actively working to improve GSP-Py. Here are some exciting features plann
|
|
|
554
724
|
2. **Support for Preprocessing and Postprocessing**:
|
|
555
725
|
- Add hooks to allow users to transform datasets before mining and customize the output results.
|
|
556
726
|
|
|
557
|
-
3. **Support for Time-Constrained Pattern Mining**:
|
|
558
|
-
- Extend GSP-Py to handle temporal datasets by allowing users to define time constraints (e.g., maximum time gaps
|
|
559
|
-
between events, time windows) during the sequence mining process.
|
|
560
|
-
- Enable candidate pruning and support calculations based on these temporal constraints.
|
|
561
|
-
|
|
562
727
|
Want to contribute or suggest an
|
|
563
728
|
improvement? [Open a discussion or issue!](https://github.com/jacksonpradolima/gsp-py/issues)
|
|
564
729
|
|
|
@@ -583,16 +748,34 @@ uv run ruff check .
|
|
|
583
748
|
uv run pyright
|
|
584
749
|
```
|
|
585
750
|
|
|
751
|
+
### Testing & Fuzzing
|
|
752
|
+
|
|
753
|
+
GSP-Py includes comprehensive test coverage, including property-based fuzzing tests using [Hypothesis](https://hypothesis.readthedocs.io/). These fuzzing tests automatically generate random inputs to verify algorithm invariants and discover edge cases. Run the fuzzing tests with:
|
|
754
|
+
|
|
755
|
+
```bash
|
|
756
|
+
uv run pytest tests/test_gsp_fuzzing.py -v
|
|
757
|
+
```
|
|
758
|
+
|
|
586
759
|
### General Steps:
|
|
587
760
|
|
|
588
761
|
1. Fork the repository.
|
|
589
762
|
2. Create a feature branch: `git checkout -b feature/my-feature`.
|
|
590
|
-
3. Commit your changes: `git commit -m "
|
|
763
|
+
3. Commit your changes using [Conventional Commits](https://www.conventionalcommits.org/) format: `git commit -m "feat: add my feature"`.
|
|
591
764
|
4. Push to your branch: `git push origin feature/my-feature`.
|
|
592
765
|
5. Submit a pull request to the main repository!
|
|
593
766
|
|
|
594
767
|
Looking for ideas? Check out our [Planned Features](#planned-features) section.
|
|
595
768
|
|
|
769
|
+
### Release Management
|
|
770
|
+
|
|
771
|
+
GSP-Py uses automated release management with [Conventional Commits](https://www.conventionalcommits.org/). When commits are merged to `main`:
|
|
772
|
+
- **Releases are triggered** by: `fix:` (patch), `feat:` (minor), `perf:` (patch), or `BREAKING CHANGE:` (major)
|
|
773
|
+
- **No release** for: `docs:`, `style:`, `refactor:`, `test:`, `build:`, `ci:`, `chore:`
|
|
774
|
+
- CHANGELOG.md is automatically updated with structured release notes
|
|
775
|
+
- Git tags and GitHub releases are created automatically
|
|
776
|
+
|
|
777
|
+
See [Release Management Guide](docs/RELEASE_MANAGEMENT.md) for details on commit message format and release process.
|
|
778
|
+
|
|
596
779
|
---
|
|
597
780
|
|
|
598
781
|
## 📝 License
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
gsppy/__init__.py,sha256=NMVa-ZWT449wuxZMF9Ym7p-DChOxOibaaqlpPxksfuo,805
|
|
2
|
+
gsppy/accelerate.py,sha256=rDho3ysADETpuhT2SF9voBjd3XRaQUzuA5k_baNACF8,11020
|
|
3
|
+
gsppy/cli.py,sha256=-viXa8VFIF-QvrHYy1vtDxtMm50sM_tZq5B5DMZ1Jtw,12516
|
|
4
|
+
gsppy/gsp.py,sha256=k72pvdmD6jU4AId2rrHQrJ4FBUgtkuC0ntEY8QHGi5c,24486
|
|
5
|
+
gsppy/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
|
+
gsppy/utils.py,sha256=dAEq1hEZMN0ZjoocKs_ZIgOI9j_Y6rJEAKneul3zNRo,13501
|
|
7
|
+
gsppy-3.5.0.dist-info/METADATA,sha256=ix2X_VEUTved_DaTsSJMERT-CZ34TUYF0XMC2KeNeuE,29747
|
|
8
|
+
gsppy-3.5.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
9
|
+
gsppy-3.5.0.dist-info/entry_points.txt,sha256=smvmcIWk424ARIGKOC_BM42hpT_SptKPcIeqs-8u8lM,41
|
|
10
|
+
gsppy-3.5.0.dist-info/licenses/LICENSE,sha256=AlXanKSqFzo_o-87gp3Qw3XzbmnfxYy7O0xJOcQGWJo,1086
|
|
11
|
+
gsppy-3.5.0.dist-info/RECORD,,
|
gsppy-3.3.0.dist-info/RECORD
DELETED
|
@@ -1,11 +0,0 @@
|
|
|
1
|
-
gsppy/__init__.py,sha256=FcWEYkzMCiqIBmc4yhgIXFKzvSNjJA7LX7juUabvoJ4,784
|
|
2
|
-
gsppy/accelerate.py,sha256=2I3IA42FyPZvfwc0-f0bovZ8YgbdvJXj0qDlYWSWiXI,10998
|
|
3
|
-
gsppy/cli.py,sha256=W5udAPKOjlxi-c-RKcz5HW-sDgoap4ojHD87bd-X498,6583
|
|
4
|
-
gsppy/gsp.py,sha256=aCtPrldVNCkwj6wwytrZzbayYKkXi9Om-3xzrHUMkLQ,15293
|
|
5
|
-
gsppy/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
|
-
gsppy/utils.py,sha256=KtjfDgsTwvwxIyA2KCQmgu8cFkBqQvMZN8Ct5NB60Tc,3952
|
|
7
|
-
gsppy-3.3.0.dist-info/METADATA,sha256=VQtJqYCs9I4HnO5EpEeI9SijBxxgaNir_mw1HMmWKlw,22727
|
|
8
|
-
gsppy-3.3.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
9
|
-
gsppy-3.3.0.dist-info/entry_points.txt,sha256=smvmcIWk424ARIGKOC_BM42hpT_SptKPcIeqs-8u8lM,41
|
|
10
|
-
gsppy-3.3.0.dist-info/licenses/LICENSE,sha256=AlXanKSqFzo_o-87gp3Qw3XzbmnfxYy7O0xJOcQGWJo,1086
|
|
11
|
-
gsppy-3.3.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|