gsppy 3.6.0__py3-none-any.whl → 4.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gsppy/__init__.py +37 -2
- gsppy/cli.py +314 -11
- gsppy/dataframe_adapters.py +458 -0
- gsppy/enums.py +49 -0
- gsppy/gsp.py +150 -9
- gsppy/token_mapper.py +99 -0
- gsppy/utils.py +120 -0
- {gsppy-3.6.0.dist-info → gsppy-4.0.0.dist-info}/METADATA +329 -9
- gsppy-4.0.0.dist-info/RECORD +15 -0
- gsppy-3.6.0.dist-info/RECORD +0 -12
- {gsppy-3.6.0.dist-info → gsppy-4.0.0.dist-info}/WHEEL +0 -0
- {gsppy-3.6.0.dist-info → gsppy-4.0.0.dist-info}/entry_points.txt +0 -0
- {gsppy-3.6.0.dist-info → gsppy-4.0.0.dist-info}/licenses/LICENSE +0 -0
gsppy/__init__.py
CHANGED
|
@@ -10,17 +10,37 @@ from gsppy.cli import (
|
|
|
10
10
|
setup_logging,
|
|
11
11
|
detect_and_read_file,
|
|
12
12
|
read_transactions_from_csv,
|
|
13
|
+
read_transactions_from_spm,
|
|
13
14
|
read_transactions_from_json,
|
|
15
|
+
read_transactions_from_arrow,
|
|
16
|
+
read_transactions_from_parquet,
|
|
14
17
|
)
|
|
15
18
|
from gsppy.gsp import GSP
|
|
16
19
|
from gsppy.pruning import (
|
|
20
|
+
CombinedPruning,
|
|
17
21
|
PruningStrategy,
|
|
18
22
|
SupportBasedPruning,
|
|
19
|
-
FrequencyBasedPruning,
|
|
20
23
|
TemporalAwarePruning,
|
|
21
|
-
|
|
24
|
+
FrequencyBasedPruning,
|
|
22
25
|
create_default_pruning_strategy,
|
|
23
26
|
)
|
|
27
|
+
from gsppy.token_mapper import TokenMapper
|
|
28
|
+
|
|
29
|
+
# DataFrame adapters are optional - import only if dependencies are available
|
|
30
|
+
try:
|
|
31
|
+
from gsppy.dataframe_adapters import (
|
|
32
|
+
DataFrameAdapterError,
|
|
33
|
+
pandas_to_transactions,
|
|
34
|
+
polars_to_transactions,
|
|
35
|
+
dataframe_to_transactions,
|
|
36
|
+
)
|
|
37
|
+
except ImportError:
|
|
38
|
+
DataFrameAdapterError = None # type: ignore
|
|
39
|
+
pandas_to_transactions = None # type: ignore
|
|
40
|
+
polars_to_transactions = None # type: ignore
|
|
41
|
+
dataframe_to_transactions = None # type: ignore
|
|
42
|
+
|
|
43
|
+
_DATAFRAME_AVAILABLE = DataFrameAdapterError is not None
|
|
24
44
|
|
|
25
45
|
try:
|
|
26
46
|
__version__ = importlib_metadata.version("gsppy")
|
|
@@ -32,6 +52,9 @@ __all__ = [
|
|
|
32
52
|
"detect_and_read_file",
|
|
33
53
|
"read_transactions_from_csv",
|
|
34
54
|
"read_transactions_from_json",
|
|
55
|
+
"read_transactions_from_parquet",
|
|
56
|
+
"read_transactions_from_arrow",
|
|
57
|
+
"read_transactions_from_spm",
|
|
35
58
|
"setup_logging",
|
|
36
59
|
"__version__",
|
|
37
60
|
"PruningStrategy",
|
|
@@ -40,4 +63,16 @@ __all__ = [
|
|
|
40
63
|
"TemporalAwarePruning",
|
|
41
64
|
"CombinedPruning",
|
|
42
65
|
"create_default_pruning_strategy",
|
|
66
|
+
"TokenMapper",
|
|
43
67
|
]
|
|
68
|
+
|
|
69
|
+
# Add DataFrame adapters to __all__ if available
|
|
70
|
+
if _DATAFRAME_AVAILABLE:
|
|
71
|
+
__all__.extend(
|
|
72
|
+
[
|
|
73
|
+
"dataframe_to_transactions",
|
|
74
|
+
"polars_to_transactions",
|
|
75
|
+
"pandas_to_transactions",
|
|
76
|
+
"DataFrameAdapterError",
|
|
77
|
+
]
|
|
78
|
+
)
|
gsppy/cli.py
CHANGED
|
@@ -28,6 +28,8 @@ This CLI empowers users to perform sequential pattern mining on transactional da
|
|
|
28
28
|
a simple command-line interface.
|
|
29
29
|
"""
|
|
30
30
|
|
|
31
|
+
from __future__ import annotations
|
|
32
|
+
|
|
31
33
|
import os
|
|
32
34
|
import csv
|
|
33
35
|
import sys
|
|
@@ -38,21 +40,29 @@ from typing import Any, Dict, List, Tuple, Union, Optional, cast
|
|
|
38
40
|
import click
|
|
39
41
|
|
|
40
42
|
from gsppy.gsp import GSP
|
|
43
|
+
from gsppy.enums import (
|
|
44
|
+
ARROW_EXTENSIONS,
|
|
45
|
+
PARQUET_EXTENSIONS,
|
|
46
|
+
DATAFRAME_EXTENSIONS,
|
|
47
|
+
SUPPORTED_EXTENSIONS_MESSAGE,
|
|
48
|
+
FileFormat,
|
|
49
|
+
FileExtension,
|
|
50
|
+
)
|
|
41
51
|
from gsppy.utils import has_timestamps
|
|
42
52
|
|
|
43
53
|
|
|
44
54
|
def setup_logging(verbose: bool) -> None:
|
|
45
55
|
"""
|
|
46
56
|
Configure logging with standardized format based on verbosity level.
|
|
47
|
-
|
|
57
|
+
|
|
48
58
|
When verbose is enabled, provides detailed structured logging with:
|
|
49
59
|
- Timestamps (ISO 8601 format)
|
|
50
60
|
- Log levels
|
|
51
61
|
- Process ID for traceability
|
|
52
62
|
- Module context
|
|
53
|
-
|
|
63
|
+
|
|
54
64
|
When verbose is disabled, uses simple format with just the message.
|
|
55
|
-
|
|
65
|
+
|
|
56
66
|
Parameters:
|
|
57
67
|
verbose: Whether to enable verbose logging with detailed formatting.
|
|
58
68
|
"""
|
|
@@ -60,7 +70,7 @@ def setup_logging(verbose: bool) -> None:
|
|
|
60
70
|
root_logger = logging.getLogger()
|
|
61
71
|
for handler in root_logger.handlers[:]:
|
|
62
72
|
root_logger.removeHandler(handler)
|
|
63
|
-
|
|
73
|
+
|
|
64
74
|
if verbose:
|
|
65
75
|
# Detailed format with timestamps, levels, PID, and context for verbose mode
|
|
66
76
|
log_format = "%(asctime)s | %(levelname)-8s | PID:%(process)d | %(name)s | %(message)s"
|
|
@@ -71,7 +81,7 @@ def setup_logging(verbose: bool) -> None:
|
|
|
71
81
|
log_format = "%(message)s"
|
|
72
82
|
date_format = None
|
|
73
83
|
log_level = logging.INFO
|
|
74
|
-
|
|
84
|
+
|
|
75
85
|
# Configure logging with the appropriate format
|
|
76
86
|
logging.basicConfig(
|
|
77
87
|
level=log_level,
|
|
@@ -186,9 +196,39 @@ def read_transactions_from_csv(file_path: str) -> List[List[str]]:
|
|
|
186
196
|
raise ValueError(msg) from e
|
|
187
197
|
|
|
188
198
|
|
|
199
|
+
def read_transactions_from_spm(file_path: str) -> List[List[str]]:
    """
    Load transactions stored in the SPM/GSP delimiter format.

    In this format `-1` closes an element (item set) and `-2` closes a
    sequence (transaction).

    Parameters:
        file_path (str): Location of the SPM/GSP file to parse.

    Returns:
        List[List[str]]: The transactions parsed from the file.

    Raises:
        ValueError: If importing the parser or reading the file fails for any
            reason; the underlying exception is chained as the cause.
    """
    try:
        # Imported inside the try block so that an import failure is also
        # logged and re-raised as ValueError, like any parsing failure.
        from gsppy.utils import read_transactions_from_spm as _parse_spm

        parsed = _parse_spm(file_path, return_mappings=False)
        return cast(List[List[str]], parsed)
    except Exception as exc:
        message = f"Error reading transaction data from SPM file '{file_path}': {exc}"
        logging.error(message)
        raise ValueError(message) from exc
|
|
224
|
+
|
|
225
|
+
|
|
189
226
|
def detect_and_read_file(file_path: str) -> Union[List[List[str]], List[List[Tuple[str, float]]]]:
|
|
190
227
|
"""
|
|
191
|
-
Detect file format (CSV
|
|
228
|
+
Detect file format (CSV, JSON, Parquet, Arrow) and read transactions.
|
|
229
|
+
|
|
230
|
+
Supports traditional formats (CSV, JSON) and modern DataFrame formats (Parquet, Arrow).
|
|
231
|
+
For DataFrame formats, requires 'gsppy[dataframe]' to be installed.
|
|
192
232
|
|
|
193
233
|
Parameters:
|
|
194
234
|
file_path (str): Path to the file containing transactions.
|
|
@@ -206,13 +246,200 @@ def detect_and_read_file(file_path: str) -> Union[List[List[str]], List[List[Tup
|
|
|
206
246
|
_, file_extension = os.path.splitext(file_path)
|
|
207
247
|
file_extension = file_extension.lower()
|
|
208
248
|
|
|
209
|
-
if file_extension ==
|
|
249
|
+
if file_extension == FileExtension.JSON.value:
|
|
210
250
|
return read_transactions_from_json(file_path)
|
|
211
251
|
|
|
212
|
-
if file_extension ==
|
|
252
|
+
if file_extension == FileExtension.CSV.value:
|
|
213
253
|
return read_transactions_from_csv(file_path)
|
|
214
254
|
|
|
215
|
-
|
|
255
|
+
if file_extension in PARQUET_EXTENSIONS:
|
|
256
|
+
return read_transactions_from_parquet(file_path)
|
|
257
|
+
|
|
258
|
+
if file_extension in ARROW_EXTENSIONS:
|
|
259
|
+
return read_transactions_from_arrow(file_path)
|
|
260
|
+
|
|
261
|
+
raise ValueError(SUPPORTED_EXTENSIONS_MESSAGE.format(extension=file_extension))
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def read_transactions_from_parquet(
    file_path: str,
    transaction_col: Optional[str] = None,
    item_col: Optional[str] = None,
    timestamp_col: Optional[str] = None,
    sequence_col: Optional[str] = None,
) -> Union[List[List[str]], List[List[Tuple[str, float]]]]:
    """
    Load transactions from a Parquet file via Polars.

    The file is read with `polars.read_parquet` and the resulting frame is
    handed to `polars_to_transactions` together with the column names.

    Parameters:
        file_path (str): Location of the Parquet file.
        transaction_col (Optional[str]): Name of the transaction ID column (grouped format).
        item_col (Optional[str]): Name of the item column (grouped format).
        timestamp_col (Optional[str]): Name of the timestamp column.
        sequence_col (Optional[str]): Name of the column holding sequences (sequence format).

    Returns:
        Union[List[List[str]], List[List[Tuple[str, float]]]]:
            The transactions extracted from the file.

    Raises:
        ValueError: If Polars (or the DataFrame adapter module) cannot be
            imported, or if reading/converting the file fails.
    """
    # Optional dependency: surface a missing install as an actionable ValueError.
    try:
        import polars as pl

        from gsppy.dataframe_adapters import polars_to_transactions
    except ImportError as exc:
        raise ValueError("Parquet support requires Polars. Install with: pip install 'gsppy[dataframe]'") from exc

    column_options: Dict[str, Optional[str]] = {
        "transaction_col": transaction_col,
        "item_col": item_col,
        "timestamp_col": timestamp_col,
        "sequence_col": sequence_col,
    }

    try:
        frame: Any = pl.read_parquet(file_path)
        return polars_to_transactions(frame, **column_options)
    except Exception as exc:
        message = f"Error reading transaction data from Parquet file '{file_path}': {exc}"
        logging.error(message)
        raise ValueError(message) from exc
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
def read_transactions_from_arrow(
    file_path: str,
    transaction_col: Optional[str] = None,
    item_col: Optional[str] = None,
    timestamp_col: Optional[str] = None,
    sequence_col: Optional[str] = None,
) -> Union[List[List[str]], List[List[Tuple[str, float]]]]:
    """
    Load transactions from an Arrow/Feather file via Polars.

    The file is read with `polars.read_ipc` and the resulting frame is
    handed to `polars_to_transactions` together with the column names.

    Parameters:
        file_path (str): Location of the Arrow/Feather file.
        transaction_col (Optional[str]): Name of the transaction ID column (grouped format).
        item_col (Optional[str]): Name of the item column (grouped format).
        timestamp_col (Optional[str]): Name of the timestamp column.
        sequence_col (Optional[str]): Name of the column holding sequences (sequence format).

    Returns:
        Union[List[List[str]], List[List[Tuple[str, float]]]]:
            The transactions extracted from the file.

    Raises:
        ValueError: If Polars (or the DataFrame adapter module) cannot be
            imported, or if reading/converting the file fails.
    """
    # Optional dependency: surface a missing install as an actionable ValueError.
    try:
        import polars as pl

        from gsppy.dataframe_adapters import polars_to_transactions
    except ImportError as exc:
        raise ValueError("Arrow/Feather support requires Polars. Install with: pip install 'gsppy[dataframe]'") from exc

    column_options: Dict[str, Optional[str]] = {
        "transaction_col": transaction_col,
        "item_col": item_col,
        "timestamp_col": timestamp_col,
        "sequence_col": sequence_col,
    }

    try:
        frame: Any = pl.read_ipc(file_path)
        return polars_to_transactions(frame, **column_options)
    except Exception as exc:
        message = f"Error reading transaction data from Arrow file '{file_path}': {exc}"
        logging.error(message)
        raise ValueError(message) from exc
|
|
354
|
+
|
|
355
|
+
|
|
356
|
+
def _load_dataframe_format(
    file_path: str,
    file_extension: str,
    transaction_col: Optional[str],
    item_col: Optional[str],
    timestamp_col: Optional[str],
    sequence_col: Optional[str],
) -> Union[List[List[str]], List[List[Tuple[str, float]]]]:
    """
    Read a DataFrame-format file (Parquet or Arrow/Feather), choosing the reader by extension.

    Parameters:
        file_path: Path to the file
        file_extension: File extension (lowercase)
        transaction_col: Transaction ID column name
        item_col: Item column name
        timestamp_col: Timestamp column name
        sequence_col: Sequence column name

    Returns:
        Loaded transactions
    """
    # Any extension that is not a Parquet extension is treated as Arrow/Feather.
    reader = (
        read_transactions_from_parquet
        if file_extension in PARQUET_EXTENSIONS
        else read_transactions_from_arrow
    )
    return reader(
        file_path,
        transaction_col=transaction_col,
        item_col=item_col,
        timestamp_col=timestamp_col,
        sequence_col=sequence_col,
    )
|
|
394
|
+
|
|
395
|
+
|
|
396
|
+
def _load_transactions_by_format(
    file_path: str,
    file_format: str,
    file_extension: str,
    is_dataframe_format: bool,
    transaction_col: Optional[str],
    item_col: Optional[str],
    timestamp_col: Optional[str],
    sequence_col: Optional[str],
) -> Union[List[List[str]], List[List[Tuple[str, float]]]]:
    """
    Load transactions based on specified format.

    An explicitly requested format always selects the matching reader,
    regardless of the file's extension. The extension is only consulted
    when the format is 'auto'.

    Parameters:
        file_path: Path to the file
        file_format: Format string (lowercase)
        file_extension: File extension (lowercase); used only for 'auto'
        is_dataframe_format: Whether the extension is a DataFrame format; used only for 'auto'
        transaction_col: Transaction ID column name
        item_col: Item column name
        timestamp_col: Timestamp column name
        sequence_col: Sequence column name

    Returns:
        Loaded transactions

    Raises:
        ValueError: If format is unknown
    """
    if file_format == FileFormat.SPM.value:
        return read_transactions_from_spm(file_path)
    if file_format == FileFormat.JSON.value:
        return read_transactions_from_json(file_path)
    if file_format == FileFormat.CSV.value:
        return read_transactions_from_csv(file_path)

    # Explicit DataFrame formats go straight to their reader. Routing them
    # through the extension-based helper would ignore the user's choice, e.g.
    # '--format parquet' on a file without a Parquet extension would be read
    # as Arrow.
    if file_format == FileFormat.PARQUET.value:
        return read_transactions_from_parquet(
            file_path,
            transaction_col=transaction_col,
            item_col=item_col,
            timestamp_col=timestamp_col,
            sequence_col=sequence_col,
        )
    if file_format == FileFormat.ARROW.value:
        return read_transactions_from_arrow(
            file_path,
            transaction_col=transaction_col,
            item_col=item_col,
            timestamp_col=timestamp_col,
            sequence_col=sequence_col,
        )

    if file_format == FileFormat.AUTO.value:
        # Auto-detect format from the file extension.
        if is_dataframe_format:
            return _load_dataframe_format(
                file_path, file_extension, transaction_col, item_col, timestamp_col, sequence_col
            )
        return detect_and_read_file(file_path)

    raise ValueError(f"Unknown format: {file_format}")
|
|
216
443
|
|
|
217
444
|
|
|
218
445
|
# Click-based CLI
|
|
@@ -222,7 +449,7 @@ def detect_and_read_file(file_path: str) -> Union[List[List[str]], List[List[Tup
|
|
|
222
449
|
"file_path",
|
|
223
450
|
required=True,
|
|
224
451
|
type=click.Path(exists=True),
|
|
225
|
-
help="Path to a JSON
|
|
452
|
+
help="Path to a transaction file (JSON, CSV, SPM, Parquet, or Arrow format).",
|
|
226
453
|
)
|
|
227
454
|
@click.option(
|
|
228
455
|
"--min_support",
|
|
@@ -256,6 +483,37 @@ def detect_and_read_file(file_path: str) -> Union[List[List[str]], List[List[Tup
|
|
|
256
483
|
default=None,
|
|
257
484
|
help="Maximum time span from first to last item in patterns (requires timestamped transactions).",
|
|
258
485
|
)
|
|
486
|
+
@click.option(
|
|
487
|
+
"--transaction-col",
|
|
488
|
+
type=str,
|
|
489
|
+
default=None,
|
|
490
|
+
help="DataFrame: column name for transaction IDs (grouped format).",
|
|
491
|
+
)
|
|
492
|
+
@click.option(
|
|
493
|
+
"--item-col",
|
|
494
|
+
type=str,
|
|
495
|
+
default=None,
|
|
496
|
+
help="DataFrame: column name for items (grouped format).",
|
|
497
|
+
)
|
|
498
|
+
@click.option(
|
|
499
|
+
"--timestamp-col",
|
|
500
|
+
type=str,
|
|
501
|
+
default=None,
|
|
502
|
+
help="DataFrame: column name for timestamps.",
|
|
503
|
+
)
|
|
504
|
+
@click.option(
|
|
505
|
+
"--sequence-col",
|
|
506
|
+
type=str,
|
|
507
|
+
default=None,
|
|
508
|
+
help="DataFrame: column name containing sequences (sequence format).",
|
|
509
|
+
)
|
|
510
|
+
@click.option(
|
|
511
|
+
"--format",
|
|
512
|
+
type=click.Choice([fmt.value for fmt in FileFormat], case_sensitive=False),
|
|
513
|
+
default=FileFormat.AUTO.value,
|
|
514
|
+
show_default=True,
|
|
515
|
+
help="File format to use. 'auto' detects format from file extension.",
|
|
516
|
+
)
|
|
259
517
|
@click.option("--verbose", is_flag=True, help="Enable verbose output for debugging purposes.")
|
|
260
518
|
def main(
|
|
261
519
|
file_path: str,
|
|
@@ -264,11 +522,21 @@ def main(
|
|
|
264
522
|
mingap: Optional[float],
|
|
265
523
|
maxgap: Optional[float],
|
|
266
524
|
maxspan: Optional[float],
|
|
525
|
+
transaction_col: Optional[str],
|
|
526
|
+
item_col: Optional[str],
|
|
527
|
+
timestamp_col: Optional[str],
|
|
528
|
+
sequence_col: Optional[str],
|
|
529
|
+
format: str, # noqa: A002
|
|
267
530
|
verbose: bool,
|
|
268
531
|
) -> None:
|
|
269
532
|
"""
|
|
270
533
|
Run the GSP algorithm on transactional data from a file.
|
|
271
534
|
|
|
535
|
+
Supports multiple file formats:
|
|
536
|
+
- JSON/CSV/SPM: Traditional transaction formats
|
|
537
|
+
- Parquet/Arrow: Modern DataFrame formats (requires 'gsppy[dataframe]')
|
|
538
|
+
- Polars/Pandas DataFrames: Can be passed directly (requires 'gsppy[dataframe]')
|
|
539
|
+
|
|
272
540
|
Supports both simple transactions (items only) and timestamped transactions
|
|
273
541
|
(item-timestamp pairs) for temporal pattern mining.
|
|
274
542
|
|
|
@@ -285,12 +553,47 @@ def main(
|
|
|
285
553
|
gsppy --file temporal_data.json --min_support 0.3 --maxgap 10
|
|
286
554
|
gsppy --file events.json --min_support 0.5 --mingap 2 --maxgap 10 --maxspan 20
|
|
287
555
|
```
|
|
556
|
+
|
|
557
|
+
With Parquet files (grouped format):
|
|
558
|
+
|
|
559
|
+
```bash
|
|
560
|
+
gsppy --file data.parquet --min_support 0.3 \
|
|
561
|
+
--transaction-col txn_id --item-col product
|
|
562
|
+
```
|
|
563
|
+
|
|
564
|
+
With Arrow files (sequence format):
|
|
565
|
+
|
|
566
|
+
```bash
|
|
567
|
+
gsppy --file sequences.arrow --min_support 0.3 \
|
|
568
|
+
--sequence-col items
|
|
569
|
+
```
|
|
570
|
+
|
|
571
|
+
With SPM format files:
|
|
572
|
+
|
|
573
|
+
```bash
|
|
574
|
+
gsppy --file data.txt --format spm --min_support 0.3
|
|
575
|
+
```
|
|
288
576
|
"""
|
|
289
577
|
setup_logging(verbose)
|
|
290
578
|
|
|
579
|
+
# Detect file extension to determine if DataFrame column params are needed
|
|
580
|
+
_, file_extension = os.path.splitext(file_path)
|
|
581
|
+
file_extension = file_extension.lower()
|
|
582
|
+
is_dataframe_format = file_extension in DATAFRAME_EXTENSIONS
|
|
583
|
+
|
|
291
584
|
# Automatically detect and load transactions
|
|
292
585
|
try:
|
|
293
|
-
|
|
586
|
+
file_format = format.lower()
|
|
587
|
+
transactions = _load_transactions_by_format(
|
|
588
|
+
file_path,
|
|
589
|
+
file_format,
|
|
590
|
+
file_extension,
|
|
591
|
+
is_dataframe_format,
|
|
592
|
+
transaction_col,
|
|
593
|
+
item_col,
|
|
594
|
+
timestamp_col,
|
|
595
|
+
sequence_col,
|
|
596
|
+
)
|
|
294
597
|
except ValueError as e:
|
|
295
598
|
logger.error(f"Error: {e}")
|
|
296
599
|
sys.exit(1)
|