gsppy 3.6.0__py3-none-any.whl → 4.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gsppy/__init__.py CHANGED
@@ -10,17 +10,37 @@ from gsppy.cli import (
10
10
  setup_logging,
11
11
  detect_and_read_file,
12
12
  read_transactions_from_csv,
13
+ read_transactions_from_spm,
13
14
  read_transactions_from_json,
15
+ read_transactions_from_arrow,
16
+ read_transactions_from_parquet,
14
17
  )
15
18
  from gsppy.gsp import GSP
16
19
  from gsppy.pruning import (
20
+ CombinedPruning,
17
21
  PruningStrategy,
18
22
  SupportBasedPruning,
19
- FrequencyBasedPruning,
20
23
  TemporalAwarePruning,
21
- CombinedPruning,
24
+ FrequencyBasedPruning,
22
25
  create_default_pruning_strategy,
23
26
  )
27
+ from gsppy.token_mapper import TokenMapper
28
+
29
+ # DataFrame adapters are optional - import only if dependencies are available
30
+ try:
31
+ from gsppy.dataframe_adapters import (
32
+ DataFrameAdapterError,
33
+ pandas_to_transactions,
34
+ polars_to_transactions,
35
+ dataframe_to_transactions,
36
+ )
37
+ except ImportError:
38
+ DataFrameAdapterError = None # type: ignore
39
+ pandas_to_transactions = None # type: ignore
40
+ polars_to_transactions = None # type: ignore
41
+ dataframe_to_transactions = None # type: ignore
42
+
43
+ _DATAFRAME_AVAILABLE = DataFrameAdapterError is not None
24
44
 
25
45
  try:
26
46
  __version__ = importlib_metadata.version("gsppy")
@@ -32,6 +52,9 @@ __all__ = [
32
52
  "detect_and_read_file",
33
53
  "read_transactions_from_csv",
34
54
  "read_transactions_from_json",
55
+ "read_transactions_from_parquet",
56
+ "read_transactions_from_arrow",
57
+ "read_transactions_from_spm",
35
58
  "setup_logging",
36
59
  "__version__",
37
60
  "PruningStrategy",
@@ -40,4 +63,16 @@ __all__ = [
40
63
  "TemporalAwarePruning",
41
64
  "CombinedPruning",
42
65
  "create_default_pruning_strategy",
66
+ "TokenMapper",
43
67
  ]
68
+
69
+ # Add DataFrame adapters to __all__ if available
70
if _DATAFRAME_AVAILABLE:
    # The DataFrame adapter names are exported only when the availability flag is set.
    __all__ += [
        "dataframe_to_transactions",
        "polars_to_transactions",
        "pandas_to_transactions",
        "DataFrameAdapterError",
    ]
gsppy/cli.py CHANGED
@@ -28,6 +28,8 @@ This CLI empowers users to perform sequential pattern mining on transactional da
28
28
  a simple command-line interface.
29
29
  """
30
30
 
31
+ from __future__ import annotations
32
+
31
33
  import os
32
34
  import csv
33
35
  import sys
@@ -38,21 +40,29 @@ from typing import Any, Dict, List, Tuple, Union, Optional, cast
38
40
  import click
39
41
 
40
42
  from gsppy.gsp import GSP
43
+ from gsppy.enums import (
44
+ ARROW_EXTENSIONS,
45
+ PARQUET_EXTENSIONS,
46
+ DATAFRAME_EXTENSIONS,
47
+ SUPPORTED_EXTENSIONS_MESSAGE,
48
+ FileFormat,
49
+ FileExtension,
50
+ )
41
51
  from gsppy.utils import has_timestamps
42
52
 
43
53
 
44
54
  def setup_logging(verbose: bool) -> None:
45
55
  """
46
56
  Configure logging with standardized format based on verbosity level.
47
-
57
+
48
58
  When verbose is enabled, provides detailed structured logging with:
49
59
  - Timestamps (ISO 8601 format)
50
60
  - Log levels
51
61
  - Process ID for traceability
52
62
  - Module context
53
-
63
+
54
64
  When verbose is disabled, uses simple format with just the message.
55
-
65
+
56
66
  Parameters:
57
67
  verbose: Whether to enable verbose logging with detailed formatting.
58
68
  """
@@ -60,7 +70,7 @@ def setup_logging(verbose: bool) -> None:
60
70
  root_logger = logging.getLogger()
61
71
  for handler in root_logger.handlers[:]:
62
72
  root_logger.removeHandler(handler)
63
-
73
+
64
74
  if verbose:
65
75
  # Detailed format with timestamps, levels, PID, and context for verbose mode
66
76
  log_format = "%(asctime)s | %(levelname)-8s | PID:%(process)d | %(name)s | %(message)s"
@@ -71,7 +81,7 @@ def setup_logging(verbose: bool) -> None:
71
81
  log_format = "%(message)s"
72
82
  date_format = None
73
83
  log_level = logging.INFO
74
-
84
+
75
85
  # Configure logging with the appropriate format
76
86
  logging.basicConfig(
77
87
  level=log_level,
@@ -186,9 +196,39 @@ def read_transactions_from_csv(file_path: str) -> List[List[str]]:
186
196
  raise ValueError(msg) from e
187
197
 
188
198
 
199
def read_transactions_from_spm(file_path: str) -> List[List[str]]:
    """
    Load transactions stored in the SPM/GSP delimiter format.

    The SPM/GSP format uses delimiters:
    - `-1`: End of element (item set)
    - `-2`: End of sequence (transaction)

    Parsing is delegated to ``gsppy.utils.read_transactions_from_spm`` with
    ``return_mappings=False``.

    Parameters:
        file_path (str): Path to the file containing transactions.

    Returns:
        List[List[str]]: Parsed transactions from the file.

    Raises:
        ValueError: If the parser cannot be imported or the file cannot be
            read/parsed. The underlying exception is chained as the cause.
    """
    try:
        # Lazy import, aliased so the helper is distinguishable from this wrapper.
        from gsppy.utils import read_transactions_from_spm as _parse_spm

        parsed = _parse_spm(file_path, return_mappings=False)
    except Exception as exc:
        msg = f"Error reading transaction data from SPM file '{file_path}': {exc}"
        logging.error(msg)
        raise ValueError(msg) from exc

    return cast(List[List[str]], parsed)
224
+
225
+
189
226
  def detect_and_read_file(file_path: str) -> Union[List[List[str]], List[List[Tuple[str, float]]]]:
190
227
  """
191
- Detect file format (CSV or JSON) and read transactions.
228
+ Detect file format (CSV, JSON, Parquet, Arrow) and read transactions.
229
+
230
+ Supports traditional formats (CSV, JSON) and modern DataFrame formats (Parquet, Arrow).
231
+ For DataFrame formats, requires 'gsppy[dataframe]' to be installed.
192
232
 
193
233
  Parameters:
194
234
  file_path (str): Path to the file containing transactions.
@@ -206,13 +246,200 @@ def detect_and_read_file(file_path: str) -> Union[List[List[str]], List[List[Tup
206
246
  _, file_extension = os.path.splitext(file_path)
207
247
  file_extension = file_extension.lower()
208
248
 
209
- if file_extension == ".json":
249
+ if file_extension == FileExtension.JSON.value:
210
250
  return read_transactions_from_json(file_path)
211
251
 
212
- if file_extension == ".csv":
252
+ if file_extension == FileExtension.CSV.value:
213
253
  return read_transactions_from_csv(file_path)
214
254
 
215
- raise ValueError("Unsupported file format. Please provide a JSON or CSV file.")
255
+ if file_extension in PARQUET_EXTENSIONS:
256
+ return read_transactions_from_parquet(file_path)
257
+
258
+ if file_extension in ARROW_EXTENSIONS:
259
+ return read_transactions_from_arrow(file_path)
260
+
261
+ raise ValueError(SUPPORTED_EXTENSIONS_MESSAGE.format(extension=file_extension))
262
+
263
+
264
def _read_polars_transactions(
    file_path: str,
    reader_name: str,
    support_label: str,
    file_label: str,
    transaction_col: Optional[str],
    item_col: Optional[str],
    timestamp_col: Optional[str],
    sequence_col: Optional[str],
) -> Union[List[List[str]], List[List[Tuple[str, float]]]]:
    """
    Shared implementation behind the Polars-backed file readers.

    Parameters:
        file_path (str): Path to the file to load.
        reader_name (str): Name of the Polars reader function to call
            (e.g. ``"read_parquet"`` or ``"read_ipc"``).
        support_label (str): Format name used in the "requires Polars" message.
        file_label (str): Format name used in the read-failure message.
        transaction_col (Optional[str]): Column name for transaction IDs (grouped format).
        item_col (Optional[str]): Column name for items (grouped format).
        timestamp_col (Optional[str]): Column name for timestamps.
        sequence_col (Optional[str]): Column name containing sequences (sequence format).

    Returns:
        Union[List[List[str]], List[List[Tuple[str, float]]]]:
            Whatever ``polars_to_transactions`` produces for the loaded frame.

    Raises:
        ValueError: If Polars / the DataFrame adapters cannot be imported, or if
            reading or converting the file fails.
    """
    # Optional dependencies are imported lazily so the CLI works without them.
    try:
        import polars as pl

        from gsppy.dataframe_adapters import polars_to_transactions
    except ImportError as e:
        raise ValueError(
            f"{support_label} support requires Polars. Install with: pip install 'gsppy[dataframe]'"
        ) from e

    try:
        # The reader lookup stays inside the try so any failure is reported uniformly.
        df: Any = getattr(pl, reader_name)(file_path)
        return polars_to_transactions(
            df,
            transaction_col=transaction_col,
            item_col=item_col,
            timestamp_col=timestamp_col,
            sequence_col=sequence_col,
        )
    except Exception as e:
        msg = f"Error reading transaction data from {file_label} file '{file_path}': {e}"
        logging.error(msg)
        raise ValueError(msg) from e


def read_transactions_from_parquet(
    file_path: str,
    transaction_col: Optional[str] = None,
    item_col: Optional[str] = None,
    timestamp_col: Optional[str] = None,
    sequence_col: Optional[str] = None,
) -> Union[List[List[str]], List[List[Tuple[str, float]]]]:
    """
    Read transactions from a Parquet file using Polars.

    Parameters:
        file_path (str): Path to the Parquet file.
        transaction_col (Optional[str]): Column name for transaction IDs (grouped format).
        item_col (Optional[str]): Column name for items (grouped format).
        timestamp_col (Optional[str]): Column name for timestamps.
        sequence_col (Optional[str]): Column name containing sequences (sequence format).

    Returns:
        Union[List[List[str]], List[List[Tuple[str, float]]]]:
            Parsed transactions from the file.

    Raises:
        ValueError: If the file cannot be read or Polars is not installed.
    """
    return _read_polars_transactions(
        file_path,
        "read_parquet",
        "Parquet",
        "Parquet",
        transaction_col,
        item_col,
        timestamp_col,
        sequence_col,
    )


def read_transactions_from_arrow(
    file_path: str,
    transaction_col: Optional[str] = None,
    item_col: Optional[str] = None,
    timestamp_col: Optional[str] = None,
    sequence_col: Optional[str] = None,
) -> Union[List[List[str]], List[List[Tuple[str, float]]]]:
    """
    Read transactions from an Arrow/Feather file using Polars.

    Parameters:
        file_path (str): Path to the Arrow/Feather file.
        transaction_col (Optional[str]): Column name for transaction IDs (grouped format).
        item_col (Optional[str]): Column name for items (grouped format).
        timestamp_col (Optional[str]): Column name for timestamps.
        sequence_col (Optional[str]): Column name containing sequences (sequence format).

    Returns:
        Union[List[List[str]], List[List[Tuple[str, float]]]]:
            Parsed transactions from the file.

    Raises:
        ValueError: If the file cannot be read or Polars is not installed.
    """
    return _read_polars_transactions(
        file_path,
        "read_ipc",
        "Arrow/Feather",
        "Arrow",
        transaction_col,
        item_col,
        timestamp_col,
        sequence_col,
    )
354
+
355
+
356
def _load_dataframe_format(
    file_path: str,
    file_extension: str,
    transaction_col: Optional[str],
    item_col: Optional[str],
    timestamp_col: Optional[str],
    sequence_col: Optional[str],
) -> Union[List[List[str]], List[List[Tuple[str, float]]]]:
    """
    Pick the Parquet or Arrow reader from the file extension and run it.

    Parameters:
        file_path: Path to the file
        file_extension: File extension (lowercase)
        transaction_col: Transaction ID column name
        item_col: Item column name
        timestamp_col: Timestamp column name
        sequence_col: Sequence column name

    Returns:
        Loaded transactions
    """
    # Any extension outside PARQUET_EXTENSIONS is handed to the Arrow/Feather reader.
    if file_extension in PARQUET_EXTENSIONS:
        reader = read_transactions_from_parquet
    else:
        reader = read_transactions_from_arrow

    return reader(
        file_path,
        transaction_col=transaction_col,
        item_col=item_col,
        timestamp_col=timestamp_col,
        sequence_col=sequence_col,
    )
394
+
395
+
396
def _load_transactions_by_format(
    file_path: str,
    file_format: str,
    file_extension: str,
    is_dataframe_format: bool,
    transaction_col: Optional[str],
    item_col: Optional[str],
    timestamp_col: Optional[str],
    sequence_col: Optional[str],
) -> Union[List[List[str]], List[List[Tuple[str, float]]]]:
    """
    Load transactions based on specified format.

    An explicitly requested format always selects the matching reader,
    regardless of the file extension. The extension is only consulted when
    the format is ``auto``.

    Parameters:
        file_path: Path to the file
        file_format: Format string (lowercase)
        file_extension: File extension (lowercase); used only for ``auto``
        is_dataframe_format: Whether file is a DataFrame format; used only for ``auto``
        transaction_col: Transaction ID column name
        item_col: Item column name
        timestamp_col: Timestamp column name
        sequence_col: Sequence column name

    Returns:
        Loaded transactions

    Raises:
        ValueError: If format is unknown
    """
    if file_format == FileFormat.SPM.value:
        return read_transactions_from_spm(file_path)
    if file_format == FileFormat.JSON.value:
        return read_transactions_from_json(file_path)
    if file_format == FileFormat.CSV.value:
        return read_transactions_from_csv(file_path)

    # Dispatch explicit DataFrame formats directly. Routing them through the
    # extension-based helper would read e.g. `--format parquet data.bin` with
    # the Arrow reader, because the extension is not a Parquet one.
    if file_format == FileFormat.PARQUET.value:
        return read_transactions_from_parquet(
            file_path,
            transaction_col=transaction_col,
            item_col=item_col,
            timestamp_col=timestamp_col,
            sequence_col=sequence_col,
        )
    if file_format == FileFormat.ARROW.value:
        return read_transactions_from_arrow(
            file_path,
            transaction_col=transaction_col,
            item_col=item_col,
            timestamp_col=timestamp_col,
            sequence_col=sequence_col,
        )

    if file_format == FileFormat.AUTO.value:
        # Auto-detect: DataFrame extensions need the column options forwarded;
        # everything else goes through the generic extension-based detector.
        if is_dataframe_format:
            return _load_dataframe_format(
                file_path, file_extension, transaction_col, item_col, timestamp_col, sequence_col
            )
        return detect_and_read_file(file_path)

    raise ValueError(f"Unknown format: {file_format}")
216
443
 
217
444
 
218
445
  # Click-based CLI
@@ -222,7 +449,7 @@ def detect_and_read_file(file_path: str) -> Union[List[List[str]], List[List[Tup
222
449
  "file_path",
223
450
  required=True,
224
451
  type=click.Path(exists=True),
225
- help="Path to a JSON or CSV file containing transactions.",
452
+ help="Path to a transaction file (JSON, CSV, SPM, Parquet, or Arrow format).",
226
453
  )
227
454
  @click.option(
228
455
  "--min_support",
@@ -256,6 +483,37 @@ def detect_and_read_file(file_path: str) -> Union[List[List[str]], List[List[Tup
256
483
  default=None,
257
484
  help="Maximum time span from first to last item in patterns (requires timestamped transactions).",
258
485
  )
486
+ @click.option(
487
+ "--transaction-col",
488
+ type=str,
489
+ default=None,
490
+ help="DataFrame: column name for transaction IDs (grouped format).",
491
+ )
492
+ @click.option(
493
+ "--item-col",
494
+ type=str,
495
+ default=None,
496
+ help="DataFrame: column name for items (grouped format).",
497
+ )
498
+ @click.option(
499
+ "--timestamp-col",
500
+ type=str,
501
+ default=None,
502
+ help="DataFrame: column name for timestamps.",
503
+ )
504
+ @click.option(
505
+ "--sequence-col",
506
+ type=str,
507
+ default=None,
508
+ help="DataFrame: column name containing sequences (sequence format).",
509
+ )
510
+ @click.option(
511
+ "--format",
512
+ type=click.Choice([fmt.value for fmt in FileFormat], case_sensitive=False),
513
+ default=FileFormat.AUTO.value,
514
+ show_default=True,
515
+ help="File format to use. 'auto' detects format from file extension.",
516
+ )
259
517
  @click.option("--verbose", is_flag=True, help="Enable verbose output for debugging purposes.")
260
518
  def main(
261
519
  file_path: str,
@@ -264,11 +522,21 @@ def main(
264
522
  mingap: Optional[float],
265
523
  maxgap: Optional[float],
266
524
  maxspan: Optional[float],
525
+ transaction_col: Optional[str],
526
+ item_col: Optional[str],
527
+ timestamp_col: Optional[str],
528
+ sequence_col: Optional[str],
529
+ format: str, # noqa: A002
267
530
  verbose: bool,
268
531
  ) -> None:
269
532
  """
270
533
  Run the GSP algorithm on transactional data from a file.
271
534
 
535
+ Supports multiple file formats:
536
+ - JSON/CSV/SPM: Traditional transaction formats
537
+ - Parquet/Arrow: Modern DataFrame formats (requires 'gsppy[dataframe]')
538
+ - Polars/Pandas DataFrames: Can be passed directly (requires 'gsppy[dataframe]')
539
+
272
540
  Supports both simple transactions (items only) and timestamped transactions
273
541
  (item-timestamp pairs) for temporal pattern mining.
274
542
 
@@ -285,12 +553,47 @@ def main(
285
553
  gsppy --file temporal_data.json --min_support 0.3 --maxgap 10
286
554
  gsppy --file events.json --min_support 0.5 --mingap 2 --maxgap 10 --maxspan 20
287
555
  ```
556
+
557
+ With Parquet files (grouped format):
558
+
559
+ ```bash
560
+ gsppy --file data.parquet --min_support 0.3 \
561
+ --transaction-col txn_id --item-col product
562
+ ```
563
+
564
+ With Arrow files (sequence format):
565
+
566
+ ```bash
567
+ gsppy --file sequences.arrow --min_support 0.3 \
568
+ --sequence-col items
569
+ ```
570
+
571
+ With SPM format files:
572
+
573
+ ```bash
574
+ gsppy --file data.txt --format spm --min_support 0.3
575
+ ```
288
576
  """
289
577
  setup_logging(verbose)
290
578
 
579
+ # Detect file extension to determine if DataFrame column params are needed
580
+ _, file_extension = os.path.splitext(file_path)
581
+ file_extension = file_extension.lower()
582
+ is_dataframe_format = file_extension in DATAFRAME_EXTENSIONS
583
+
291
584
  # Automatically detect and load transactions
292
585
  try:
293
- transactions = detect_and_read_file(file_path)
586
+ file_format = format.lower()
587
+ transactions = _load_transactions_by_format(
588
+ file_path,
589
+ file_format,
590
+ file_extension,
591
+ is_dataframe_format,
592
+ transaction_col,
593
+ item_col,
594
+ timestamp_col,
595
+ sequence_col,
596
+ )
294
597
  except ValueError as e:
295
598
  logger.error(f"Error: {e}")
296
599
  sys.exit(1)