gsppy 3.6.0__py3-none-any.whl → 4.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,458 @@
1
+ """
2
+ DataFrame adapters for GSP-Py.
3
+
4
+ This module provides utilities to convert Polars and Pandas DataFrames to the format
5
+ expected by the GSP algorithm. It enables high-performance workflows by supporting
6
+ modern data formats like Arrow and Parquet.
7
+
8
+ Key Features:
9
+ -------------
10
+ 1. **Polars DataFrame Support**:
11
+ - Convert Polars DataFrames to transaction lists
12
+ - Efficient zero-copy operations where possible
13
+ - Support for timestamped and non-timestamped data
14
+
15
+ 2. **Pandas DataFrame Support**:
16
+ - Convert Pandas DataFrames to transaction lists
17
+ - Compatible with Arrow backend
18
+ - Support for timestamped and non-timestamped data
19
+
20
+ 3. **Schema Validation**:
21
+ - Validate DataFrame structure before conversion
22
+ - Clear error messages for non-compliant schemas
23
+ - Type checking and validation
24
+
25
+ 4. **Flexible Input Formats**:
26
+ - Support for grouped transactions (transaction_id + item columns)
27
+ - Support for sequence columns (list/array of items per row)
28
+ - Support for timestamps (optional)
29
+
30
+ Example Usage:
31
+ --------------
32
+ ```python
33
+ import polars as pl
34
+ from gsppy.dataframe_adapters import polars_to_transactions
35
+
36
+ # Grouped format with transaction_id and item columns
37
+ df = pl.DataFrame(
38
+ {
39
+ "transaction_id": [1, 1, 2, 2, 2, 3],
40
+ "item": ["A", "B", "A", "C", "D", "B"],
41
+ }
42
+ )
43
+ transactions = polars_to_transactions(df, transaction_col="transaction_id", item_col="item")
44
+
45
+ # Sequence format with list column
46
+ df = pl.DataFrame({"sequence": [["A", "B"], ["A", "C", "D"], ["B"]]})
47
+ transactions = polars_to_transactions(df, sequence_col="sequence")
48
+
49
+ # With timestamps
50
+ df = pl.DataFrame(
51
+ {
52
+ "transaction_id": [1, 1, 2, 2],
53
+ "item": ["A", "B", "C", "D"],
54
+ "timestamp": [1.0, 2.0, 1.5, 3.0],
55
+ }
56
+ )
57
+ transactions = polars_to_transactions(df, transaction_col="transaction_id", item_col="item", timestamp_col="timestamp")
58
+ ```
59
+
60
+ Author:
61
+ -------
62
+ - **Developed by:** Jackson Antonio do Prado Lima
63
+ - **Email:** jacksonpradolima@gmail.com
64
+
65
+ License:
66
+ --------
67
+ This implementation is distributed under the MIT License.
68
+ """
69
+
70
+ from __future__ import annotations
71
+
72
+ from typing import Any, List, Tuple, Iterable, Optional, Collection, cast
73
+
74
+ import pandas as pd
75
+ import polars as pl
76
+
77
+
78
class DataFrameAdapterError(Exception):
    """Signal that a DataFrame could not be converted into GSP transactions.

    The adapters in this module raise it for missing columns, for cells that
    do not hold sequences, for incomplete column arguments and for inputs
    that are neither Polars nor Pandas frames.
    """
82
+
83
+
84
+ def _require_columns(columns: Collection[str], *names: str) -> None:
85
+ for name in names:
86
+ if name not in columns:
87
+ raise DataFrameAdapterError(f"Column '{name}' not found in DataFrame")
88
+
89
+
90
+ def _build_timestamped_transactions(
91
+ sequences: Iterable[Any],
92
+ timestamps: Iterable[Any],
93
+ sequence_col: str,
94
+ timestamp_col: str,
95
+ ) -> List[List[Tuple[str, float]]]:
96
+ result: List[List[Tuple[str, float]]] = []
97
+ for seq, times in zip(sequences, timestamps, strict=True):
98
+ if not isinstance(seq, list) or not isinstance(times, list):
99
+ raise DataFrameAdapterError(f"Both '{sequence_col}' and '{timestamp_col}' must contain lists")
100
+ seq_list: List[Any] = cast(List[Any], seq)
101
+ times_list: List[Any] = cast(List[Any], times)
102
+ if len(seq_list) != len(times_list):
103
+ raise DataFrameAdapterError("Sequence and timestamp lists must have the same length")
104
+ result.append([(str(item), float(ts)) for item, ts in zip(seq_list, times_list, strict=True)])
105
+ return result
106
+
107
+
108
+ def _build_simple_transactions(sequences: Iterable[Any], sequence_col: str) -> List[List[str]]:
109
+ result: List[List[str]] = []
110
+ for seq in sequences:
111
+ if not isinstance(seq, list):
112
+ raise DataFrameAdapterError(f"Column '{sequence_col}' must contain lists")
113
+ seq_list: List[Any] = cast(List[Any], seq)
114
+ result.append([str(item) for item in seq_list])
115
+ return result
116
+
117
+
118
def polars_to_transactions(
    df: pl.DataFrame | pl.LazyFrame,
    transaction_col: Optional[str] = None,
    item_col: Optional[str] = None,
    timestamp_col: Optional[str] = None,
    sequence_col: Optional[str] = None,
) -> List[List[str]] | List[List[Tuple[str, float]]]:
    """
    Convert a Polars DataFrame or LazyFrame into GSP transactions.

    Two layouts are understood:
    1. Sequence format: ``sequence_col`` names a column whose cells each hold a
       complete transaction. It takes precedence when it is provided.
    2. Grouped format: ``transaction_col`` and ``item_col`` name the columns
       holding one item per row, grouped by transaction ID.

    Parameters:
        df: Polars DataFrame or LazyFrame to convert
        transaction_col: Column name for transaction IDs (grouped format)
        item_col: Column name for items (grouped format)
        timestamp_col: Optional column name for timestamps (forwarded to either format)
        sequence_col: Column name containing sequences (sequence format)

    Returns:
        List of transactions, where each transaction is either:
        - A list of items (strings)
        - A list of (item, timestamp) tuples

    Raises:
        DataFrameAdapterError: If neither ``sequence_col`` nor both
            ``transaction_col`` and ``item_col`` are given, or if the selected
            converter rejects the frame.

    Examples:
        >>> import polars as pl
        >>> # Grouped format
        >>> df = pl.DataFrame(
        ...     {
        ...         "txn_id": [1, 1, 2, 2],
        ...         "item": ["A", "B", "C", "D"],
        ...     }
        ... )
        >>> polars_to_transactions(df, transaction_col="txn_id", item_col="item")
        [['A', 'B'], ['C', 'D']]

        >>> # Sequence format
        >>> df = pl.DataFrame({"seq": [["A", "B"], ["C", "D"]]})
        >>> polars_to_transactions(df, sequence_col="seq")
        [['A', 'B'], ['C', 'D']]
    """
    # Sequence layout wins whenever a sequence column is named.
    if sequence_col is not None:
        return _polars_sequence_format(df, sequence_col, timestamp_col)

    # The grouped layout needs both of its columns.
    if transaction_col is None or item_col is None:
        raise DataFrameAdapterError("Must specify either 'sequence_col' or both 'transaction_col' and 'item_col'")

    return _polars_grouped_format(df, transaction_col, item_col, timestamp_col)
170
+
171
+
172
def _polars_sequence_format(
    df: pl.DataFrame | pl.LazyFrame,
    sequence_col: str,
    timestamp_col: Optional[str] = None,
) -> List[List[str]] | List[List[Tuple[str, float]]]:
    """
    Convert a Polars frame whose rows each hold one complete sequence.

    Parameters:
        df: Polars DataFrame or LazyFrame (a LazyFrame is collected first)
        sequence_col: Column containing one sequence per row
        timestamp_col: Optional column containing one timestamp list per row

    Returns:
        List of transactions; timestamped when ``timestamp_col`` is given

    Raises:
        DataFrameAdapterError: If a named column is missing or the helpers
            reject a cell.
    """
    # Materialise lazy input so columns can be read eagerly below.
    frame = df.collect() if isinstance(df, pl.LazyFrame) else df

    _require_columns(frame.columns, sequence_col)
    row_sequences: List[Any] = frame[sequence_col].to_list()

    if timestamp_col is None:
        return _build_simple_transactions(row_sequences, sequence_col)

    _require_columns(frame.columns, timestamp_col)
    row_timestamps: List[Any] = frame[timestamp_col].to_list()
    return _build_timestamped_transactions(row_sequences, row_timestamps, sequence_col, timestamp_col)
206
+
207
+
208
def _polars_grouped_format(
    df: pl.DataFrame | pl.LazyFrame,
    transaction_col: str,
    item_col: str,
    timestamp_col: Optional[str] = None,
) -> List[List[str]] | List[List[Tuple[str, float]]]:
    """
    Convert a Polars frame holding one item per row, grouped by transaction ID.

    Rows are sorted by transaction ID (and then by timestamp when given) with a
    stable sort, so rows that compare equal keep their original relative order.
    Without timestamps this means items appear in each transaction in the
    order they had in the input frame.

    Parameters:
        df: Polars DataFrame or LazyFrame (a LazyFrame is collected first)
        transaction_col: Column containing transaction IDs
        item_col: Column containing items
        timestamp_col: Optional column containing timestamps

    Returns:
        List of transactions, one per distinct transaction ID in sorted order;
        items become ``str`` and timestamps become ``float``

    Raises:
        DataFrameAdapterError: If a named column is missing
    """
    # Collect pl.LazyFrame if needed
    if isinstance(df, pl.LazyFrame):
        df = df.collect()

    # Validate required columns exist
    _require_columns(df.columns, transaction_col, item_col)

    sort_cols = [transaction_col]
    if timestamp_col is not None:
        _require_columns(df.columns, timestamp_col)
        sort_cols.append(timestamp_col)

    # maintain_order=True makes the sort stable. Polars does not otherwise
    # guarantee the relative order of rows with equal sort keys, which would
    # let the items of a transaction come out in an arbitrary order.
    df_sorted = df.sort(sort_cols, maintain_order=True)

    if timestamp_col is None:
        # Create non-timestamped transactions
        grouped = df_sorted.group_by(transaction_col, maintain_order=True).agg(pl.col(item_col).alias("items"))
        return [[str(item) for item in row["items"]] for row in grouped.iter_rows(named=True)]

    # Create timestamped transactions
    grouped = df_sorted.group_by(transaction_col, maintain_order=True).agg(
        [
            pl.col(item_col).alias("items"),
            pl.col(timestamp_col).alias("timestamps"),
        ]
    )
    result: List[List[Tuple[str, float]]] = []
    for row in grouped.iter_rows(named=True):
        # Both lists are aggregated from the same rows, so they have equal length;
        # strict=True matches the pandas adapter and would surface any mismatch.
        pairs = zip(row["items"], row["timestamps"], strict=True)
        result.append([(str(item), float(ts)) for item, ts in pairs])
    return result
266
+
267
+
268
def pandas_to_transactions(
    df: pd.DataFrame,
    transaction_col: Optional[str] = None,
    item_col: Optional[str] = None,
    timestamp_col: Optional[str] = None,
    sequence_col: Optional[str] = None,
) -> List[List[str]] | List[List[Tuple[str, float]]]:
    """
    Convert a Pandas DataFrame into GSP transactions.

    Two layouts are understood:
    1. Sequence format: ``sequence_col`` names a column whose cells each hold a
       complete transaction. It takes precedence when it is provided.
    2. Grouped format: ``transaction_col`` and ``item_col`` name the columns
       holding one item per row, grouped by transaction ID.

    Parameters:
        df: Pandas DataFrame to convert
        transaction_col: Column name for transaction IDs (grouped format)
        item_col: Column name for items (grouped format)
        timestamp_col: Optional column name for timestamps (forwarded to either format)
        sequence_col: Column name containing sequences (sequence format)

    Returns:
        List of transactions, where each transaction is either:
        - A list of items (strings)
        - A list of (item, timestamp) tuples

    Raises:
        DataFrameAdapterError: If neither ``sequence_col`` nor both
            ``transaction_col`` and ``item_col`` are given, or if the selected
            converter rejects the frame.

    Examples:
        >>> import pandas as pd
        >>> # Grouped format
        >>> df = pd.DataFrame(
        ...     {
        ...         "txn_id": [1, 1, 2, 2],
        ...         "item": ["A", "B", "C", "D"],
        ...     }
        ... )
        >>> pandas_to_transactions(df, transaction_col="txn_id", item_col="item")
        [['A', 'B'], ['C', 'D']]

        >>> # Sequence format
        >>> df = pd.DataFrame({"seq": [["A", "B"], ["C", "D"]]})
        >>> pandas_to_transactions(df, sequence_col="seq")
        [['A', 'B'], ['C', 'D']]
    """
    # Sequence layout wins whenever a sequence column is named.
    if sequence_col is not None:
        return _pandas_sequence_format(df, sequence_col, timestamp_col)

    # The grouped layout needs both of its columns.
    if transaction_col is None or item_col is None:
        raise DataFrameAdapterError("Must specify either 'sequence_col' or both 'transaction_col' and 'item_col'")

    return _pandas_grouped_format(df, transaction_col, item_col, timestamp_col)
320
+
321
+
322
def _pandas_sequence_format(
    df: pd.DataFrame,
    sequence_col: str,
    timestamp_col: Optional[str] = None,
) -> List[List[str]] | List[List[Tuple[str, float]]]:
    """
    Convert a Pandas frame whose rows each hold one complete sequence.

    Parameters:
        df: Pandas DataFrame
        sequence_col: Column containing one sequence per row
        timestamp_col: Optional column containing one timestamp list per row

    Returns:
        List of transactions; timestamped when ``timestamp_col`` is given

    Raises:
        DataFrameAdapterError: If a named column is missing or the helpers
            reject a cell.
    """
    _require_columns(df.columns, sequence_col)
    row_sequences: List[Any] = df[sequence_col].tolist()

    if timestamp_col is None:
        return _build_simple_transactions(row_sequences, sequence_col)

    _require_columns(df.columns, timestamp_col)
    row_timestamps: List[Any] = df[timestamp_col].tolist()
    return _build_timestamped_transactions(row_sequences, row_timestamps, sequence_col, timestamp_col)
352
+
353
+
354
def _pandas_grouped_format(
    df: pd.DataFrame,
    transaction_col: str,
    item_col: str,
    timestamp_col: Optional[str] = None,
) -> List[List[str]] | List[List[Tuple[str, float]]]:
    """
    Convert a Pandas frame holding one item per row, grouped by transaction ID.

    Rows are sorted by transaction ID (and then by timestamp when given) with a
    stable sort, so rows that compare equal keep their original relative order.
    Without timestamps this means items appear in each transaction in the
    order they had in the input frame.

    Parameters:
        df: Pandas DataFrame
        transaction_col: Column containing transaction IDs
        item_col: Column containing items
        timestamp_col: Optional column containing timestamps

    Returns:
        List of transactions, one per group in sorted order; items become
        ``str`` and timestamps become ``float``

    Raises:
        DataFrameAdapterError: If a named column is missing
    """
    # Validate required columns exist
    _require_columns(df.columns, transaction_col, item_col)

    sort_cols = [transaction_col]
    if timestamp_col is not None:
        _require_columns(df.columns, timestamp_col)
        sort_cols.append(timestamp_col)

    # sort_values defaults to quicksort when sorting on a single column, and
    # quicksort does not keep equal keys in their original order. That would
    # scramble the item order inside a transaction, so request a stable sort.
    df_sorted = df.sort_values(by=sort_cols, kind="stable")

    # sort=False keeps the groups in the order established by the sort above.
    grouped = df_sorted.groupby(transaction_col, sort=False)

    if timestamp_col is None:
        # Create non-timestamped transactions
        return [[str(item) for item in group[item_col].tolist()] for _, group in grouped]

    # Create timestamped transactions
    result: List[List[Tuple[str, float]]] = []
    for _, group in grouped:
        items: List[Any] = group[item_col].tolist()
        stamps: List[Any] = group[timestamp_col].tolist()
        result.append([(str(item), float(ts)) for item, ts in zip(items, stamps, strict=True)])
    return result
401
+
402
+
403
def detect_dataframe_type(data: Any) -> Optional[str]:
    """
    Report which supported DataFrame library ``data`` belongs to.

    Parameters:
        data: Object to inspect

    Returns:
        'polars' for a Polars DataFrame or LazyFrame, 'pandas' for a Pandas
        DataFrame, None for anything else
    """
    # Checked in order; Polars is tested before Pandas.
    known_frames = (
        ("polars", (pl.DataFrame, pl.LazyFrame)),
        ("pandas", (pd.DataFrame,)),
    )
    for label, frame_types in known_frames:
        if isinstance(data, frame_types):
            return label
    return None
420
+
421
+
422
def dataframe_to_transactions(
    df: pl.DataFrame | pl.LazyFrame | pd.DataFrame,
    transaction_col: Optional[str] = None,
    item_col: Optional[str] = None,
    timestamp_col: Optional[str] = None,
    sequence_col: Optional[str] = None,
) -> List[List[str]] | List[List[Tuple[str, float]]]:
    """
    Convert any supported DataFrame into GSP transactions.

    The frame's library is determined with ``detect_dataframe_type`` and the
    call is forwarded, with all column arguments unchanged, to
    ``polars_to_transactions`` or ``pandas_to_transactions``.

    Parameters:
        df: DataFrame to convert (Polars DataFrame/LazyFrame or Pandas DataFrame)
        transaction_col: Column name for transaction IDs (grouped format)
        item_col: Column name for items (grouped format)
        timestamp_col: Optional column name for timestamps
        sequence_col: Column name containing sequences (sequence format)

    Returns:
        List of transactions

    Raises:
        DataFrameAdapterError: If the input is not a recognized DataFrame type
    """
    converters = {
        "polars": polars_to_transactions,
        "pandas": pandas_to_transactions,
    }
    converter = converters.get(detect_dataframe_type(df))
    if converter is None:
        raise DataFrameAdapterError(
            "Input must be a Polars or Pandas DataFrame. "
            "Install required libraries with: pip install 'gsppy[dataframe]'"
        )
    return converter(df, transaction_col, item_col, timestamp_col, sequence_col)  # type: ignore
gsppy/enums.py ADDED
@@ -0,0 +1,49 @@
1
+ """Shared enums and constants for GSP-Py.
2
+
3
+ This module centralizes reusable definitions for file formats and extension groups
4
+ used across the CLI and DataFrame helpers, keeping validation consistent and
5
+ reducing duplication.
6
+
7
+ Highlights:
8
+ - `FileExtension` enum for canonical file suffixes
9
+ - Predefined extension sets for format detection
10
+ - Shared user-facing error message template
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ from enum import Enum
16
+
17
+
18
class FileExtension(str, Enum):
    """Canonical file-name suffixes, each including its leading dot."""

    # Text formats
    JSON = ".json"
    CSV = ".csv"

    # Parquet suffixes (long and short spelling)
    PARQUET = ".parquet"
    PQ = ".pq"

    # Arrow and Feather suffixes
    ARROW = ".arrow"
    FEATHER = ".feather"
25
+
26
+
27
class FileFormat(str, Enum):
    """Names of the file formats accepted when loading transaction data."""

    # NOTE(review): AUTO presumably means "infer the format from the file"; confirm against the loader.
    AUTO = "auto"

    JSON = "json"
    CSV = "csv"
    SPM = "spm"
    PARQUET = "parquet"
    ARROW = "arrow"
35
+
36
+
37
# Suffixes treated as Parquet files.
PARQUET_EXTENSIONS = {FileExtension.PARQUET.value, FileExtension.PQ.value}

# Suffixes treated as Arrow/Feather files.
ARROW_EXTENSIONS = {FileExtension.ARROW.value, FileExtension.FEATHER.value}

# Every DataFrame-backed suffix: the union of the Parquet and Arrow groups.
DATAFRAME_EXTENSIONS = PARQUET_EXTENSIONS | ARROW_EXTENSIONS

# Error-message template; fill it in with ``.format(extension=...)``.
# The supported list is derived from FileExtension so it cannot drift from the
# enum; the previous hand-written list omitted ".pq".
SUPPORTED_EXTENSIONS_MESSAGE = (
    "Unsupported file format '{extension}'. Supported formats: "
    + ", ".join(ext.value for ext in FileExtension)
)