gsppy 3.5.0__py3-none-any.whl → 4.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gsppy/__init__.py +49 -0
- gsppy/cli.py +314 -11
- gsppy/dataframe_adapters.py +458 -0
- gsppy/enums.py +49 -0
- gsppy/gsp.py +205 -11
- gsppy/pruning.py +412 -0
- gsppy/token_mapper.py +99 -0
- gsppy/utils.py +120 -0
- {gsppy-3.5.0.dist-info → gsppy-4.0.0.dist-info}/METADATA +465 -13
- gsppy-4.0.0.dist-info/RECORD +15 -0
- gsppy-3.5.0.dist-info/RECORD +0 -11
- {gsppy-3.5.0.dist-info → gsppy-4.0.0.dist-info}/WHEEL +0 -0
- {gsppy-3.5.0.dist-info → gsppy-4.0.0.dist-info}/entry_points.txt +0 -0
- {gsppy-3.5.0.dist-info → gsppy-4.0.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: gsppy
|
|
3
|
-
Version: 3.5.0
|
|
3
|
+
Version: 4.0.0
|
|
4
4
|
Summary: GSP (Generalized Sequence Pattern) algorithm in Python
|
|
5
5
|
Project-URL: Homepage, https://github.com/jacksonpradolima/gsp-py
|
|
6
6
|
Author-email: Jackson Antonio do Prado Lima <jacksonpradolima@gmail.com>
|
|
@@ -32,14 +32,20 @@ Classifier: Intended Audience :: Science/Research
|
|
|
32
32
|
Classifier: License :: OSI Approved :: MIT License
|
|
33
33
|
Classifier: Natural Language :: English
|
|
34
34
|
Classifier: Operating System :: OS Independent
|
|
35
|
-
Classifier: Programming Language :: Python :: 3.10
|
|
36
35
|
Classifier: Programming Language :: Python :: 3.11
|
|
37
36
|
Classifier: Programming Language :: Python :: 3.12
|
|
38
37
|
Classifier: Programming Language :: Python :: 3.13
|
|
38
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
39
39
|
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
40
40
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
41
|
-
Requires-Python: >=3.10
|
|
41
|
+
Requires-Python: >=3.11
|
|
42
42
|
Requires-Dist: click>=8.0.0
|
|
43
|
+
Requires-Dist: typing-extensions>=4.0.0
|
|
44
|
+
Provides-Extra: dataframe
|
|
45
|
+
Requires-Dist: pandas-stubs>=2.3.3.260113; extra == 'dataframe'
|
|
46
|
+
Requires-Dist: pandas>=3.0.0; extra == 'dataframe'
|
|
47
|
+
Requires-Dist: polars>=1.37.1; extra == 'dataframe'
|
|
48
|
+
Requires-Dist: pyarrow>=10.0.0; extra == 'dataframe'
|
|
43
49
|
Provides-Extra: dev
|
|
44
50
|
Requires-Dist: cython==3.2.4; extra == 'dev'
|
|
45
51
|
Requires-Dist: hatch==1.16.3; extra == 'dev'
|
|
@@ -50,9 +56,9 @@ Requires-Dist: pyright==1.1.408; extra == 'dev'
|
|
|
50
56
|
Requires-Dist: pytest-benchmark==5.2.3; extra == 'dev'
|
|
51
57
|
Requires-Dist: pytest-cov==7.0.0; extra == 'dev'
|
|
52
58
|
Requires-Dist: pytest==9.0.2; extra == 'dev'
|
|
53
|
-
Requires-Dist: ruff==0.14.
|
|
59
|
+
Requires-Dist: ruff==0.14.14; extra == 'dev'
|
|
54
60
|
Requires-Dist: tox==4.34.1; extra == 'dev'
|
|
55
|
-
Requires-Dist: ty==0.0.
|
|
61
|
+
Requires-Dist: ty==0.0.14; extra == 'dev'
|
|
56
62
|
Provides-Extra: docs
|
|
57
63
|
Requires-Dist: mkdocs-gen-files<1,>=0.5; extra == 'docs'
|
|
58
64
|
Requires-Dist: mkdocs-literate-nav<1,>=0.6; extra == 'docs'
|
|
@@ -71,7 +77,7 @@ Description-Content-Type: text/markdown
|
|
|
71
77
|
|
|
72
78
|
[](https://pypi.org/project/gsppy/)
|
|
73
79
|
[](https://pypi.org/project/gsppy)
|
|
74
|
-

|
|
75
81
|
|
|
76
82
|
[OpenSSF Scorecard](https://securityscorecards.dev/viewer/?uri=github.com/jacksonpradolima/gsp-py)
|
|
77
83
|
[SLSA provenance workflow](https://github.com/jacksonpradolima/gsp-py/actions/workflows/slsa-provenance.yml)
|
|
@@ -89,7 +95,7 @@ Description-Content-Type: text/markdown
|
|
|
89
95
|
Sequence Pattern (GSP)** algorithm. Ideal for market basket analysis, temporal mining, and user journey discovery.
|
|
90
96
|
|
|
91
97
|
> [!IMPORTANT]
|
|
92
|
-
> GSP-Py is compatible with Python 3.
|
|
98
|
+
> GSP-Py is compatible with Python 3.11 and later versions!
|
|
93
99
|
|
|
94
100
|
---
|
|
95
101
|
|
|
@@ -105,6 +111,7 @@ Sequence Pattern (GSP)** algorithm. Ideal for market basket analysis, temporal m
|
|
|
105
111
|
6. [💡 Usage](#usage)
|
|
106
112
|
- [✅ Example: Analyzing Sales Data](#example-analyzing-sales-data)
|
|
107
113
|
- [📊 Explanation: Support and Results](#explanation-support-and-results)
|
|
114
|
+
- [📊 DataFrame Input Support](#dataframe-input-support)
|
|
108
115
|
- [⏱️ Temporal Constraints](#temporal-constraints)
|
|
109
116
|
7. [⌨️ Typing](#typing)
|
|
110
117
|
8. [🌟 Planned Features](#planned-features)
|
|
@@ -356,6 +363,34 @@ Your input file should be either:
|
|
|
356
363
|
Bread,Milk,Diaper,Coke
|
|
357
364
|
```
|
|
358
365
|
|
|
366
|
+
- **SPM/GSP Format**: Uses delimiters to separate elements and sequences. This format is commonly used in sequential pattern mining datasets.
|
|
367
|
+
- `-1`: Marks the end of an element (itemset)
|
|
368
|
+
- `-2`: Marks the end of a sequence (transaction)
|
|
369
|
+
|
|
370
|
+
Example:
|
|
371
|
+
```text
|
|
372
|
+
1 2 -1 3 -1 -2
|
|
373
|
+
4 -1 5 6 -1 -2
|
|
374
|
+
1 -1 2 3 -1 -2
|
|
375
|
+
```
|
|
376
|
+
|
|
377
|
+
The above represents:
|
|
378
|
+
- Transaction 1: `[[1, 2], [3]]` → flattened to `[1, 2, 3]`
|
|
379
|
+
- Transaction 2: `[[4], [5, 6]]` → flattened to `[4, 5, 6]`
|
|
380
|
+
- Transaction 3: `[[1], [2, 3]]` → flattened to `[1, 2, 3]`
|
|
381
|
+
|
|
382
|
+
String tokens are also supported:
|
|
383
|
+
```text
|
|
384
|
+
A B -1 C -1 -2
|
|
385
|
+
D -1 E F -1 -2
|
|
386
|
+
```
|
|
387
|
+
|
|
388
|
+
- **Parquet/Arrow Files**: Modern columnar data formats (requires the `gsppy[dataframe]` extra)
|
|
389
|
+
```bash
|
|
390
|
+
pip install 'gsppy[dataframe]'
|
|
391
|
+
```
|
|
392
|
+
This installs optional dependencies: `polars`, `pandas`, and `pyarrow` for DataFrame support.
|
|
393
|
+
|
|
359
394
|
### Running the CLI
|
|
360
395
|
|
|
361
396
|
Use the following command to run GSPPy on your data:
|
|
@@ -370,9 +405,16 @@ Or for CSV files:
|
|
|
370
405
|
gsppy --file path/to/transactions.csv --min_support 0.3 --backend rust
|
|
371
406
|
```
|
|
372
407
|
|
|
408
|
+
For SPM/GSP format files, use the `--format spm` option:
|
|
409
|
+
|
|
410
|
+
```bash
|
|
411
|
+
gsppy --file path/to/data.txt --format spm --min_support 0.3
|
|
412
|
+
```
|
|
413
|
+
|
|
373
414
|
#### CLI Options
|
|
374
415
|
|
|
375
|
-
- `--file`: Path to your input file (JSON or CSV). **Required**.
|
|
416
|
+
- `--file`: Path to your input file (JSON, CSV, SPM, Parquet, or Arrow format). **Required**.
|
|
417
|
+
- `--format`: File format to use. Options: `auto` (default, auto-detect from extension), `json`, `csv`, `spm`, `parquet`, `arrow`.
|
|
376
418
|
- `--min_support`: Minimum support threshold as a fraction (e.g., `0.3` for 30%). Default is `0.2`.
|
|
377
419
|
- `--backend`: Backend to use for support counting. One of `auto` (default), `python`, `rust`, or `gpu`.
|
|
378
420
|
- `--verbose`: Enable detailed logging with timestamps, log levels, and process IDs for debugging and traceability.
|
|
@@ -517,6 +559,83 @@ Verbose mode provides:
|
|
|
517
559
|
|
|
518
560
|
For complete documentation on logging, see [docs/logging.md](docs/logging.md).
|
|
519
561
|
|
|
562
|
+
### Loading SPM/GSP Format Files
|
|
563
|
+
|
|
564
|
+
GSP-Py supports loading datasets in the classical SPM/GSP delimiter format, which is widely used in sequential pattern mining research. This format uses:
|
|
565
|
+
- `-1` to mark the end of an element (itemset)
|
|
566
|
+
- `-2` to mark the end of a sequence (transaction)
|
|
567
|
+
|
|
568
|
+
#### Using the SPM Loader
|
|
569
|
+
|
|
570
|
+
```python
|
|
571
|
+
from gsppy.utils import read_transactions_from_spm
|
|
572
|
+
from gsppy import GSP
|
|
573
|
+
|
|
574
|
+
# Load SPM format file
|
|
575
|
+
transactions = read_transactions_from_spm('data.txt')
|
|
576
|
+
|
|
577
|
+
# Run GSP algorithm
|
|
578
|
+
gsp = GSP(transactions)
|
|
579
|
+
result = gsp.search(min_support=0.3)
|
|
580
|
+
```
|
|
581
|
+
|
|
582
|
+
#### SPM Format Examples
|
|
583
|
+
|
|
584
|
+
**Simple sequence file (`data.txt`):**
|
|
585
|
+
```text
|
|
586
|
+
1 2 -1 3 -1 -2
|
|
587
|
+
4 -1 5 6 -1 -2
|
|
588
|
+
1 -1 2 3 -1 -2
|
|
589
|
+
```
|
|
590
|
+
|
|
591
|
+
This represents:
|
|
592
|
+
- Transaction 1: Items [1, 2] followed by item [3] → flattened to [1, 2, 3]
|
|
593
|
+
- Transaction 2: Item [4] followed by items [5, 6] → flattened to [4, 5, 6]
|
|
594
|
+
- Transaction 3: Item [1] followed by items [2, 3] → flattened to [1, 2, 3]
|
|
595
|
+
|
|
596
|
+
**String tokens are also supported:**
|
|
597
|
+
```text
|
|
598
|
+
A B -1 C -1 -2
|
|
599
|
+
D -1 E F -1 -2
|
|
600
|
+
```
|
|
601
|
+
|
|
602
|
+
#### Token Mapping
|
|
603
|
+
|
|
604
|
+
For workflows requiring conversion between string tokens and integer IDs, use the `TokenMapper`:
|
|
605
|
+
|
|
606
|
+
```python
|
|
607
|
+
from gsppy.utils import read_transactions_from_spm
|
|
608
|
+
from gsppy import TokenMapper
|
|
609
|
+
|
|
610
|
+
# Load with mappings
|
|
611
|
+
transactions, str_to_int, int_to_str = read_transactions_from_spm(
|
|
612
|
+
'data.txt',
|
|
613
|
+
return_mappings=True
|
|
614
|
+
)
|
|
615
|
+
|
|
616
|
+
print("String to Int:", str_to_int)
|
|
617
|
+
# Output: {'1': 0, '2': 1, '3': 2, '4': 3, '5': 4, '6': 5}
|
|
618
|
+
|
|
619
|
+
print("Int to String:", int_to_str)
|
|
620
|
+
# Output: {0: '1', 1: '2', 2: '3', 3: '4', 4: '5', 5: '6'}
|
|
621
|
+
|
|
622
|
+
# Use the TokenMapper class directly
|
|
623
|
+
mapper = TokenMapper()
|
|
624
|
+
id_a = mapper.add_token("A")
|
|
625
|
+
id_b = mapper.add_token("B")
|
|
626
|
+
print(f"A -> {id_a}, B -> {id_b}")
|
|
627
|
+
# Output: A -> 0, B -> 1
|
|
628
|
+
```
|
|
629
|
+
|
|
630
|
+
#### Edge Cases Handled
|
|
631
|
+
|
|
632
|
+
The SPM loader gracefully handles:
|
|
633
|
+
- Empty lines (skipped)
|
|
634
|
+
- Missing `-2` delimiter at end of line
|
|
635
|
+
- Extra or consecutive delimiters
|
|
636
|
+
- Mixed-length elements in sequences
|
|
637
|
+
- Both integer and string tokens
|
|
638
|
+
|
|
520
639
|
### Output
|
|
521
640
|
|
|
522
641
|
The algorithm will return a list of patterns with their corresponding support.
|
|
@@ -583,6 +702,208 @@ result = gsp.search(min_support=0.5) # Need at least 2/4 sequences
|
|
|
583
702
|
|
|
584
703
|
---
|
|
585
704
|
|
|
705
|
+
## 📊 DataFrame Input Support
|
|
706
|
+
|
|
707
|
+
GSP-Py supports **Polars and Pandas DataFrames** as input, enabling high-performance workflows with modern data formats like Arrow and Parquet. This feature is particularly useful for large-scale data engineering pipelines and integration with existing data processing workflows.
|
|
708
|
+
|
|
709
|
+
### Installation
|
|
710
|
+
|
|
711
|
+
Install GSP-Py with DataFrame support:
|
|
712
|
+
|
|
713
|
+
```bash
|
|
714
|
+
pip install 'gsppy[dataframe]'
|
|
715
|
+
```
|
|
716
|
+
|
|
717
|
+
This installs the optional dependencies: `polars`, `pandas`, and `pyarrow`.
|
|
718
|
+
|
|
719
|
+
### DataFrame Input Formats
|
|
720
|
+
|
|
721
|
+
GSP-Py supports two DataFrame formats:
|
|
722
|
+
|
|
723
|
+
#### 1. Grouped Format (Transaction ID + Item Columns)
|
|
724
|
+
|
|
725
|
+
Use when your data has separate rows for each item in a transaction:
|
|
726
|
+
|
|
727
|
+
```python
|
|
728
|
+
import polars as pl
|
|
729
|
+
from gsppy import GSP
|
|
730
|
+
|
|
731
|
+
# Polars DataFrame with transaction_id and item columns
|
|
732
|
+
df = pl.DataFrame({
|
|
733
|
+
"transaction_id": [1, 1, 2, 2, 2, 3, 3],
|
|
734
|
+
"item": ["Bread", "Milk", "Bread", "Diaper", "Beer", "Milk", "Coke"],
|
|
735
|
+
})
|
|
736
|
+
|
|
737
|
+
# Run GSP directly on the DataFrame
|
|
738
|
+
gsp = GSP(df, transaction_col="transaction_id", item_col="item")
|
|
739
|
+
patterns = gsp.search(min_support=0.3)
|
|
740
|
+
|
|
741
|
+
for level, freq_patterns in enumerate(patterns, start=1):
|
|
742
|
+
print(f"\n{level}-Sequence Patterns:")
|
|
743
|
+
for pattern, support in freq_patterns.items():
|
|
744
|
+
print(f" {pattern}: {support}")
|
|
745
|
+
```
|
|
746
|
+
|
|
747
|
+
#### 2. Sequence Format (List Column)
|
|
748
|
+
|
|
749
|
+
Use when each row contains a complete transaction as a list:
|
|
750
|
+
|
|
751
|
+
```python
|
|
752
|
+
import pandas as pd
|
|
753
|
+
from gsppy import GSP
|
|
754
|
+
|
|
755
|
+
# Pandas DataFrame with sequences as lists
|
|
756
|
+
df = pd.DataFrame({
|
|
757
|
+
"transaction": [
|
|
758
|
+
["Bread", "Milk"],
|
|
759
|
+
["Bread", "Diaper", "Beer"],
|
|
760
|
+
["Milk", "Coke"],
|
|
761
|
+
]
|
|
762
|
+
})
|
|
763
|
+
|
|
764
|
+
gsp = GSP(df, sequence_col="transaction")
|
|
765
|
+
patterns = gsp.search(min_support=0.3)
|
|
766
|
+
```
|
|
767
|
+
|
|
768
|
+
### DataFrame with Timestamps
|
|
769
|
+
|
|
770
|
+
DataFrames support temporal constraints for time-aware pattern mining:
|
|
771
|
+
|
|
772
|
+
```python
|
|
773
|
+
import polars as pl
|
|
774
|
+
from gsppy import GSP
|
|
775
|
+
|
|
776
|
+
# Grouped format with timestamps
|
|
777
|
+
df = pl.DataFrame({
|
|
778
|
+
"transaction_id": [1, 1, 1, 2, 2, 2],
|
|
779
|
+
"item": ["Login", "Browse", "Purchase", "Login", "Browse", "Purchase"],
|
|
780
|
+
"timestamp": [0, 2, 5, 0, 1, 15], # Time in seconds
|
|
781
|
+
})
|
|
782
|
+
|
|
783
|
+
# Find patterns where consecutive events occur within 10 seconds
|
|
784
|
+
gsp = GSP(
|
|
785
|
+
df,
|
|
786
|
+
transaction_col="transaction_id",
|
|
787
|
+
item_col="item",
|
|
788
|
+
timestamp_col="timestamp",
|
|
789
|
+
maxgap=10
|
|
790
|
+
)
|
|
791
|
+
patterns = gsp.search(min_support=0.5)
|
|
792
|
+
```
|
|
793
|
+
|
|
794
|
+
For sequence format with timestamps:
|
|
795
|
+
|
|
796
|
+
```python
|
|
797
|
+
import pandas as pd
|
|
798
|
+
from gsppy import GSP
|
|
799
|
+
|
|
800
|
+
df = pd.DataFrame({
|
|
801
|
+
"sequence": [["A", "B", "C"], ["A", "D"]],
|
|
802
|
+
"timestamps": [[1, 2, 3], [1, 5]], # Timestamps per item
|
|
803
|
+
})
|
|
804
|
+
|
|
805
|
+
gsp = GSP(df, sequence_col="sequence", timestamp_col="timestamps", maxgap=3)
|
|
806
|
+
patterns = gsp.search(min_support=0.5)
|
|
807
|
+
```
|
|
808
|
+
|
|
809
|
+
### Working with Parquet and Arrow Files
|
|
810
|
+
|
|
811
|
+
DataFrames enable seamless integration with columnar storage formats:
|
|
812
|
+
|
|
813
|
+
```python
|
|
814
|
+
import polars as pl
|
|
815
|
+
from gsppy import GSP
|
|
816
|
+
|
|
817
|
+
# Read directly from Parquet
|
|
818
|
+
df = pl.read_parquet("transactions.parquet")
|
|
819
|
+
|
|
820
|
+
# Run GSP, naming the transaction ID and item columns explicitly
|
|
821
|
+
gsp = GSP(df, transaction_col="txn_id", item_col="product")
|
|
822
|
+
patterns = gsp.search(min_support=0.2)
|
|
823
|
+
|
|
824
|
+
# Or use Pandas with Arrow backend
|
|
825
|
+
import pandas as pd
|
|
826
|
+
df_pandas = pd.read_parquet("transactions.parquet", engine="pyarrow")
|
|
827
|
+
gsp = GSP(df_pandas, transaction_col="txn_id", item_col="product")
|
|
828
|
+
patterns = gsp.search(min_support=0.2)
|
|
829
|
+
```
|
|
830
|
+
|
|
831
|
+
### Performance Considerations
|
|
832
|
+
|
|
833
|
+
DataFrames offer performance benefits for large datasets:
|
|
834
|
+
|
|
835
|
+
- **Polars**: Leverages Arrow for zero-copy operations and parallel processing
|
|
836
|
+
- **Pandas**: Compatible with Arrow backend for efficient memory usage
|
|
837
|
+
- **Parquet/Arrow**: Columnar storage enables efficient filtering and reading
|
|
838
|
+
- **Schema validation**: Errors are caught early with clear messages
|
|
839
|
+
|
|
840
|
+
### DataFrame Schema Requirements
|
|
841
|
+
|
|
842
|
+
**Grouped Format:**
|
|
843
|
+
- `transaction_col`: Column containing transaction/sequence IDs (any type)
|
|
844
|
+
- `item_col`: Column containing items (any type, converted to strings)
|
|
845
|
+
- `timestamp_col` (optional): Column containing timestamps (numeric)
|
|
846
|
+
|
|
847
|
+
**Sequence Format:**
|
|
848
|
+
- `sequence_col`: Column containing lists of items
|
|
849
|
+
- `timestamp_col` (optional): Column containing lists of timestamps (must match sequence lengths)
|
|
850
|
+
|
|
851
|
+
### Error Handling
|
|
852
|
+
|
|
853
|
+
GSP-Py provides clear error messages for schema issues:
|
|
854
|
+
|
|
855
|
+
```python
|
|
856
|
+
import polars as pl
|
|
857
|
+
from gsppy import GSP
|
|
858
|
+
|
|
859
|
+
df = pl.DataFrame({
|
|
860
|
+
"txn_id": [1, 2],
|
|
861
|
+
"product": ["A", "B"],
|
|
862
|
+
})
|
|
863
|
+
|
|
864
|
+
# ❌ Missing required column
|
|
865
|
+
try:
|
|
866
|
+
gsp = GSP(df, transaction_col="txn_id", item_col="item") # 'item' doesn't exist
|
|
867
|
+
except ValueError as e:
|
|
868
|
+
print(f"Error: {e}") # "Column 'item' not found in DataFrame"
|
|
869
|
+
|
|
870
|
+
# ❌ Invalid format specification
|
|
871
|
+
try:
|
|
872
|
+
gsp = GSP(df) # Must specify either sequence_col or both transaction_col and item_col
|
|
873
|
+
except ValueError as e:
|
|
874
|
+
print(f"Error: {e}") # "Must specify either 'sequence_col' or both 'transaction_col' and 'item_col'"
|
|
875
|
+
```
|
|
876
|
+
|
|
877
|
+
### Backward Compatibility
|
|
878
|
+
|
|
879
|
+
Traditional list-based input continues to work:
|
|
880
|
+
|
|
881
|
+
```python
|
|
882
|
+
from gsppy import GSP
|
|
883
|
+
|
|
884
|
+
# Lists still work as before
|
|
885
|
+
transactions = [["A", "B"], ["A", "C"], ["B", "C"]]
|
|
886
|
+
gsp = GSP(transactions)
|
|
887
|
+
patterns = gsp.search(min_support=0.5)
|
|
888
|
+
```
|
|
889
|
+
|
|
890
|
+
DataFrame parameters cannot be mixed with list input:
|
|
891
|
+
|
|
892
|
+
```python
|
|
893
|
+
transactions = [["A", "B"], ["C", "D"]]
|
|
894
|
+
|
|
895
|
+
# ❌ This raises an error
|
|
896
|
+
gsp = GSP(transactions, transaction_col="txn") # ValueError: DataFrame parameters cannot be used with list input
|
|
897
|
+
```
|
|
898
|
+
|
|
899
|
+
### Examples and Tests
|
|
900
|
+
|
|
901
|
+
For complete examples and edge cases, see:
|
|
902
|
+
- [`tests/test_dataframe.py`](tests/test_dataframe.py) - Comprehensive test suite
|
|
903
|
+
- DataFrame adapter documentation in [`gsppy/dataframe_adapters.py`](gsppy/dataframe_adapters.py)
|
|
904
|
+
|
|
905
|
+
---
|
|
906
|
+
|
|
586
907
|
## ⏱️ Temporal Constraints
|
|
587
908
|
|
|
588
909
|
GSP-Py supports **time-constrained sequential pattern mining** with three powerful temporal constraints: `mingap`, `maxgap`, and `maxspan`. These constraints enable domain-specific applications such as medical event mining, retail analytics, and temporal user journey discovery.
|
|
@@ -590,7 +911,7 @@ GSP-Py supports **time-constrained sequential pattern mining** with three powerf
|
|
|
590
911
|
### Temporal Constraint Parameters
|
|
591
912
|
|
|
592
913
|
- **`mingap`**: Minimum time gap required between consecutive items in a pattern
|
|
593
|
-
- **`maxgap`**: Maximum time gap allowed between consecutive items in a pattern
|
|
914
|
+
- **`maxgap`**: Maximum time gap allowed between consecutive items in a pattern
|
|
594
915
|
- **`maxspan`**: Maximum time span from the first to the last item in a pattern
|
|
595
916
|
|
|
596
917
|
### Using Temporal Constraints
|
|
@@ -705,6 +1026,140 @@ result = gsp.search(min_support=0.5)
|
|
|
705
1026
|
|
|
706
1027
|
---
|
|
707
1028
|
|
|
1029
|
+
## 🔧 Flexible Candidate Pruning
|
|
1030
|
+
|
|
1031
|
+
GSP-Py supports **flexible candidate pruning strategies** that allow you to customize how candidate sequences are filtered during pattern mining. This enables optimization for different dataset characteristics and mining requirements.
|
|
1032
|
+
|
|
1033
|
+
### Built-in Pruning Strategies
|
|
1034
|
+
|
|
1035
|
+
#### 1. Support-Based Pruning (Default)
|
|
1036
|
+
|
|
1037
|
+
The standard GSP pruning based on minimum support threshold:
|
|
1038
|
+
|
|
1039
|
+
```python
|
|
1040
|
+
from gsppy.gsp import GSP
|
|
1041
|
+
from gsppy.pruning import SupportBasedPruning
|
|
1042
|
+
|
|
1043
|
+
# Explicit support-based pruning
|
|
1044
|
+
pruner = SupportBasedPruning(min_support_fraction=0.3)
|
|
1045
|
+
gsp = GSP(transactions, pruning_strategy=pruner)
|
|
1046
|
+
result = gsp.search(min_support=0.3)
|
|
1047
|
+
```
|
|
1048
|
+
|
|
1049
|
+
#### 2. Frequency-Based Pruning
|
|
1050
|
+
|
|
1051
|
+
Prunes candidates based on absolute frequency (minimum number of occurrences):
|
|
1052
|
+
|
|
1053
|
+
```python
|
|
1054
|
+
from gsppy.pruning import FrequencyBasedPruning
|
|
1055
|
+
|
|
1056
|
+
# Require patterns to appear at least 5 times
|
|
1057
|
+
pruner = FrequencyBasedPruning(min_frequency=5)
|
|
1058
|
+
gsp = GSP(transactions, pruning_strategy=pruner)
|
|
1059
|
+
result = gsp.search(min_support=0.2)
|
|
1060
|
+
```
|
|
1061
|
+
|
|
1062
|
+
**Use case**: When you need patterns to occur a minimum absolute number of times, regardless of dataset size.
|
|
1063
|
+
|
|
1064
|
+
#### 3. Temporal-Aware Pruning
|
|
1065
|
+
|
|
1066
|
+
Optimizes pruning for time-constrained pattern mining by pre-filtering infeasible patterns:
|
|
1067
|
+
|
|
1068
|
+
```python
|
|
1069
|
+
from gsppy.pruning import TemporalAwarePruning
|
|
1070
|
+
|
|
1071
|
+
# Prune patterns that cannot satisfy temporal constraints
|
|
1072
|
+
pruner = TemporalAwarePruning(
|
|
1073
|
+
mingap=1,
|
|
1074
|
+
maxgap=5,
|
|
1075
|
+
maxspan=10,
|
|
1076
|
+
min_support_fraction=0.3
|
|
1077
|
+
)
|
|
1078
|
+
gsp = GSP(timestamped_transactions, mingap=1, maxgap=5, maxspan=10, pruning_strategy=pruner)
|
|
1079
|
+
result = gsp.search(min_support=0.3)
|
|
1080
|
+
```
|
|
1081
|
+
|
|
1082
|
+
**Use case**: Improves performance for temporal pattern mining by eliminating patterns that cannot satisfy temporal constraints.
|
|
1083
|
+
|
|
1084
|
+
#### 4. Combined Pruning
|
|
1085
|
+
|
|
1086
|
+
Combines multiple pruning strategies for aggressive filtering:
|
|
1087
|
+
|
|
1088
|
+
```python
|
|
1089
|
+
from gsppy.pruning import CombinedPruning, SupportBasedPruning, FrequencyBasedPruning
|
|
1090
|
+
|
|
1091
|
+
# Apply both support and frequency constraints
|
|
1092
|
+
strategies = [
|
|
1093
|
+
SupportBasedPruning(min_support_fraction=0.3),
|
|
1094
|
+
FrequencyBasedPruning(min_frequency=5)
|
|
1095
|
+
]
|
|
1096
|
+
pruner = CombinedPruning(strategies)
|
|
1097
|
+
gsp = GSP(transactions, pruning_strategy=pruner)
|
|
1098
|
+
result = gsp.search(min_support=0.3)
|
|
1099
|
+
```
|
|
1100
|
+
|
|
1101
|
+
**Use case**: When you want to combine multiple filtering criteria for more selective pattern discovery.
|
|
1102
|
+
|
|
1103
|
+
### Custom Pruning Strategies
|
|
1104
|
+
|
|
1105
|
+
You can create custom pruning strategies by implementing the `PruningStrategy` interface:
|
|
1106
|
+
|
|
1107
|
+
```python
|
|
1108
|
+
from gsppy.pruning import PruningStrategy
|
|
1109
|
+
from typing import Dict, Optional, Tuple
|
|
1110
|
+
|
|
1111
|
+
class MyCustomPruner(PruningStrategy):
|
|
1112
|
+
def should_prune(
|
|
1113
|
+
self,
|
|
1114
|
+
candidate: Tuple[str, ...],
|
|
1115
|
+
support_count: int,
|
|
1116
|
+
total_transactions: int,
|
|
1117
|
+
context: Optional[Dict] = None
|
|
1118
|
+
) -> bool:
|
|
1119
|
+
# Custom pruning logic
|
|
1120
|
+
# Return True to prune (filter out), False to keep
|
|
1121
|
+
pattern_length = len(candidate)
|
|
1122
|
+
# Example: Prune very long patterns with low support
|
|
1123
|
+
if pattern_length > 5 and support_count < 10:
|
|
1124
|
+
return True
|
|
1125
|
+
return False
|
|
1126
|
+
|
|
1127
|
+
# Use your custom pruner
|
|
1128
|
+
custom_pruner = MyCustomPruner()
|
|
1129
|
+
gsp = GSP(transactions, pruning_strategy=custom_pruner)
|
|
1130
|
+
result = gsp.search(min_support=0.2)
|
|
1131
|
+
```
|
|
1132
|
+
|
|
1133
|
+
### Performance Characteristics
|
|
1134
|
+
|
|
1135
|
+
Different pruning strategies have different performance tradeoffs:
|
|
1136
|
+
|
|
1137
|
+
| Strategy | Pruning Aggressiveness | Use Case | Performance Impact |
|
|
1138
|
+
|----------|----------------------|----------|-------------------|
|
|
1139
|
+
| **SupportBased** | Moderate | General-purpose mining | Baseline performance |
|
|
1140
|
+
| **FrequencyBased** | High (for large datasets) | Require absolute frequency | Faster on large datasets |
|
|
1141
|
+
| **TemporalAware** | High (for temporal data) | Time-constrained patterns | Significant speedup for temporal mining |
|
|
1142
|
+
| **Combined** | Very High | Selective pattern discovery | Fastest, but may miss edge cases |
|
|
1143
|
+
|
|
1144
|
+
### Benchmarking Pruning Strategies
|
|
1145
|
+
|
|
1146
|
+
To compare pruning strategies on your dataset:
|
|
1147
|
+
|
|
1148
|
+
```bash
|
|
1149
|
+
# Compare all strategies
|
|
1150
|
+
python benchmarks/bench_pruning.py --n_tx 1000 --vocab 100 --min_support 0.2 --strategy all
|
|
1151
|
+
|
|
1152
|
+
# Benchmark a specific strategy
|
|
1153
|
+
python benchmarks/bench_pruning.py --n_tx 1000 --vocab 100 --min_support 0.2 --strategy frequency
|
|
1154
|
+
|
|
1155
|
+
# Run multiple rounds for averaging
|
|
1156
|
+
python benchmarks/bench_pruning.py --n_tx 1000 --vocab 100 --min_support 0.2 --strategy all --rounds 3
|
|
1157
|
+
```
|
|
1158
|
+
|
|
1159
|
+
See `benchmarks/bench_pruning.py` for the complete benchmarking script.
|
|
1160
|
+
|
|
1161
|
+
---
|
|
1162
|
+
|
|
708
1163
|
## ⌨️ Typing
|
|
709
1164
|
|
|
710
1165
|
`gsppy` ships inline type information (PEP 561) via a bundled `py.typed` marker. The public API is re-exported from
|
|
@@ -718,10 +1173,7 @@ larger applications.
|
|
|
718
1173
|
|
|
719
1174
|
We are actively working to improve GSP-Py. Here are some exciting features planned for future releases:
|
|
720
1175
|
|
|
721
|
-
1. **
|
|
722
|
-
- Enable users to define their own pruning logic during the mining process.
|
|
723
|
-
|
|
724
|
-
2. **Support for Preprocessing and Postprocessing**:
|
|
1176
|
+
1. **Support for Preprocessing and Postprocessing**:
|
|
725
1177
|
- Add hooks to allow users to transform datasets before mining and customize the output results.
|
|
726
1178
|
|
|
727
1179
|
Want to contribute or suggest an
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
gsppy/__init__.py,sha256=OtUaUNorr3nCc-d-lc4JHvxGJsGtOqcSw-8J_zP7Zng,2251
|
|
2
|
+
gsppy/accelerate.py,sha256=rDho3ysADETpuhT2SF9voBjd3XRaQUzuA5k_baNACF8,11020
|
|
3
|
+
gsppy/cli.py,sha256=SJtF7azZeBZpvcBhhRkXWyRpJMce_h96SLRqiSVUtu0,22440
|
|
4
|
+
gsppy/dataframe_adapters.py,sha256=urAu32a4YsMRnm0yGvxT_XrRHfB_EYWClHH2f4OHH8w,15773
|
|
5
|
+
gsppy/enums.py,sha256=2LxMWGJNWMgjhCWv_nzKWXi4iHU1S12qns3DpBUraAw,1265
|
|
6
|
+
gsppy/gsp.py,sha256=nCwzsVkOn4DqqCbxGCucpsHp9FZqGEphdJcc3fpHhrY,32743
|
|
7
|
+
gsppy/pruning.py,sha256=hOoQoH1k_gzACBy6qr_cvwth9WDmKuLmJyVRDbHjFFM,14779
|
|
8
|
+
gsppy/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
9
|
+
gsppy/token_mapper.py,sha256=JhPe_IZMnbM3GzdQwzleYIkE2aVw01QoYdG1TXWoCqw,2818
|
|
10
|
+
gsppy/utils.py,sha256=Ys5B9aJxJBCEXe51HK00nq3-Yf7fIGntoOzSvxSFlro,17592
|
|
11
|
+
gsppy-4.0.0.dist-info/METADATA,sha256=6p3zWP9LFtQ8WfW0PXFdvgYJ-LA0wCzVTBQZ_u1Up4k,43059
|
|
12
|
+
gsppy-4.0.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
13
|
+
gsppy-4.0.0.dist-info/entry_points.txt,sha256=smvmcIWk424ARIGKOC_BM42hpT_SptKPcIeqs-8u8lM,41
|
|
14
|
+
gsppy-4.0.0.dist-info/licenses/LICENSE,sha256=AlXanKSqFzo_o-87gp3Qw3XzbmnfxYy7O0xJOcQGWJo,1086
|
|
15
|
+
gsppy-4.0.0.dist-info/RECORD,,
|
gsppy-3.5.0.dist-info/RECORD
DELETED
|
@@ -1,11 +0,0 @@
|
|
|
1
|
-
gsppy/__init__.py,sha256=NMVa-ZWT449wuxZMF9Ym7p-DChOxOibaaqlpPxksfuo,805
|
|
2
|
-
gsppy/accelerate.py,sha256=rDho3ysADETpuhT2SF9voBjd3XRaQUzuA5k_baNACF8,11020
|
|
3
|
-
gsppy/cli.py,sha256=-viXa8VFIF-QvrHYy1vtDxtMm50sM_tZq5B5DMZ1Jtw,12516
|
|
4
|
-
gsppy/gsp.py,sha256=k72pvdmD6jU4AId2rrHQrJ4FBUgtkuC0ntEY8QHGi5c,24486
|
|
5
|
-
gsppy/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
|
-
gsppy/utils.py,sha256=dAEq1hEZMN0ZjoocKs_ZIgOI9j_Y6rJEAKneul3zNRo,13501
|
|
7
|
-
gsppy-3.5.0.dist-info/METADATA,sha256=ix2X_VEUTved_DaTsSJMERT-CZ34TUYF0XMC2KeNeuE,29747
|
|
8
|
-
gsppy-3.5.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
9
|
-
gsppy-3.5.0.dist-info/entry_points.txt,sha256=smvmcIWk424ARIGKOC_BM42hpT_SptKPcIeqs-8u8lM,41
|
|
10
|
-
gsppy-3.5.0.dist-info/licenses/LICENSE,sha256=AlXanKSqFzo_o-87gp3Qw3XzbmnfxYy7O0xJOcQGWJo,1086
|
|
11
|
-
gsppy-3.5.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|