gsppy 4.0.0__tar.gz → 4.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. {gsppy-4.0.0 → gsppy-4.2.0}/CHANGELOG.md +70 -0
  2. {gsppy-4.0.0 → gsppy-4.2.0}/PKG-INFO +222 -1
  3. {gsppy-4.0.0 → gsppy-4.2.0}/README.md +221 -0
  4. {gsppy-4.0.0 → gsppy-4.2.0}/gsppy/__init__.py +10 -0
  5. {gsppy-4.0.0 → gsppy-4.2.0}/gsppy/cli.py +2 -2
  6. {gsppy-4.0.0 → gsppy-4.2.0}/gsppy/gsp.py +290 -87
  7. gsppy-4.2.0/gsppy/sequence.py +371 -0
  8. {gsppy-4.0.0 → gsppy-4.2.0}/gsppy/utils.py +333 -33
  9. {gsppy-4.0.0 → gsppy-4.2.0}/pyproject.toml +1 -1
  10. {gsppy-4.0.0 → gsppy-4.2.0}/tests/test_gsp.py +6 -1
  11. gsppy-4.2.0/tests/test_gsp_sequence_integration.py +345 -0
  12. gsppy-4.2.0/tests/test_itemsets.py +300 -0
  13. gsppy-4.2.0/tests/test_sequence.py +466 -0
  14. {gsppy-4.0.0 → gsppy-4.2.0}/tox.ini +1 -0
  15. {gsppy-4.0.0 → gsppy-4.2.0}/.gitignore +0 -0
  16. {gsppy-4.0.0 → gsppy-4.2.0}/CONTRIBUTING.md +0 -0
  17. {gsppy-4.0.0 → gsppy-4.2.0}/LICENSE +0 -0
  18. {gsppy-4.0.0 → gsppy-4.2.0}/SECURITY.md +0 -0
  19. {gsppy-4.0.0 → gsppy-4.2.0}/gsppy/accelerate.py +0 -0
  20. {gsppy-4.0.0 → gsppy-4.2.0}/gsppy/dataframe_adapters.py +0 -0
  21. {gsppy-4.0.0 → gsppy-4.2.0}/gsppy/enums.py +0 -0
  22. {gsppy-4.0.0 → gsppy-4.2.0}/gsppy/pruning.py +0 -0
  23. {gsppy-4.0.0 → gsppy-4.2.0}/gsppy/py.typed +0 -0
  24. {gsppy-4.0.0 → gsppy-4.2.0}/gsppy/token_mapper.py +0 -0
  25. {gsppy-4.0.0 → gsppy-4.2.0}/rust/Cargo.lock +0 -0
  26. {gsppy-4.0.0 → gsppy-4.2.0}/rust/Cargo.toml +0 -0
  27. {gsppy-4.0.0 → gsppy-4.2.0}/rust/src/lib.rs +0 -0
  28. {gsppy-4.0.0 → gsppy-4.2.0}/tests/__init__.py +0 -0
  29. {gsppy-4.0.0 → gsppy-4.2.0}/tests/test_cli.py +0 -0
  30. {gsppy-4.0.0 → gsppy-4.2.0}/tests/test_dataframe.py +0 -0
  31. {gsppy-4.0.0 → gsppy-4.2.0}/tests/test_gsp_fuzzing.py +0 -0
  32. {gsppy-4.0.0 → gsppy-4.2.0}/tests/test_pruning.py +0 -0
  33. {gsppy-4.0.0 → gsppy-4.2.0}/tests/test_spm_format.py +0 -0
  34. {gsppy-4.0.0 → gsppy-4.2.0}/tests/test_temporal_constraints.py +0 -0
  35. {gsppy-4.0.0 → gsppy-4.2.0}/tests/test_utils.py +0 -0
@@ -1,6 +1,76 @@
1
1
  # CHANGELOG
2
2
 
3
3
 
4
+ ## v4.2.0 (2026-02-01)
5
+
6
+ ### Chores
7
+
8
+ - Update uv.lock for version 4.1.0
9
+ ([`5ed3d9e`](https://github.com/jacksonpradolima/gsp-py/commit/5ed3d9e46cf158a2261462cb8974b6bbb452f32e))
10
+
11
+ ### Features
12
+
13
+ - Add itemset support for co-occurrence semantics in sequence mining
14
+ ([`90805b1`](https://github.com/jacksonpradolima/gsp-py/commit/90805b190f40ebf34a72da0bbe949cb627140337))
15
+
16
+
17
+ ## v4.1.0 (2026-02-01)
18
+
19
+ ### Bug Fixes
20
+
21
+ - Address code review feedback - add type annotations and remove unused variables
22
+ ([`bf62d14`](https://github.com/jacksonpradolima/gsp-py/commit/bf62d144d8f1be1e7716291d41af955450612c81))
23
+
24
+ Co-authored-by: jacksonpradolima <7774063+jacksonpradolima@users.noreply.github.com>
25
+
26
+ ### Chores
27
+
28
+ - Update uv.lock for version 4.0.0
29
+ ([`f1ae2af`](https://github.com/jacksonpradolima/gsp-py/commit/f1ae2af2aa71ea44b9d8625ed647da79259ec096))
30
+
31
+ ### Documentation
32
+
33
+ - Add Sequence documentation and examples to README
34
+ ([`62d0d02`](https://github.com/jacksonpradolima/gsp-py/commit/62d0d02c19c5751331df53e680cc0b9aee19677b))
35
+
36
+ Co-authored-by: jacksonpradolima <7774063+jacksonpradolima@users.noreply.github.com>
37
+
38
+ - Update docs/ with Sequence abstraction documentation
39
+ ([`2368cf3`](https://github.com/jacksonpradolima/gsp-py/commit/2368cf30239139e8e2af5457ee6acf14db30ef06))
40
+
41
+ Co-authored-by: jacksonpradolima <7774063+jacksonpradolima@users.noreply.github.com>
42
+
43
+ ### Features
44
+
45
+ - Add Sequence abstraction class with comprehensive tests
46
+ ([`6011bdb`](https://github.com/jacksonpradolima/gsp-py/commit/6011bdb7104755d109b58261b36e1dd1c36b2d61))
47
+
48
+ Co-authored-by: jacksonpradolima <7774063+jacksonpradolima@users.noreply.github.com>
49
+
50
+ - Integrate Sequence objects with GSP.search() via return_sequences parameter
51
+ ([`7476588`](https://github.com/jacksonpradolima/gsp-py/commit/7476588f2b277276748e0550366014f2a93d8ef5))
52
+
53
+ Co-authored-by: jacksonpradolima <7774063+jacksonpradolima@users.noreply.github.com>
54
+
55
+ - Introduce Sequence abstraction for typed pattern representation
56
+ ([`01ca37b`](https://github.com/jacksonpradolima/gsp-py/commit/01ca37b9bc4572eb7b1c1eaf6fdf26ca2324a3c5))
57
+
58
+ ### Refactoring
59
+
60
+ - Address code review feedback - remove redundant checks
61
+ ([`621e940`](https://github.com/jacksonpradolima/gsp-py/commit/621e9403379ae0fd07bf45b97616b9979f2d4aa6))
62
+
63
+ Co-authored-by: jacksonpradolima <7774063+jacksonpradolima@users.noreply.github.com>
64
+
65
+ - Reduce cognitive complexity in sequence_example.py and fix f-string
66
+ ([`63ac4f9`](https://github.com/jacksonpradolima/gsp-py/commit/63ac4f9ceb869a5228cdccdcf6a9d0b9f46f0350))
67
+
68
+ Co-authored-by: jacksonpradolima <7774063+jacksonpradolima@users.noreply.github.com>
69
+
70
+ - Update type annotations and improve search method in GSP class
71
+ ([`e2e9a3f`](https://github.com/jacksonpradolima/gsp-py/commit/e2e9a3f473d1e0c5d6990c8b7c5837a251761032))
72
+
73
+
4
74
  ## v4.0.0 (2026-02-01)
5
75
 
6
76
  ### Chores
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gsppy
3
- Version: 4.0.0
3
+ Version: 4.2.0
4
4
  Summary: GSP (Generalized Sequence Pattern) algorithm in Python
5
5
  Project-URL: Homepage, https://github.com/jacksonpradolima/gsp-py
6
6
  Author-email: Jackson Antonio do Prado Lima <jacksonpradolima@gmail.com>
@@ -112,6 +112,7 @@ Sequence Pattern (GSP)** algorithm. Ideal for market basket analysis, temporal m
112
112
  - [✅ Example: Analyzing Sales Data](#example-analyzing-sales-data)
113
113
  - [📊 Explanation: Support and Results](#explanation-support-and-results)
114
114
  - [📊 DataFrame Input Support](#dataframe-input-support)
115
+ - [🔗 Itemset Support](#itemset-support)
115
116
  - [⏱️ Temporal Constraints](#temporal-constraints)
116
117
  7. [⌨️ Typing](#typing)
117
118
  8. [🌟 Planned Features](#planned-features)
@@ -559,6 +560,82 @@ Verbose mode provides:
559
560
 
560
561
  For complete documentation on logging, see [docs/logging.md](docs/logging.md).
561
562
 
563
+ ### Using Sequence Objects for Rich Pattern Representation
564
+
565
+ GSP-Py 4.1+ introduces a **Sequence abstraction class** that provides a richer, more maintainable way to work with sequential patterns. The Sequence class encapsulates pattern items, support counts, and optional metadata in an immutable, hashable object.
566
+
567
+ #### Traditional Dict-based Output (Default)
568
+
569
+ ```python
570
+ from gsppy import GSP
571
+
572
+ transactions = [
573
+ ['Bread', 'Milk'],
574
+ ['Bread', 'Diaper', 'Beer', 'Eggs'],
575
+ ['Milk', 'Diaper', 'Beer', 'Coke']
576
+ ]
577
+
578
+ gsp = GSP(transactions)
579
+ result = gsp.search(min_support=0.3)
580
+
581
+ # Returns one dict per pattern length, e.g. [{('Bread',): 2, ('Milk',): 2, ...}, {('Bread', 'Milk'): 1, ...}, ...]
582
+ for level_patterns in result:
583
+ for pattern, support in level_patterns.items():
584
+ print(f"Pattern: {pattern}, Support: {support}")
585
+ ```
586
+
587
+ #### Sequence Objects (New Feature)
588
+
589
+ ```python
590
+ from gsppy import GSP
591
+
592
+ transactions = [
593
+ ['Bread', 'Milk'],
594
+ ['Bread', 'Diaper', 'Beer', 'Eggs'],
595
+ ['Milk', 'Diaper', 'Beer', 'Coke']
596
+ ]
597
+
598
+ gsp = GSP(transactions)
599
+ result = gsp.search(min_support=0.3, return_sequences=True)
600
+
601
+ # Returns one list per pattern length, e.g. [[Sequence(('Bread',), support=2), ...], [Sequence(('Bread', 'Milk'), support=1), ...], ...]
602
+ for level_patterns in result:
603
+ for seq in level_patterns:
604
+ print(f"Pattern: {seq.items}, Support: {seq.support}, Length: {seq.length}")
605
+ # Access sequence properties
606
+ print(f" First item: {seq.first_item}, Last item: {seq.last_item}")
607
+ # Check if item is in sequence
608
+ if "Milk" in seq:
609
+ print(f" Contains Milk!")
610
+ ```
611
+
612
+ #### Key Benefits of Sequence Objects
613
+
614
+ 1. **Rich API**: Access pattern properties like `length`, `first_item`, `last_item`
615
+ 2. **Type Safety**: IDE autocomplete and better type hints
616
+ 3. **Immutable & Hashable**: Can be used as dictionary keys
617
+ 4. **Extensible**: Add metadata for confidence, lift, or custom properties
618
+ 5. **Backward Compatible**: Convert to/from dict format as needed
619
+
620
+ ```python
621
+ from gsppy import Sequence, sequences_to_dict, dict_to_sequences
622
+
623
+ # Create custom sequences
624
+ seq = Sequence.from_tuple(("A", "B", "C"), support=5)
625
+
626
+ # Extend sequences
627
+ extended = seq.extend("D") # Creates Sequence(("A", "B", "C", "D"))
628
+
629
+ # Add metadata
630
+ seq_with_meta = seq.with_metadata(confidence=0.85, lift=1.5)
631
+
632
+ # Convert between formats for compatibility (`gsp` is the GSP instance created in the example above)
633
+ seq_result = gsp.search(min_support=0.3, return_sequences=True)
634
+ dict_format = sequences_to_dict(seq_result[0]) # Convert to dict
635
+ ```
636
+
637
+ For a complete example, see [examples/sequence_example.py](examples/sequence_example.py).
638
+
562
639
  ### Loading SPM/GSP Format Files
563
640
 
564
641
  GSP-Py supports loading datasets in the classical SPM/GSP delimiter format, which is widely used in sequential pattern mining research. This format uses:
@@ -904,6 +981,150 @@ For complete examples and edge cases, see:
904
981
 
905
982
  ---
906
983
 
984
+ ## 🔗 Itemset Support
985
+
986
+ GSP-Py supports **itemsets** within sequence elements, enabling you to capture **co-occurrence** of multiple items at the same time step. This is crucial for applications where items occur together rather than in strict sequential order.
987
+
988
+ ### What are Itemsets?
989
+
990
+ - **Flat sequences**: `['A', 'B', 'C']` - each item occurs at a separate time step
991
+ - **Itemset sequences**: `[['A', 'B'], ['C']]` - items A and B occur together at the first time step, then C occurs later
992
+
993
+ ### Why Use Itemsets?
994
+
995
+ Itemsets are essential when temporal co-occurrence matters in your domain:
996
+
997
+ - **Market basket analysis**: Customers buy multiple items in a single shopping trip, then return for more items later
998
+ - **Web analytics**: Users open multiple pages in parallel tabs before moving to the next set of pages
999
+ - **Event logs**: Multiple events can occur simultaneously in complex systems
1000
+ - **Purchase patterns**: Items bought together vs. items bought in sequence
1001
+
1002
+ ### Using Itemsets
1003
+
1004
+ #### Basic Example
1005
+
1006
+ ```python
1007
+ from gsppy import GSP
1008
+
1009
+ # Itemset format: nested lists where inner lists are items that occur together
1010
+ transactions = [
1011
+ [['Bread', 'Milk'], ['Eggs']], # Bought Bread & Milk together, then Eggs later
1012
+ [['Bread', 'Milk', 'Butter']], # Bought all three items together
1013
+ [['Bread', 'Milk'], ['Eggs']], # Same pattern as customer 1
1014
+ ]
1015
+
1016
+ gsp = GSP(transactions)
1017
+ patterns = gsp.search(min_support=0.5)
1018
+
1019
+ # Pattern ('Bread',) will match any itemset containing Bread
1020
+ # Pattern ('Bread', 'Eggs') will match sequences where Bread appears before Eggs
1021
+ # (even if they're in different itemsets)
1022
+ ```
1023
+
1024
+ #### Backward Compatibility with Flat Sequences
1025
+
1026
+ GSP-Py automatically normalizes flat sequences to itemsets internally, ensuring full backward compatibility:
1027
+
1028
+ ```python
1029
+ from gsppy import GSP
1030
+
1031
+ # These are equivalent after normalization:
1032
+ flat_transactions = [['A', 'B', 'C']] # Flat format
1033
+ itemset_transactions = [[['A'], ['B'], ['C']]] # Equivalent itemset format
1034
+
1035
+ # Both produce the same results
1036
+ gsp1 = GSP(flat_transactions)
1037
+ gsp2 = GSP(itemset_transactions)
1038
+
1039
+ # Patterns are identical
1040
+ patterns1 = gsp1.search(min_support=0.5)
1041
+ patterns2 = gsp2.search(min_support=0.5)
1042
+ ```
1043
+
1044
+ ### Itemset Matching Semantics
1045
+
1046
+ Pattern matching with itemsets uses **subset semantics**:
1047
+
1048
+ - A pattern element matches a sequence element if all items in the pattern element are present in the sequence element
1049
+ - Example: Pattern element `['A', 'B']` matches sequence element `['A', 'B', 'C']` because {A, B} ⊆ {A, B, C}
1050
+ - Pattern elements must appear in order across the sequence
1051
+
1052
+ ```python
1053
+ from gsppy import GSP
1054
+
1055
+ transactions = [
1056
+ [['A', 'B', 'D'], ['E'], ['C', 'F']], # A,B,D together, then E, then C,F together
1057
+ ]
1058
+
1059
+ gsp = GSP(transactions)
1060
+
1061
+ # Pattern ('A', 'C') will match because:
1062
+ # - 'A' is in first itemset ['A', 'B', 'D'] ✓
1063
+ # - 'C' appears later in third itemset ['C', 'F'] ✓
1064
+ # - Order is preserved ✓
1065
+ ```
1066
+
1067
+ ### Reading Itemsets from SPM Format
1068
+
1069
+ The SPM/GSP format supports itemsets using delimiters:
1070
+
1071
+ - `-1`: End of itemset
1072
+ - `-2`: End of sequence
1073
+
1074
+ ```python
1075
+ from gsppy.utils import read_transactions_from_spm
1076
+
1077
+ # SPM file content:
1078
+ # 1 2 -1 3 -1 -2
1079
+ # 1 -1 3 4 -1 -2
1080
+
1081
+ # Read with itemsets preserved
1082
+ transactions = read_transactions_from_spm("data.txt", preserve_itemsets=True)
1083
+ # Result: [[['1', '2'], ['3']], [['1'], ['3', '4']]]
1084
+
1085
+ # Read with itemsets flattened (backward compatible)
1086
+ transactions = read_transactions_from_spm("data.txt", preserve_itemsets=False)
1087
+ # Result: [['1', '2', '3'], ['1', '3', '4']]
1088
+ ```
1089
+
1090
+ ### Itemsets with Timestamps
1091
+
1092
+ Itemsets work seamlessly with temporal constraints:
1093
+
1094
+ ```python
1095
+ from gsppy import GSP
1096
+
1097
+ # Itemsets with timestamps: [(item, timestamp), ...]
1098
+ transactions = [
1099
+ [[('Login', 0), ('Home', 0)], [('Product', 5)], [('Checkout', 10)]],
1100
+ [[('Login', 0)], [('Home', 2), ('Product', 2)], [('Checkout', 15)]],
1101
+ ]
1102
+
1103
+ # Find patterns where events in the same itemset occur together
1104
+ # and subsequent itemsets occur within maxgap time units
1105
+ gsp = GSP(transactions, maxgap=10)
1106
+ patterns = gsp.search(min_support=0.5)
1107
+ ```
1108
+
1109
+ ### Complete Example
1110
+
1111
+ See [examples/itemset_example.py](examples/itemset_example.py) for comprehensive examples including:
1112
+
1113
+ - Market basket analysis with itemsets
1114
+ - Web clickstream with parallel page views
1115
+ - Comparison of flat vs. itemset semantics
1116
+ - Reading and processing SPM format files
1117
+
1118
+ ### Key Takeaways
1119
+
1120
+ - ✓ **Itemsets capture co-occurrence** of items at the same time step
1121
+ - ✓ **Flat sequences are automatically normalized** to itemsets internally
1122
+ - ✓ **Both formats work seamlessly** with GSP-Py
1123
+ - ✓ **Use itemsets when temporal co-occurrence matters** in your domain
1124
+ - ✓ **SPM format supports** both flat and itemset representations
1125
+
1126
+ ---
1127
+
907
1128
  ## ⏱️ Temporal Constraints
908
1129
 
909
1130
  GSP-Py supports **time-constrained sequential pattern mining** with three powerful temporal constraints: `mingap`, `maxgap`, and `maxspan`. These constraints enable domain-specific applications such as medical event mining, retail analytics, and temporal user journey discovery.
@@ -39,6 +39,7 @@ Sequence Pattern (GSP)** algorithm. Ideal for market basket analysis, temporal m
39
39
  - [✅ Example: Analyzing Sales Data](#example-analyzing-sales-data)
40
40
  - [📊 Explanation: Support and Results](#explanation-support-and-results)
41
41
  - [📊 DataFrame Input Support](#dataframe-input-support)
42
+ - [🔗 Itemset Support](#itemset-support)
42
43
  - [⏱️ Temporal Constraints](#temporal-constraints)
43
44
  7. [⌨️ Typing](#typing)
44
45
  8. [🌟 Planned Features](#planned-features)
@@ -486,6 +487,82 @@ Verbose mode provides:
486
487
 
487
488
  For complete documentation on logging, see [docs/logging.md](docs/logging.md).
488
489
 
490
+ ### Using Sequence Objects for Rich Pattern Representation
491
+
492
+ GSP-Py 4.1+ introduces a **Sequence abstraction class** that provides a richer, more maintainable way to work with sequential patterns. The Sequence class encapsulates pattern items, support counts, and optional metadata in an immutable, hashable object.
493
+
494
+ #### Traditional Dict-based Output (Default)
495
+
496
+ ```python
497
+ from gsppy import GSP
498
+
499
+ transactions = [
500
+ ['Bread', 'Milk'],
501
+ ['Bread', 'Diaper', 'Beer', 'Eggs'],
502
+ ['Milk', 'Diaper', 'Beer', 'Coke']
503
+ ]
504
+
505
+ gsp = GSP(transactions)
506
+ result = gsp.search(min_support=0.3)
507
+
508
+ # Returns one dict per pattern length, e.g. [{('Bread',): 2, ('Milk',): 2, ...}, {('Bread', 'Milk'): 1, ...}, ...]
509
+ for level_patterns in result:
510
+ for pattern, support in level_patterns.items():
511
+ print(f"Pattern: {pattern}, Support: {support}")
512
+ ```
513
+
514
+ #### Sequence Objects (New Feature)
515
+
516
+ ```python
517
+ from gsppy import GSP
518
+
519
+ transactions = [
520
+ ['Bread', 'Milk'],
521
+ ['Bread', 'Diaper', 'Beer', 'Eggs'],
522
+ ['Milk', 'Diaper', 'Beer', 'Coke']
523
+ ]
524
+
525
+ gsp = GSP(transactions)
526
+ result = gsp.search(min_support=0.3, return_sequences=True)
527
+
528
+ # Returns one list per pattern length, e.g. [[Sequence(('Bread',), support=2), ...], [Sequence(('Bread', 'Milk'), support=1), ...], ...]
529
+ for level_patterns in result:
530
+ for seq in level_patterns:
531
+ print(f"Pattern: {seq.items}, Support: {seq.support}, Length: {seq.length}")
532
+ # Access sequence properties
533
+ print(f" First item: {seq.first_item}, Last item: {seq.last_item}")
534
+ # Check if item is in sequence
535
+ if "Milk" in seq:
536
+ print(f" Contains Milk!")
537
+ ```
538
+
539
+ #### Key Benefits of Sequence Objects
540
+
541
+ 1. **Rich API**: Access pattern properties like `length`, `first_item`, `last_item`
542
+ 2. **Type Safety**: IDE autocomplete and better type hints
543
+ 3. **Immutable & Hashable**: Can be used as dictionary keys
544
+ 4. **Extensible**: Add metadata for confidence, lift, or custom properties
545
+ 5. **Backward Compatible**: Convert to/from dict format as needed
546
+
547
+ ```python
548
+ from gsppy import Sequence, sequences_to_dict, dict_to_sequences
549
+
550
+ # Create custom sequences
551
+ seq = Sequence.from_tuple(("A", "B", "C"), support=5)
552
+
553
+ # Extend sequences
554
+ extended = seq.extend("D") # Creates Sequence(("A", "B", "C", "D"))
555
+
556
+ # Add metadata
557
+ seq_with_meta = seq.with_metadata(confidence=0.85, lift=1.5)
558
+
559
+ # Convert between formats for compatibility (`gsp` is the GSP instance created in the example above)
560
+ seq_result = gsp.search(min_support=0.3, return_sequences=True)
561
+ dict_format = sequences_to_dict(seq_result[0]) # Convert to dict
562
+ ```
563
+
564
+ For a complete example, see [examples/sequence_example.py](examples/sequence_example.py).
565
+
489
566
  ### Loading SPM/GSP Format Files
490
567
 
491
568
  GSP-Py supports loading datasets in the classical SPM/GSP delimiter format, which is widely used in sequential pattern mining research. This format uses:
@@ -831,6 +908,150 @@ For complete examples and edge cases, see:
831
908
 
832
909
  ---
833
910
 
911
+ ## 🔗 Itemset Support
912
+
913
+ GSP-Py supports **itemsets** within sequence elements, enabling you to capture **co-occurrence** of multiple items at the same time step. This is crucial for applications where items occur together rather than in strict sequential order.
914
+
915
+ ### What are Itemsets?
916
+
917
+ - **Flat sequences**: `['A', 'B', 'C']` - each item occurs at a separate time step
918
+ - **Itemset sequences**: `[['A', 'B'], ['C']]` - items A and B occur together at the first time step, then C occurs later
919
+
920
+ ### Why Use Itemsets?
921
+
922
+ Itemsets are essential when temporal co-occurrence matters in your domain:
923
+
924
+ - **Market basket analysis**: Customers buy multiple items in a single shopping trip, then return for more items later
925
+ - **Web analytics**: Users open multiple pages in parallel tabs before moving to the next set of pages
926
+ - **Event logs**: Multiple events can occur simultaneously in complex systems
927
+ - **Purchase patterns**: Items bought together vs. items bought in sequence
928
+
929
+ ### Using Itemsets
930
+
931
+ #### Basic Example
932
+
933
+ ```python
934
+ from gsppy import GSP
935
+
936
+ # Itemset format: nested lists where inner lists are items that occur together
937
+ transactions = [
938
+ [['Bread', 'Milk'], ['Eggs']], # Bought Bread & Milk together, then Eggs later
939
+ [['Bread', 'Milk', 'Butter']], # Bought all three items together
940
+ [['Bread', 'Milk'], ['Eggs']], # Same pattern as customer 1
941
+ ]
942
+
943
+ gsp = GSP(transactions)
944
+ patterns = gsp.search(min_support=0.5)
945
+
946
+ # Pattern ('Bread',) will match any itemset containing Bread
947
+ # Pattern ('Bread', 'Eggs') will match sequences where Bread appears before Eggs
948
+ # (even if they're in different itemsets)
949
+ ```
950
+
951
+ #### Backward Compatibility with Flat Sequences
952
+
953
+ GSP-Py automatically normalizes flat sequences to itemsets internally, ensuring full backward compatibility:
954
+
955
+ ```python
956
+ from gsppy import GSP
957
+
958
+ # These are equivalent after normalization:
959
+ flat_transactions = [['A', 'B', 'C']] # Flat format
960
+ itemset_transactions = [[['A'], ['B'], ['C']]] # Equivalent itemset format
961
+
962
+ # Both produce the same results
963
+ gsp1 = GSP(flat_transactions)
964
+ gsp2 = GSP(itemset_transactions)
965
+
966
+ # Patterns are identical
967
+ patterns1 = gsp1.search(min_support=0.5)
968
+ patterns2 = gsp2.search(min_support=0.5)
969
+ ```
970
+
971
+ ### Itemset Matching Semantics
972
+
973
+ Pattern matching with itemsets uses **subset semantics**:
974
+
975
+ - A pattern element matches a sequence element if all items in the pattern element are present in the sequence element
976
+ - Example: Pattern element `['A', 'B']` matches sequence element `['A', 'B', 'C']` because {A, B} ⊆ {A, B, C}
977
+ - Pattern elements must appear in order across the sequence
978
+
979
+ ```python
980
+ from gsppy import GSP
981
+
982
+ transactions = [
983
+ [['A', 'B', 'D'], ['E'], ['C', 'F']], # A,B,D together, then E, then C,F together
984
+ ]
985
+
986
+ gsp = GSP(transactions)
987
+
988
+ # Pattern ('A', 'C') will match because:
989
+ # - 'A' is in first itemset ['A', 'B', 'D'] ✓
990
+ # - 'C' appears later in third itemset ['C', 'F'] ✓
991
+ # - Order is preserved ✓
992
+ ```
993
+
994
+ ### Reading Itemsets from SPM Format
995
+
996
+ The SPM/GSP format supports itemsets using delimiters:
997
+
998
+ - `-1`: End of itemset
999
+ - `-2`: End of sequence
1000
+
1001
+ ```python
1002
+ from gsppy.utils import read_transactions_from_spm
1003
+
1004
+ # SPM file content:
1005
+ # 1 2 -1 3 -1 -2
1006
+ # 1 -1 3 4 -1 -2
1007
+
1008
+ # Read with itemsets preserved
1009
+ transactions = read_transactions_from_spm("data.txt", preserve_itemsets=True)
1010
+ # Result: [[['1', '2'], ['3']], [['1'], ['3', '4']]]
1011
+
1012
+ # Read with itemsets flattened (backward compatible)
1013
+ transactions = read_transactions_from_spm("data.txt", preserve_itemsets=False)
1014
+ # Result: [['1', '2', '3'], ['1', '3', '4']]
1015
+ ```
1016
+
1017
+ ### Itemsets with Timestamps
1018
+
1019
+ Itemsets work seamlessly with temporal constraints:
1020
+
1021
+ ```python
1022
+ from gsppy import GSP
1023
+
1024
+ # Itemsets with timestamps: [(item, timestamp), ...]
1025
+ transactions = [
1026
+ [[('Login', 0), ('Home', 0)], [('Product', 5)], [('Checkout', 10)]],
1027
+ [[('Login', 0)], [('Home', 2), ('Product', 2)], [('Checkout', 15)]],
1028
+ ]
1029
+
1030
+ # Find patterns where events in the same itemset occur together
1031
+ # and subsequent itemsets occur within maxgap time units
1032
+ gsp = GSP(transactions, maxgap=10)
1033
+ patterns = gsp.search(min_support=0.5)
1034
+ ```
1035
+
1036
+ ### Complete Example
1037
+
1038
+ See [examples/itemset_example.py](examples/itemset_example.py) for comprehensive examples including:
1039
+
1040
+ - Market basket analysis with itemsets
1041
+ - Web clickstream with parallel page views
1042
+ - Comparison of flat vs. itemset semantics
1043
+ - Reading and processing SPM format files
1044
+
1045
+ ### Key Takeaways
1046
+
1047
+ - ✓ **Itemsets capture co-occurrence** of items at the same time step
1048
+ - ✓ **Flat sequences are automatically normalized** to itemsets internally
1049
+ - ✓ **Both formats work seamlessly** with GSP-Py
1050
+ - ✓ **Use itemsets when temporal co-occurrence matters** in your domain
1051
+ - ✓ **SPM format supports** both flat and itemset representations
1052
+
1053
+ ---
1054
+
834
1055
  ## ⏱️ Temporal Constraints
835
1056
 
836
1057
  GSP-Py supports **time-constrained sequential pattern mining** with three powerful temporal constraints: `mingap`, `maxgap`, and `maxspan`. These constraints enable domain-specific applications such as medical event mining, retail analytics, and temporal user journey discovery.
@@ -24,6 +24,12 @@ from gsppy.pruning import (
24
24
  FrequencyBasedPruning,
25
25
  create_default_pruning_strategy,
26
26
  )
27
+ from gsppy.sequence import (
28
+ Sequence,
29
+ sequences_to_dict,
30
+ dict_to_sequences,
31
+ to_sequence,
32
+ )
27
33
  from gsppy.token_mapper import TokenMapper
28
34
 
29
35
  # DataFrame adapters are optional - import only if dependencies are available
@@ -63,6 +69,10 @@ __all__ = [
63
69
  "TemporalAwarePruning",
64
70
  "CombinedPruning",
65
71
  "create_default_pruning_strategy",
72
+ "Sequence",
73
+ "sequences_to_dict",
74
+ "dict_to_sequences",
75
+ "to_sequence",
66
76
  "TokenMapper",
67
77
  ]
68
78
 
@@ -35,7 +35,7 @@ import csv
35
35
  import sys
36
36
  import json
37
37
  import logging
38
- from typing import Any, Dict, List, Tuple, Union, Optional, cast
38
+ from typing import Any, List, Tuple, Union, Optional, cast
39
39
 
40
40
  import click
41
41
 
@@ -608,7 +608,7 @@ def main(
608
608
  # Initialize and run GSP algorithm
609
609
  try:
610
610
  gsp = GSP(transactions, mingap=mingap, maxgap=maxgap, maxspan=maxspan, verbose=verbose)
611
- patterns: List[Dict[Tuple[str, ...], int]] = gsp.search(min_support=min_support)
611
+ patterns = gsp.search(min_support=min_support, return_sequences=False)
612
612
  logger.info("Frequent Patterns Found:")
613
613
  for i, level in enumerate(patterns, start=1):
614
614
  logger.info(f"\n{i}-Sequence Patterns:")