gsppy 3.0.1__py3-none-any.whl → 3.2.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gsppy/__init__.py +24 -0
- gsppy/accelerate.py +24 -5
- gsppy/cli.py +1 -1
- gsppy/gsp.py +18 -2
- gsppy/py.typed +0 -0
- gsppy/utils.py +23 -6
- {gsppy-3.0.1.dist-info → gsppy-3.2.7.dist-info}/METADATA +112 -25
- gsppy-3.2.7.dist-info/RECORD +11 -0
- {gsppy-3.0.1.dist-info → gsppy-3.2.7.dist-info}/WHEEL +1 -1
- gsppy-3.0.1.dist-info/RECORD +0 -10
- {gsppy-3.0.1.dist-info → gsppy-3.2.7.dist-info}/entry_points.txt +0 -0
- {gsppy-3.0.1.dist-info → gsppy-3.2.7.dist-info}/licenses/LICENSE +0 -0
gsppy/__init__.py
CHANGED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""Public interface for the :mod:`gsppy` package.
|
|
2
|
+
|
|
3
|
+
This module centralizes the primary entry points, including the :class:`~gsppy.gsp.GSP`
|
|
4
|
+
implementation, CLI helpers for loading transactional data, and the package version string.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from importlib import metadata as importlib_metadata
|
|
8
|
+
|
|
9
|
+
from gsppy.cli import detect_and_read_file, read_transactions_from_csv, read_transactions_from_json, setup_logging
|
|
10
|
+
from gsppy.gsp import GSP
|
|
11
|
+
|
|
12
|
+
try:
|
|
13
|
+
__version__ = importlib_metadata.version("gsppy")
|
|
14
|
+
except importlib_metadata.PackageNotFoundError: # pragma: no cover - handled only in editable installs
|
|
15
|
+
__version__ = "0.0.0"
|
|
16
|
+
|
|
17
|
+
__all__ = [
|
|
18
|
+
"GSP",
|
|
19
|
+
"detect_and_read_file",
|
|
20
|
+
"read_transactions_from_csv",
|
|
21
|
+
"read_transactions_from_json",
|
|
22
|
+
"setup_logging",
|
|
23
|
+
"__version__",
|
|
24
|
+
]
|
gsppy/accelerate.py
CHANGED
|
@@ -21,6 +21,7 @@ from .utils import split_into_batches, is_subsequence_in_list
|
|
|
21
21
|
|
|
22
22
|
# Optional GPU (CuPy) support
|
|
23
23
|
_gpu_available = False
|
|
24
|
+
cp: Any | None = None
|
|
24
25
|
try: # pragma: no cover - optional dependency path
|
|
25
26
|
import cupy as _cp_mod # type: ignore[import-not-found]
|
|
26
27
|
|
|
@@ -126,8 +127,8 @@ def _support_counts_gpu_singletons(
|
|
|
126
127
|
if not flat:
|
|
127
128
|
return []
|
|
128
129
|
|
|
129
|
-
cp_flat = cp.asarray(flat, dtype=cp.int32) # type: ignore[name-defined]
|
|
130
|
-
counts = cp.bincount(cp_flat, minlength=vocab_size) # type: ignore[attr-defined]
|
|
130
|
+
cp_flat = cp.asarray(flat, dtype=cp.int32) # type: ignore[name-defined, union-attr]
|
|
131
|
+
counts = cp.bincount(cp_flat, minlength=vocab_size) # type: ignore[attr-defined, union-attr]
|
|
131
132
|
counts_host: Any = counts.get() # back to host as a NumPy array
|
|
132
133
|
|
|
133
134
|
out: List[Tuple[List[int], int]] = []
|
|
@@ -178,6 +179,17 @@ def support_counts(
|
|
|
178
179
|
fall back to CPU for the rest
|
|
179
180
|
- "python": force pure-Python fallback
|
|
180
181
|
- otherwise: try Rust first and fall back to Python
|
|
182
|
+
|
|
183
|
+
Example:
|
|
184
|
+
Running a search with an explicit backend:
|
|
185
|
+
|
|
186
|
+
```python
|
|
187
|
+
from gsppy.accelerate import support_counts
|
|
188
|
+
|
|
189
|
+
transactions = [("A", "B"), ("A", "C")]
|
|
190
|
+
candidates = [("A",), ("B",), ("A", "B")]
|
|
191
|
+
counts = support_counts(transactions, candidates, min_support_abs=1, backend="python")
|
|
192
|
+
```
|
|
181
193
|
"""
|
|
182
194
|
backend_sel = (backend or _env_backend()).lower()
|
|
183
195
|
|
|
@@ -222,7 +234,8 @@ def support_counts(
|
|
|
222
234
|
try:
|
|
223
235
|
other_enc = [enc for enc, _ in others]
|
|
224
236
|
res = cast(
|
|
225
|
-
List[Tuple[List[int], int]],
|
|
237
|
+
List[Tuple[List[int], int]],
|
|
238
|
+
_compute_supports_rust(enc_tx, other_enc, int(min_support_abs)), # ty:ignore[call-non-callable]
|
|
226
239
|
)
|
|
227
240
|
for enc_cand, freq in res:
|
|
228
241
|
out[tuple(inv_vocab[i] for i in enc_cand)] = int(freq)
|
|
@@ -247,7 +260,10 @@ def support_counts(
|
|
|
247
260
|
# use rust
|
|
248
261
|
enc_tx, inv_vocab, vocab = _get_encoded_transactions(transactions)
|
|
249
262
|
enc_cands = _encode_candidates(candidates, vocab)
|
|
250
|
-
result = cast(
|
|
263
|
+
result = cast(
|
|
264
|
+
List[Tuple[List[int], int]],
|
|
265
|
+
_compute_supports_rust(enc_tx, enc_cands, int(min_support_abs)), # ty:ignore[call-non-callable]
|
|
266
|
+
)
|
|
251
267
|
out_rust: Dict[Tuple[str, ...], int] = {}
|
|
252
268
|
for enc_cand, freq in result:
|
|
253
269
|
out_rust[tuple(inv_vocab[i] for i in enc_cand)] = int(freq)
|
|
@@ -258,7 +274,10 @@ def support_counts(
|
|
|
258
274
|
enc_tx, inv_vocab, vocab = _get_encoded_transactions(transactions)
|
|
259
275
|
enc_cands = _encode_candidates(candidates, vocab)
|
|
260
276
|
try:
|
|
261
|
-
result = cast(
|
|
277
|
+
result = cast(
|
|
278
|
+
List[Tuple[List[int], int]],
|
|
279
|
+
_compute_supports_rust(enc_tx, enc_cands, int(min_support_abs)), # ty:ignore[call-non-callable]
|
|
280
|
+
)
|
|
262
281
|
out2: Dict[Tuple[str, ...], int] = {}
|
|
263
282
|
for enc_cand, freq in result:
|
|
264
283
|
out2[tuple(inv_vocab[i] for i in enc_cand)] = int(freq)
|
gsppy/cli.py
CHANGED
|
@@ -45,7 +45,7 @@ logging.basicConfig(
|
|
|
45
45
|
format="%(message)s", # Simplified to keep CLI output clean
|
|
46
46
|
handlers=[logging.StreamHandler(sys.stdout)],
|
|
47
47
|
)
|
|
48
|
-
logger = logging.getLogger(__name__)
|
|
48
|
+
logger: logging.Logger = logging.getLogger(__name__)
|
|
49
49
|
|
|
50
50
|
|
|
51
51
|
def setup_logging(verbose: bool) -> None:
|
gsppy/gsp.py
CHANGED
|
@@ -95,7 +95,7 @@ from collections import Counter
|
|
|
95
95
|
from gsppy.utils import split_into_batches, is_subsequence_in_list, generate_candidates_from_previous
|
|
96
96
|
from gsppy.accelerate import support_counts as support_counts_accel
|
|
97
97
|
|
|
98
|
-
logger = logging.getLogger(__name__)
|
|
98
|
+
logger: logging.Logger = logging.getLogger(__name__)
|
|
99
99
|
|
|
100
100
|
|
|
101
101
|
class GSP:
|
|
@@ -171,7 +171,7 @@ class GSP:
|
|
|
171
171
|
raise ValueError(msg)
|
|
172
172
|
|
|
173
173
|
logger.info("Pre-processing transactions...")
|
|
174
|
-
self.max_size = max(len(item) for item in raw_transactions)
|
|
174
|
+
self.max_size: int = max(len(item) for item in raw_transactions)
|
|
175
175
|
self.transactions: List[Tuple[str, ...]] = [tuple(transaction) for transaction in raw_transactions]
|
|
176
176
|
counts: Counter[str] = Counter(chain.from_iterable(raw_transactions))
|
|
177
177
|
# Start with singleton candidates (1-sequences)
|
|
@@ -295,6 +295,22 @@ class GSP:
|
|
|
295
295
|
- Information about the algorithm's start, intermediate progress (candidates filtered),
|
|
296
296
|
and completion.
|
|
297
297
|
- Status updates for each iteration until the algorithm terminates.
|
|
298
|
+
|
|
299
|
+
Example:
|
|
300
|
+
Basic usage with the default backend:
|
|
301
|
+
|
|
302
|
+
```python
|
|
303
|
+
from gsppy.gsp import GSP
|
|
304
|
+
|
|
305
|
+
transactions = [
|
|
306
|
+
["Bread", "Milk"],
|
|
307
|
+
["Bread", "Diaper", "Beer", "Eggs"],
|
|
308
|
+
["Milk", "Diaper", "Beer", "Coke"],
|
|
309
|
+
]
|
|
310
|
+
|
|
311
|
+
gsp = GSP(transactions)
|
|
312
|
+
patterns = gsp.search(min_support=0.3)
|
|
313
|
+
```
|
|
298
314
|
"""
|
|
299
315
|
if not 0.0 < min_support <= 1.0:
|
|
300
316
|
raise ValueError("Minimum support must be in the range (0.0, 1.0]")
|
gsppy/py.typed
ADDED
|
File without changes
|
gsppy/utils.py
CHANGED
|
@@ -5,14 +5,14 @@ and generating candidate patterns from previously frequent patterns.
|
|
|
5
5
|
|
|
6
6
|
The key functionalities include:
|
|
7
7
|
1. Splitting a list of items into smaller batches for easier processing.
|
|
8
|
-
2. Checking for the existence of
|
|
8
|
+
2. Checking for the existence of an ordered (non-contiguous) subsequence within a sequence,
|
|
9
9
|
with caching to optimize repeated comparisons.
|
|
10
10
|
3. Generating candidate patterns from a dictionary of frequent patterns
|
|
11
11
|
to support pattern generation tasks in algorithms like sequence mining.
|
|
12
12
|
|
|
13
13
|
Main functionalities:
|
|
14
14
|
- `split_into_batches`: Splits a list of items into smaller batches based on a specified batch size.
|
|
15
|
-
- `is_subsequence_in_list`: Determines if a subsequence exists within another sequence,
|
|
15
|
+
- `is_subsequence_in_list`: Determines if a subsequence exists within another sequence in order,
|
|
16
16
|
using caching to improve performance.
|
|
17
17
|
- `generate_candidates_from_previous`: Generates candidate patterns by joining previously
|
|
18
18
|
identified frequent patterns.
|
|
@@ -46,7 +46,10 @@ def split_into_batches(
|
|
|
46
46
|
@lru_cache(maxsize=None)
|
|
47
47
|
def is_subsequence_in_list(subsequence: Tuple[str, ...], sequence: Tuple[str, ...]) -> bool:
|
|
48
48
|
"""
|
|
49
|
-
Check if a subsequence exists within a sequence as
|
|
49
|
+
Check if a subsequence exists within a sequence as an ordered (non-contiguous) subsequence.
|
|
50
|
+
|
|
51
|
+
This function implements the standard GSP semantics where items in the subsequence
|
|
52
|
+
must appear in the same order in the sequence, but not necessarily contiguously.
|
|
50
53
|
|
|
51
54
|
Parameters:
|
|
52
55
|
subsequence: (tuple): The sequence to search for.
|
|
@@ -54,6 +57,14 @@ def is_subsequence_in_list(subsequence: Tuple[str, ...], sequence: Tuple[str, ..
|
|
|
54
57
|
|
|
55
58
|
Returns:
|
|
56
59
|
bool: True if the subsequence is found, False otherwise.
|
|
60
|
+
|
|
61
|
+
Examples:
|
|
62
|
+
>>> is_subsequence_in_list(('a', 'c'), ('a', 'b', 'c'))
|
|
63
|
+
True
|
|
64
|
+
>>> is_subsequence_in_list(('a', 'c'), ('c', 'a'))
|
|
65
|
+
False
|
|
66
|
+
>>> is_subsequence_in_list(('a', 'b'), ('a', 'b', 'c'))
|
|
67
|
+
True
|
|
57
68
|
"""
|
|
58
69
|
# Handle the case where the subsequence is empty - it should not exist in any sequence
|
|
59
70
|
if not subsequence:
|
|
@@ -61,12 +72,18 @@ def is_subsequence_in_list(subsequence: Tuple[str, ...], sequence: Tuple[str, ..
|
|
|
61
72
|
|
|
62
73
|
len_sub, len_seq = len(subsequence), len(sequence)
|
|
63
74
|
|
|
64
|
-
# Return False if the
|
|
75
|
+
# Return False if the subsequence is longer than the sequence
|
|
65
76
|
if len_sub > len_seq:
|
|
66
77
|
return False
|
|
67
78
|
|
|
68
|
-
# Use
|
|
69
|
-
|
|
79
|
+
# Use two-pointer approach to check if subsequence exists in order
|
|
80
|
+
sub_idx = 0
|
|
81
|
+
for seq_idx in range(len_seq):
|
|
82
|
+
if sequence[seq_idx] == subsequence[sub_idx]:
|
|
83
|
+
sub_idx += 1
|
|
84
|
+
if sub_idx == len_sub:
|
|
85
|
+
return True
|
|
86
|
+
return False
|
|
70
87
|
|
|
71
88
|
|
|
72
89
|
def generate_candidates_from_previous(prev_patterns: Dict[Tuple[str, ...], int]) -> List[Tuple[str, ...]]:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: gsppy
|
|
3
|
-
Version: 3.0.1
|
|
3
|
+
Version: 3.2.7
|
|
4
4
|
Summary: GSP (Generalized Sequence Pattern) algorithm in Python
|
|
5
5
|
Project-URL: Homepage, https://github.com/jacksonpradolima/gsp-py
|
|
6
6
|
Author-email: Jackson Antonio do Prado Lima <jacksonpradolima@gmail.com>
|
|
@@ -41,27 +41,34 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
|
41
41
|
Requires-Python: >=3.10
|
|
42
42
|
Requires-Dist: click>=8.0.0
|
|
43
43
|
Provides-Extra: dev
|
|
44
|
-
Requires-Dist: cython==3.
|
|
45
|
-
Requires-Dist: hatch==1.
|
|
46
|
-
Requires-Dist: hatchling==1.
|
|
47
|
-
Requires-Dist:
|
|
48
|
-
Requires-Dist:
|
|
49
|
-
Requires-Dist:
|
|
50
|
-
Requires-Dist: pytest-benchmark==5.1.0; extra == 'dev'
|
|
44
|
+
Requires-Dist: cython==3.2.3; extra == 'dev'
|
|
45
|
+
Requires-Dist: hatch==1.16.2; extra == 'dev'
|
|
46
|
+
Requires-Dist: hatchling==1.28.0; extra == 'dev'
|
|
47
|
+
Requires-Dist: pylint==4.0.4; extra == 'dev'
|
|
48
|
+
Requires-Dist: pyright==1.1.407; extra == 'dev'
|
|
49
|
+
Requires-Dist: pytest-benchmark==5.2.3; extra == 'dev'
|
|
51
50
|
Requires-Dist: pytest-cov==7.0.0; extra == 'dev'
|
|
52
|
-
Requires-Dist: pytest==
|
|
53
|
-
Requires-Dist: ruff==0.
|
|
54
|
-
Requires-Dist: tox==4.
|
|
51
|
+
Requires-Dist: pytest==9.0.2; extra == 'dev'
|
|
52
|
+
Requires-Dist: ruff==0.14.10; extra == 'dev'
|
|
53
|
+
Requires-Dist: tox==4.32.0; extra == 'dev'
|
|
54
|
+
Requires-Dist: ty==0.0.8; extra == 'dev'
|
|
55
|
+
Provides-Extra: docs
|
|
56
|
+
Requires-Dist: mkdocs-gen-files<1,>=0.5; extra == 'docs'
|
|
57
|
+
Requires-Dist: mkdocs-literate-nav<1,>=0.6; extra == 'docs'
|
|
58
|
+
Requires-Dist: mkdocs-material<10,>=9.5; extra == 'docs'
|
|
59
|
+
Requires-Dist: mkdocs<2,>=1.6; extra == 'docs'
|
|
60
|
+
Requires-Dist: mkdocstrings[python]<0.27,>=0.26; extra == 'docs'
|
|
55
61
|
Provides-Extra: gpu
|
|
56
62
|
Requires-Dist: cupy<14,>=11; extra == 'gpu'
|
|
57
63
|
Provides-Extra: rust
|
|
58
|
-
Requires-Dist: maturin==1.
|
|
64
|
+
Requires-Dist: maturin==1.10.2; extra == 'rust'
|
|
59
65
|
Description-Content-Type: text/markdown
|
|
60
66
|
|
|
61
67
|
[]()
|
|
62
68
|

|
|
63
69
|
[](https://doi.org/10.5281/zenodo.3333987)
|
|
64
70
|
|
|
71
|
+
[](https://jacksonpradolima.github.io/gsp-py/)
|
|
65
72
|
[](https://pypi.org/project/gsppy/)
|
|
66
73
|
[](https://sonarcloud.io/summary/new_code?id=jacksonpradolima_gsp-py)
|
|
67
74
|
[](https://sonarcloud.io/summary/new_code?id=jacksonpradolima_gsp-py)
|
|
@@ -87,13 +94,15 @@ Sequence Pattern (GSP)** algorithm. Ideal for market basket analysis, temporal m
|
|
|
87
94
|
- [❖ Clone Repository](#option-1-clone-the-repository)
|
|
88
95
|
- [❖ Install via PyPI](#option-2-install-via-pip)
|
|
89
96
|
4. [🛠️ Developer Installation](#developer-installation)
|
|
90
|
-
5. [💡 Usage](#usage)
|
|
97
|
+
5. [📖 Documentation](#documentation)
|
|
98
|
+
6. [💡 Usage](#usage)
|
|
91
99
|
- [✅ Example: Analyzing Sales Data](#example-analyzing-sales-data)
|
|
92
100
|
- [📊 Explanation: Support and Results](#explanation-support-and-results)
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
101
|
+
7. [⌨️ Typing](#typing)
|
|
102
|
+
8. [🌟 Planned Features](#planned-features)
|
|
103
|
+
9. [🤝 Contributing](#contributing)
|
|
104
|
+
10. [📝 License](#license)
|
|
105
|
+
11. [📖 Citation](#citation)
|
|
97
106
|
|
|
98
107
|
---
|
|
99
108
|
|
|
@@ -104,14 +113,15 @@ principles**. Using support thresholds, GSP identifies frequent sequences of ite
|
|
|
104
113
|
|
|
105
114
|
### Key Features:
|
|
106
115
|
|
|
116
|
+
- **Ordered (non-contiguous) matching**: Detects patterns where items appear in order but not necessarily adjacent, following standard GSP semantics. For example, the pattern `('A', 'C')` is found in the sequence `['A', 'B', 'C']`.
|
|
107
117
|
- **Support-based pruning**: Only retains sequences that meet the minimum support threshold.
|
|
108
118
|
- **Candidate generation**: Iteratively generates candidate sequences of increasing length.
|
|
109
119
|
- **General-purpose**: Useful in retail, web analytics, social networks, temporal sequence mining, and more.
|
|
110
120
|
|
|
111
121
|
For example:
|
|
112
122
|
|
|
113
|
-
- In a shopping dataset, GSP can identify patterns like "Customers who buy bread and milk often purchase diapers next."
|
|
114
|
-
- In a website clickstream, GSP might find patterns like "Users visit A, then go to
|
|
123
|
+
- In a shopping dataset, GSP can identify patterns like "Customers who buy bread and milk often purchase diapers next" - even if other items appear between bread and milk.
|
|
124
|
+
- In a website clickstream, GSP might find patterns like "Users visit A, then eventually go to C" - capturing user journeys with intermediate steps.
|
|
115
125
|
|
|
116
126
|
---
|
|
117
127
|
|
|
@@ -244,7 +254,7 @@ make install # sync deps (from uv.lock) + install project (-e .)
|
|
|
244
254
|
make test # pytest -n auto
|
|
245
255
|
make lint # ruff check .
|
|
246
256
|
make format # ruff --fix
|
|
247
|
-
make typecheck # pyright
|
|
257
|
+
make typecheck # pyright + ty
|
|
248
258
|
make pre-commit-install # install the pre-commit hook
|
|
249
259
|
make pre-commit-run # run pre-commit on all files
|
|
250
260
|
|
|
@@ -258,6 +268,41 @@ make bench-big # run large benchmark
|
|
|
258
268
|
> [!NOTE]
|
|
259
269
|
> Tox in this project uses the "tox-uv" plugin. When running `make tox` or `tox`, missing Python interpreters can be provisioned automatically via uv (no need to pre-install all versions). This makes local setup faster.
|
|
260
270
|
|
|
271
|
+
## 🔏 Release assets and verification
|
|
272
|
+
|
|
273
|
+
Every GitHub release bundles artifacts to help you validate what you download:
|
|
274
|
+
|
|
275
|
+
- Built wheels and source distributions produced by the automated publish workflow.
|
|
276
|
+
- `sbom.json` (CycloneDX) generated with [Syft](https://github.com/anchore/syft).
|
|
277
|
+
- Sigstore-generated `.sig` and `.pem` files for each artifact, created using GitHub OIDC identity.
|
|
278
|
+
|
|
279
|
+
To verify a downloaded artifact from a release:
|
|
280
|
+
|
|
281
|
+
```bash
|
|
282
|
+
python -m pip install sigstore # installs the CLI
|
|
283
|
+
sigstore verify identity \
|
|
284
|
+
--certificate gsppy-<version>-py3-none-any.whl.pem \
|
|
285
|
+
--signature gsppy-<version>-py3-none-any.whl.sig \
|
|
286
|
+
--cert-identity "https://github.com/jacksonpradolima/gsp-py/.github/workflows/publish.yml@refs/tags/v<version>" \
|
|
287
|
+
--cert-oidc-issuer https://token.actions.githubusercontent.com \
|
|
288
|
+
gsppy-<version>-py3-none-any.whl
|
|
289
|
+
```
|
|
290
|
+
|
|
291
|
+
Replace `<version>` with the numeric package version (for example, `3.1.1`) in the filenames; in `--cert-identity`, this becomes `v<version>` (for example, `v3.1.1`). Adjust the filenames for the sdist (`.tar.gz`) if preferred. The same release page also hosts `sbom.json` for supply-chain inspection.
|
|
292
|
+
|
|
293
|
+
## 📖 Documentation
|
|
294
|
+
|
|
295
|
+
- **Live site:** https://jacksonpradolima.github.io/gsp-py/
|
|
296
|
+
- **Build locally:**
|
|
297
|
+
|
|
298
|
+
```bash
|
|
299
|
+
uv venv .venv
|
|
300
|
+
uv sync --extra docs
|
|
301
|
+
uv run mkdocs serve
|
|
302
|
+
```
|
|
303
|
+
|
|
304
|
+
The docs use MkDocs with the Material theme and mkdocstrings to render the Python API directly from docstrings.
|
|
305
|
+
|
|
261
306
|
## 💡 Usage
|
|
262
307
|
|
|
263
308
|
The library is designed to be easy to use and integrate with your own projects. You can use GSP-Py either programmatically (Python API) or directly from the command line (CLI).
|
|
@@ -427,30 +472,72 @@ Sample Output:
|
|
|
427
472
|
```python
|
|
428
473
|
[
|
|
429
474
|
{('Bread',): 4, ('Milk',): 4, ('Diaper',): 4, ('Beer',): 3, ('Coke',): 2},
|
|
430
|
-
{('Bread', 'Milk'): 3, ('Milk', 'Diaper'): 3, ('Diaper', 'Beer'): 3},
|
|
431
|
-
{('Bread', 'Milk', 'Diaper'): 2, ('Milk', 'Diaper', 'Beer'): 2}
|
|
475
|
+
{('Bread', 'Milk'): 3, ('Bread', 'Diaper'): 3, ('Bread', 'Beer'): 2, ('Milk', 'Diaper'): 3, ('Milk', 'Beer'): 2, ('Milk', 'Coke'): 2, ('Diaper', 'Beer'): 3, ('Diaper', 'Coke'): 2},
|
|
476
|
+
{('Bread', 'Milk', 'Diaper'): 2, ('Bread', 'Diaper', 'Beer'): 2, ('Milk', 'Diaper', 'Beer'): 2, ('Milk', 'Diaper', 'Coke'): 2}
|
|
432
477
|
]
|
|
433
478
|
```
|
|
434
479
|
|
|
435
480
|
- The **first dictionary** contains single-item sequences with their frequencies (e.g., `('Bread',): 4` means "Bread"
|
|
436
481
|
appears in 4 transactions).
|
|
437
482
|
- The **second dictionary** contains 2-item sequential patterns (e.g., `('Bread', 'Milk'): 3` means the sequence "
|
|
438
|
-
Bread → Milk" appears in 3 transactions).
|
|
483
|
+
Bread → Milk" appears in 3 transactions). Note that patterns like `('Bread', 'Beer')` are detected even when they don't appear adjacent in transactions - they just need to appear in order.
|
|
439
484
|
- The **third dictionary** contains 3-item sequential patterns (e.g., `('Bread', 'Milk', 'Diaper'): 2` means the
|
|
440
485
|
sequence "Bread → Milk → Diaper" appears in 2 transactions).
|
|
441
486
|
|
|
442
487
|
> [!NOTE]
|
|
443
|
-
> The **support** of a sequence is calculated as the fraction of transactions containing the sequence, e.g.,
|
|
444
|
-
`
|
|
488
|
+
> The **support** of a sequence is calculated as the fraction of transactions containing the sequence **in order** (not necessarily contiguously), e.g.,
|
|
489
|
+
`('Bread', 'Milk')` appears in 3 out of 5 transactions → Support = `3 / 5 = 0.6` (60%).
|
|
445
490
|
> This insight helps identify frequently occurring sequential patterns in datasets, such as shopping trends or user
|
|
446
491
|
> behavior.
|
|
447
492
|
|
|
493
|
+
> [!IMPORTANT]
|
|
494
|
+
> **Non-contiguous (ordered) matching**: GSP-Py detects patterns where items appear in the specified order but not necessarily adjacent. For example, the pattern `('Bread', 'Beer')` matches the transaction `['Bread', 'Milk', 'Diaper', 'Beer']` because Bread appears before Beer, even though they are not adjacent. This follows the standard GSP algorithm semantics for sequential pattern mining.
|
|
495
|
+
|
|
496
|
+
### Understanding Non-Contiguous Pattern Matching
|
|
497
|
+
|
|
498
|
+
GSP-Py follows the standard GSP algorithm semantics by detecting **ordered (non-contiguous)** subsequences. This means:
|
|
499
|
+
|
|
500
|
+
- ✅ **Order matters**: Items must appear in the specified sequence order
|
|
501
|
+
- ✅ **Gaps allowed**: Items don't need to be adjacent
|
|
502
|
+
- ❌ **Wrong order rejected**: Items appearing in different order won't match
|
|
503
|
+
|
|
504
|
+
**Example:**
|
|
505
|
+
|
|
506
|
+
```python
|
|
507
|
+
from gsppy.gsp import GSP
|
|
508
|
+
|
|
509
|
+
sequences = [
|
|
510
|
+
['a', 'b', 'c'], # Contains: (a,b), (a,c), (b,c), (a,b,c)
|
|
511
|
+
['a', 'c'], # Contains: (a,c)
|
|
512
|
+
['b', 'c', 'a'], # Contains: (b,c), (b,a), (c,a)
|
|
513
|
+
['a', 'b', 'c', 'd'], # Contains: (a,b), (a,c), (a,d), (b,c), (b,d), (c,d), etc.
|
|
514
|
+
]
|
|
515
|
+
|
|
516
|
+
gsp = GSP(sequences)
|
|
517
|
+
result = gsp.search(min_support=0.5) # Need at least 2/4 sequences
|
|
518
|
+
|
|
519
|
+
# Pattern ('a', 'c') is found with support=3 because:
|
|
520
|
+
# - It appears in ['a', 'b', 'c'] (with 'b' in between)
|
|
521
|
+
# - It appears in ['a', 'c'] (adjacent)
|
|
522
|
+
# - It appears in ['a', 'b', 'c', 'd'] (with 'b' in between)
|
|
523
|
+
# Total: 3 out of 4 sequences = 75% support ✅
|
|
524
|
+
```
|
|
525
|
+
|
|
448
526
|
|
|
449
527
|
> [!TIP]
|
|
450
528
|
> For more complex examples, find example scripts in the [`gsppy/tests`](gsppy/tests) folder.
|
|
451
529
|
|
|
452
530
|
---
|
|
453
531
|
|
|
532
|
+
## ⌨️ Typing
|
|
533
|
+
|
|
534
|
+
`gsppy` ships inline type information (PEP 561) via a bundled `py.typed` marker. The public API is re-exported from
|
|
535
|
+
`gsppy` directly—import `GSP` for programmatic use or reuse the CLI helpers (`detect_and_read_file`,
|
|
536
|
+
`read_transactions_from_json`, `read_transactions_from_csv`, and `setup_logging`) when embedding the tool in
|
|
537
|
+
larger applications.
|
|
538
|
+
|
|
539
|
+
---
|
|
540
|
+
|
|
454
541
|
## 🌟 Planned Features
|
|
455
542
|
|
|
456
543
|
We are actively working to improve GSP-Py. Here are some exciting features planned for future releases:
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
gsppy/__init__.py,sha256=FcWEYkzMCiqIBmc4yhgIXFKzvSNjJA7LX7juUabvoJ4,784
|
|
2
|
+
gsppy/accelerate.py,sha256=2I3IA42FyPZvfwc0-f0bovZ8YgbdvJXj0qDlYWSWiXI,10998
|
|
3
|
+
gsppy/cli.py,sha256=W5udAPKOjlxi-c-RKcz5HW-sDgoap4ojHD87bd-X498,6583
|
|
4
|
+
gsppy/gsp.py,sha256=aCtPrldVNCkwj6wwytrZzbayYKkXi9Om-3xzrHUMkLQ,15293
|
|
5
|
+
gsppy/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
|
+
gsppy/utils.py,sha256=KtjfDgsTwvwxIyA2KCQmgu8cFkBqQvMZN8Ct5NB60Tc,3952
|
|
7
|
+
gsppy-3.2.7.dist-info/METADATA,sha256=H7qZ7b0DGtca_pA9uiY0fijQTvPmsFeHHGg9fzKc6V0,22130
|
|
8
|
+
gsppy-3.2.7.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
9
|
+
gsppy-3.2.7.dist-info/entry_points.txt,sha256=smvmcIWk424ARIGKOC_BM42hpT_SptKPcIeqs-8u8lM,41
|
|
10
|
+
gsppy-3.2.7.dist-info/licenses/LICENSE,sha256=AlXanKSqFzo_o-87gp3Qw3XzbmnfxYy7O0xJOcQGWJo,1086
|
|
11
|
+
gsppy-3.2.7.dist-info/RECORD,,
|
gsppy-3.0.1.dist-info/RECORD
DELETED
|
@@ -1,10 +0,0 @@
|
|
|
1
|
-
gsppy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
-
gsppy/accelerate.py,sha256=YO3YQFzo2VAC6IXOTnQnOajkZO7SabkieGb1IPgWdSI,10407
|
|
3
|
-
gsppy/cli.py,sha256=wsGoc_utxpRfgCF9vPOAyLDTOJZ8NaiwiUny5VyIYvQ,6567
|
|
4
|
-
gsppy/gsp.py,sha256=GCHFhOu-DyHEPsse_OXzf9IaZoigF8ouRqgn_OsZBvA,14855
|
|
5
|
-
gsppy/utils.py,sha256=YlV0F64lnd2Xymf6XnYr6mMLYWV2f2yjaHkZbAS1Qs0,3362
|
|
6
|
-
gsppy-3.0.1.dist-info/METADATA,sha256=vt35btl69hnEM4R1Kz-U5m-2MHEN5hdkHBk-uJ2eKAw,17670
|
|
7
|
-
gsppy-3.0.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
8
|
-
gsppy-3.0.1.dist-info/entry_points.txt,sha256=smvmcIWk424ARIGKOC_BM42hpT_SptKPcIeqs-8u8lM,41
|
|
9
|
-
gsppy-3.0.1.dist-info/licenses/LICENSE,sha256=AlXanKSqFzo_o-87gp3Qw3XzbmnfxYy7O0xJOcQGWJo,1086
|
|
10
|
-
gsppy-3.0.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|