bm-preprocessing 0.2.1__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bm_preprocessing-0.3.0/.gitignore +24 -0
- {bm_preprocessing-0.2.1 → bm_preprocessing-0.3.0}/PKG-INFO +4 -1
- bm_preprocessing-0.3.0/USAGE.md +96 -0
- {bm_preprocessing-0.2.1 → bm_preprocessing-0.3.0}/pyproject.toml +4 -1
- {bm_preprocessing-0.2.1 → bm_preprocessing-0.3.0}/src/bm_preprocessing/DM/__init__.py +3 -1
- {bm_preprocessing-0.2.1 → bm_preprocessing-0.3.0}/src/bm_preprocessing/DM/all.py +5 -5
- {bm_preprocessing-0.2.1 → bm_preprocessing-0.3.0}/src/bm_preprocessing/DM/apriori.py +5 -5
- {bm_preprocessing-0.2.1 → bm_preprocessing-0.3.0}/src/bm_preprocessing/DM/hash.py +5 -5
- bm_preprocessing-0.3.0/src/bm_preprocessing/DM/hunts.py +30 -0
- bm_preprocessing-0.3.0/src/bm_preprocessing/DM/hunts_test.py +30 -0
- {bm_preprocessing-0.2.1 → bm_preprocessing-0.3.0}/src/bm_preprocessing/DM/preprocessing.py +5 -5
- {bm_preprocessing-0.2.1 → bm_preprocessing-0.3.0}/src/bm_preprocessing/DM/sources/all.py +16 -14
- {bm_preprocessing-0.2.1 → bm_preprocessing-0.3.0}/src/bm_preprocessing/DM/sources/apriori.py +18 -12
- bm_preprocessing-0.3.0/src/bm_preprocessing/DM/sources/data.csv +11 -0
- {bm_preprocessing-0.2.1 → bm_preprocessing-0.3.0}/src/bm_preprocessing/DM/sources/hash.py +33 -17
- bm_preprocessing-0.3.0/src/bm_preprocessing/DM/sources/hunts.py +96 -0
- bm_preprocessing-0.3.0/src/bm_preprocessing/DM/sources/hunts_test.py +101 -0
- {bm_preprocessing-0.2.1 → bm_preprocessing-0.3.0}/src/bm_preprocessing/DM/sources/preprocessing.py +12 -5
- {bm_preprocessing-0.2.1 → bm_preprocessing-0.3.0}/src/bm_preprocessing/IR/all.py +5 -5
- {bm_preprocessing-0.2.1 → bm_preprocessing-0.3.0}/src/bm_preprocessing/IR/sources/all.py +32 -20
- {bm_preprocessing-0.2.1 → bm_preprocessing-0.3.0}/src/bm_preprocessing/__init__.py +1 -2
- bm_preprocessing-0.2.1/.gitignore +0 -10
- bm_preprocessing-0.2.1/uv.lock +0 -1100
- {bm_preprocessing-0.2.1 → bm_preprocessing-0.3.0}/README.md +0 -0
- {bm_preprocessing-0.2.1 → bm_preprocessing-0.3.0}/src/bm_preprocessing/IR/__init__.py +0 -0
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*.egg-info/
|
|
5
|
+
dist/
|
|
6
|
+
build/
|
|
7
|
+
*.egg
|
|
8
|
+
|
|
9
|
+
# Virtual environment
|
|
10
|
+
.venv/
|
|
11
|
+
|
|
12
|
+
# IDE
|
|
13
|
+
.vscode/
|
|
14
|
+
.idea/
|
|
15
|
+
|
|
16
|
+
# Generated images
|
|
17
|
+
*.png
|
|
18
|
+
|
|
19
|
+
# OS files
|
|
20
|
+
Thumbs.db
|
|
21
|
+
.DS_Store
|
|
22
|
+
|
|
23
|
+
# UV
|
|
24
|
+
uv.lock
|
|
@@ -1,9 +1,12 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: bm-preprocessing
|
|
3
|
-
Version: 0.2.1
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: A package to preprocess text data
|
|
5
5
|
Requires-Python: >=3.8
|
|
6
6
|
Requires-Dist: build>=1.2.2.post1
|
|
7
|
+
Requires-Dist: graphviz>=0.20.3
|
|
8
|
+
Requires-Dist: matplotlib>=3.7.5
|
|
9
|
+
Requires-Dist: pandas>=2.0.3
|
|
7
10
|
Requires-Dist: twine>=6.1.0
|
|
8
11
|
Description-Content-Type: text/markdown
|
|
9
12
|
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
# bm-preprocessing Usage Guide
|
|
2
|
+
|
|
3
|
+
## Installation
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
pip install bm-preprocessing
|
|
7
|
+
```
|
|
8
|
+
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
## Usage in Python File
|
|
12
|
+
|
|
13
|
+
Create a file `example.py`:
|
|
14
|
+
|
|
15
|
+
```python
|
|
16
|
+
# Import modules
|
|
17
|
+
from bm_preprocessing.IR import all
|
|
18
|
+
from bm_preprocessing.DM import apriori, hash, hunts, hunts_test, preprocessing
|
|
19
|
+
|
|
20
|
+
# Print the source code
|
|
21
|
+
print("=== IR All Module ===")
|
|
22
|
+
print(all)
|
|
23
|
+
|
|
24
|
+
print("\n=== DM Apriori Module ===")
|
|
25
|
+
print(apriori)
|
|
26
|
+
|
|
27
|
+
print("\n=== DM Hash Module ===")
|
|
28
|
+
print(hash)
|
|
29
|
+
|
|
30
|
+
print("\n=== DM Hunts Module ===")
|
|
31
|
+
print(hunts)
|
|
32
|
+
|
|
33
|
+
print("\n=== DM Hunts Test Module ===")
|
|
34
|
+
print(hunts_test)
|
|
35
|
+
|
|
36
|
+
print("\n=== DM Preprocessing Module ===")
|
|
37
|
+
print(preprocessing)
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
Run it:
|
|
41
|
+
```bash
|
|
42
|
+
python example.py
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
---
|
|
46
|
+
|
|
47
|
+
## Usage in Terminal (Interactive Python)
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
python
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
Then in the Python REPL:
|
|
54
|
+
|
|
55
|
+
```python
|
|
56
|
+
>>> from bm_preprocessing.IR import all
|
|
57
|
+
>>> print(all)
|
|
58
|
+
# Prints entire IR/all.py source code
|
|
59
|
+
|
|
60
|
+
>>> from bm_preprocessing.DM import apriori
|
|
61
|
+
>>> print(apriori)
|
|
62
|
+
# Prints entire DM/apriori.py source code
|
|
63
|
+
|
|
64
|
+
>>> from bm_preprocessing.DM import hunts, hunts_test
|
|
65
|
+
>>> print(hunts)
|
|
66
|
+
# Prints entire DM/hunts.py source code
|
|
67
|
+
>>> print(hunts_test)
|
|
68
|
+
# Prints entire DM/hunts_test.py source code
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
---
|
|
72
|
+
|
|
73
|
+
## One-liner in Terminal
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
python -c "from bm_preprocessing.IR import all; print(all)"
|
|
77
|
+
python -c "from bm_preprocessing.DM import apriori; print(apriori)"
|
|
78
|
+
python -c "from bm_preprocessing.DM import hash; print(hash)"
|
|
79
|
+
python -c "from bm_preprocessing.DM import hunts; print(hunts)"
|
|
80
|
+
python -c "from bm_preprocessing.DM import hunts_test; print(hunts_test)"
|
|
81
|
+
python -c "from bm_preprocessing.DM import preprocessing; print(preprocessing)"
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
---
|
|
85
|
+
|
|
86
|
+
## Available Modules
|
|
87
|
+
|
|
88
|
+
| Import | Description |
|
|
89
|
+
|--------|-------------|
|
|
90
|
+
| `from bm_preprocessing.IR import all` | Information Retrieval (BM25, TF-IDF, Boolean) |
|
|
91
|
+
| `from bm_preprocessing.DM import all` | Data Mining algorithms |
|
|
92
|
+
| `from bm_preprocessing.DM import apriori` | Apriori algorithm |
|
|
93
|
+
| `from bm_preprocessing.DM import hash` | Hash-based mining |
|
|
94
|
+
| `from bm_preprocessing.DM import hunts` | Hunt's decision tree algorithm |
|
|
95
|
+
| `from bm_preprocessing.DM import hunts_test` | Hunt's decision tree with visualization |
|
|
96
|
+
| `from bm_preprocessing.DM import preprocessing` | Data preprocessing utilities |
|
|
@@ -4,12 +4,15 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "bm-preprocessing"
|
|
7
|
-
version = "0.2.1"
|
|
7
|
+
version = "0.3.0"
|
|
8
8
|
description = "A package to preprocess text data"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.8"
|
|
11
11
|
dependencies = [
|
|
12
12
|
"build>=1.2.2.post1",
|
|
13
|
+
"graphviz>=0.20.3",
|
|
14
|
+
"matplotlib>=3.7.5",
|
|
15
|
+
"pandas>=2.0.3",
|
|
13
16
|
"twine>=6.1.0",
|
|
14
17
|
]
|
|
15
18
|
|
|
@@ -3,6 +3,8 @@
|
|
|
3
3
|
from .all import all
|
|
4
4
|
from .apriori import apriori
|
|
5
5
|
from .hash import hash
|
|
6
|
+
from .hunts import hunts
|
|
7
|
+
from .hunts_test import hunts_test
|
|
6
8
|
from .preprocessing import preprocessing
|
|
7
9
|
|
|
8
|
-
__all__ = ["all", "apriori", "hash", "preprocessing"]
|
|
10
|
+
__all__ = ["all", "apriori", "hash", "hunts", "hunts_test", "preprocessing"]
|
|
@@ -5,22 +5,22 @@ from pathlib import Path
|
|
|
5
5
|
|
|
6
6
|
class SourceCodeModule:
|
|
7
7
|
"""A class that displays source code when printed."""
|
|
8
|
-
|
|
8
|
+
|
|
9
9
|
def __init__(self, name: str, source_path: Path):
|
|
10
10
|
self.name = name
|
|
11
11
|
self._source_path = source_path
|
|
12
12
|
self._source_code = None
|
|
13
|
-
|
|
13
|
+
|
|
14
14
|
@property
|
|
15
15
|
def source_code(self) -> str:
|
|
16
16
|
"""Lazily load source code."""
|
|
17
17
|
if self._source_code is None:
|
|
18
|
-
self._source_code = self._source_path.read_text(encoding='utf-8')
|
|
18
|
+
self._source_code = self._source_path.read_text(encoding="utf-8")
|
|
19
19
|
return self._source_code
|
|
20
|
-
|
|
20
|
+
|
|
21
21
|
def __repr__(self) -> str:
|
|
22
22
|
return self.source_code
|
|
23
|
-
|
|
23
|
+
|
|
24
24
|
def __str__(self) -> str:
|
|
25
25
|
return self.source_code
|
|
26
26
|
|
|
@@ -5,22 +5,22 @@ from pathlib import Path
|
|
|
5
5
|
|
|
6
6
|
class SourceCodeModule:
|
|
7
7
|
"""A class that displays source code when printed."""
|
|
8
|
-
|
|
8
|
+
|
|
9
9
|
def __init__(self, name: str, source_path: Path):
|
|
10
10
|
self.name = name
|
|
11
11
|
self._source_path = source_path
|
|
12
12
|
self._source_code = None
|
|
13
|
-
|
|
13
|
+
|
|
14
14
|
@property
|
|
15
15
|
def source_code(self) -> str:
|
|
16
16
|
"""Lazily load source code."""
|
|
17
17
|
if self._source_code is None:
|
|
18
|
-
self._source_code = self._source_path.read_text(encoding='utf-8')
|
|
18
|
+
self._source_code = self._source_path.read_text(encoding="utf-8")
|
|
19
19
|
return self._source_code
|
|
20
|
-
|
|
20
|
+
|
|
21
21
|
def __repr__(self) -> str:
|
|
22
22
|
return self.source_code
|
|
23
|
-
|
|
23
|
+
|
|
24
24
|
def __str__(self) -> str:
|
|
25
25
|
return self.source_code
|
|
26
26
|
|
|
@@ -5,22 +5,22 @@ from pathlib import Path
|
|
|
5
5
|
|
|
6
6
|
class SourceCodeModule:
|
|
7
7
|
"""A class that displays source code when printed."""
|
|
8
|
-
|
|
8
|
+
|
|
9
9
|
def __init__(self, name: str, source_path: Path):
|
|
10
10
|
self.name = name
|
|
11
11
|
self._source_path = source_path
|
|
12
12
|
self._source_code = None
|
|
13
|
-
|
|
13
|
+
|
|
14
14
|
@property
|
|
15
15
|
def source_code(self) -> str:
|
|
16
16
|
"""Lazily load source code."""
|
|
17
17
|
if self._source_code is None:
|
|
18
|
-
self._source_code = self._source_path.read_text(encoding='utf-8')
|
|
18
|
+
self._source_code = self._source_path.read_text(encoding="utf-8")
|
|
19
19
|
return self._source_code
|
|
20
|
-
|
|
20
|
+
|
|
21
21
|
def __repr__(self) -> str:
|
|
22
22
|
return self.source_code
|
|
23
|
-
|
|
23
|
+
|
|
24
24
|
def __str__(self) -> str:
|
|
25
25
|
return self.source_code
|
|
26
26
|
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""Source code loader for DM/hunts.py"""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class SourceCodeModule:
|
|
7
|
+
"""A class that displays source code when printed."""
|
|
8
|
+
|
|
9
|
+
def __init__(self, name: str, source_path: Path):
|
|
10
|
+
self.name = name
|
|
11
|
+
self._source_path = source_path
|
|
12
|
+
self._source_code = None
|
|
13
|
+
|
|
14
|
+
@property
|
|
15
|
+
def source_code(self) -> str:
|
|
16
|
+
"""Lazily load source code."""
|
|
17
|
+
if self._source_code is None:
|
|
18
|
+
self._source_code = self._source_path.read_text(encoding="utf-8")
|
|
19
|
+
return self._source_code
|
|
20
|
+
|
|
21
|
+
def __repr__(self) -> str:
|
|
22
|
+
return self.source_code
|
|
23
|
+
|
|
24
|
+
def __str__(self) -> str:
|
|
25
|
+
return self.source_code
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
# Get the path to the source file
|
|
29
|
+
_source_file = Path(__file__).parent / "sources" / "hunts.py"
|
|
30
|
+
hunts = SourceCodeModule("DM.hunts", _source_file)
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""Source code loader for DM/hunts_test.py"""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class SourceCodeModule:
|
|
7
|
+
"""A class that displays source code when printed."""
|
|
8
|
+
|
|
9
|
+
def __init__(self, name: str, source_path: Path):
|
|
10
|
+
self.name = name
|
|
11
|
+
self._source_path = source_path
|
|
12
|
+
self._source_code = None
|
|
13
|
+
|
|
14
|
+
@property
|
|
15
|
+
def source_code(self) -> str:
|
|
16
|
+
"""Lazily load source code."""
|
|
17
|
+
if self._source_code is None:
|
|
18
|
+
self._source_code = self._source_path.read_text(encoding="utf-8")
|
|
19
|
+
return self._source_code
|
|
20
|
+
|
|
21
|
+
def __repr__(self) -> str:
|
|
22
|
+
return self.source_code
|
|
23
|
+
|
|
24
|
+
def __str__(self) -> str:
|
|
25
|
+
return self.source_code
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
# Get the path to the source file
|
|
29
|
+
_source_file = Path(__file__).parent / "sources" / "hunts_test.py"
|
|
30
|
+
hunts_test = SourceCodeModule("DM.hunts_test", _source_file)
|
|
@@ -5,22 +5,22 @@ from pathlib import Path
|
|
|
5
5
|
|
|
6
6
|
class SourceCodeModule:
|
|
7
7
|
"""A class that displays source code when printed."""
|
|
8
|
-
|
|
8
|
+
|
|
9
9
|
def __init__(self, name: str, source_path: Path):
|
|
10
10
|
self.name = name
|
|
11
11
|
self._source_path = source_path
|
|
12
12
|
self._source_code = None
|
|
13
|
-
|
|
13
|
+
|
|
14
14
|
@property
|
|
15
15
|
def source_code(self) -> str:
|
|
16
16
|
"""Lazily load source code."""
|
|
17
17
|
if self._source_code is None:
|
|
18
|
-
self._source_code = self._source_path.read_text(encoding='utf-8')
|
|
18
|
+
self._source_code = self._source_path.read_text(encoding="utf-8")
|
|
19
19
|
return self._source_code
|
|
20
|
-
|
|
20
|
+
|
|
21
21
|
def __repr__(self) -> str:
|
|
22
22
|
return self.source_code
|
|
23
|
-
|
|
23
|
+
|
|
24
24
|
def __str__(self) -> str:
|
|
25
25
|
return self.source_code
|
|
26
26
|
|
|
@@ -1,21 +1,21 @@
|
|
|
1
|
-
from itertools import combinations, chain
|
|
2
1
|
from collections import defaultdict
|
|
2
|
+
from itertools import chain, combinations
|
|
3
3
|
|
|
4
4
|
min_support = 2
|
|
5
5
|
min_conf = 0.7
|
|
6
6
|
|
|
7
7
|
transactions = {
|
|
8
|
-
"T1": {"I1","I2","I4","I5","I6"},
|
|
9
|
-
"T2": {"I2","I4","I6"},
|
|
10
|
-
"T3": {"I2","I3"},
|
|
11
|
-
"T4": {"I1","I2","I4"},
|
|
12
|
-
"T5": {"I1","I2","I3"},
|
|
13
|
-
"T6": {"I2","I3"},
|
|
14
|
-
"T7": {"I1","I3"},
|
|
15
|
-
"T8": {"I1","I2","I3","I5"},
|
|
16
|
-
"T9": {"I1","I2","I3"},
|
|
17
|
-
"T10": {"I1","I2","I4","I5"},
|
|
18
|
-
"T11": {"I5","I6"}
|
|
8
|
+
"T1": {"I1", "I2", "I4", "I5", "I6"},
|
|
9
|
+
"T2": {"I2", "I4", "I6"},
|
|
10
|
+
"T3": {"I2", "I3"},
|
|
11
|
+
"T4": {"I1", "I2", "I4"},
|
|
12
|
+
"T5": {"I1", "I2", "I3"},
|
|
13
|
+
"T6": {"I2", "I3"},
|
|
14
|
+
"T7": {"I1", "I3"},
|
|
15
|
+
"T8": {"I1", "I2", "I3", "I5"},
|
|
16
|
+
"T9": {"I1", "I2", "I3"},
|
|
17
|
+
"T10": {"I1", "I2", "I4", "I5"},
|
|
18
|
+
"T11": {"I5", "I6"},
|
|
19
19
|
}
|
|
20
20
|
|
|
21
21
|
genL = lambda C: {k: v for k, v in C.items() if v >= min_support}
|
|
@@ -85,13 +85,15 @@ for k, v in C.items():
|
|
|
85
85
|
"Empty" if not v else "\n".join(f"{set(x)} : {y}" for x, y in v.items()),
|
|
86
86
|
)
|
|
87
87
|
for k, v in L.items():
|
|
88
|
-
print(f"\nL{k}:\n", "Empty" if not v else "\n".join(f"{set(x)} : {v[x]}" for x in v))
|
|
88
|
+
print(
|
|
89
|
+
f"\nL{k}:\n", "Empty" if not v else "\n".join(f"{set(x)} : {v[x]}" for x in v)
|
|
90
|
+
)
|
|
89
91
|
|
|
90
92
|
print(f"\nAssociation Rules (conf >= {min_conf:.0%}):")
|
|
91
93
|
for a, c, s, conf in rules:
|
|
92
94
|
print(f"{set(a)} => {set(c)} | support: {s:.2f}, confidence: {conf:.2f}")
|
|
93
95
|
|
|
94
|
-
from itertools import combinations, chain
|
|
96
|
+
from itertools import chain, combinations
|
|
95
97
|
|
|
96
98
|
transactions = {
|
|
97
99
|
"10": {"A", "C", "D"},
|
{bm_preprocessing-0.2.1 → bm_preprocessing-0.3.0}/src/bm_preprocessing/DM/sources/apriori.py
RENAMED
|
@@ -1,11 +1,13 @@
|
|
|
1
1
|
from collections import defaultdict
|
|
2
2
|
from itertools import combinations
|
|
3
3
|
|
|
4
|
+
|
|
4
5
|
def print_table(data, title):
|
|
5
6
|
print(f"\n--- {title} ---")
|
|
6
7
|
for itemset, count in data.items():
|
|
7
8
|
print(f"{itemset}: {count}")
|
|
8
9
|
|
|
10
|
+
|
|
9
11
|
C = {}
|
|
10
12
|
L = {}
|
|
11
13
|
|
|
@@ -18,26 +20,26 @@ def generate_candidates(prev_frequent_itemsets, k):
|
|
|
18
20
|
union_set = set(itemset1).union(set(itemset2))
|
|
19
21
|
if len(union_set) == k:
|
|
20
22
|
candidates.add(tuple(sorted(union_set)))
|
|
21
|
-
|
|
23
|
+
|
|
22
24
|
return sorted(list(candidates))
|
|
23
25
|
|
|
24
26
|
|
|
25
27
|
def count_candidates(candidates, transactions):
|
|
26
|
-
|
|
28
|
+
|
|
27
29
|
candidate_count = defaultdict(int)
|
|
28
|
-
|
|
30
|
+
|
|
29
31
|
for candidate in candidates:
|
|
30
32
|
for transaction in transactions.values():
|
|
31
33
|
if all(item in transaction for item in candidate):
|
|
32
34
|
candidate_count[candidate] += 1
|
|
33
|
-
|
|
35
|
+
|
|
34
36
|
return candidate_count
|
|
35
37
|
|
|
36
38
|
|
|
37
39
|
def prune_candidates(candidate_count, min_support, prev_freq_itemsets=None):
|
|
38
40
|
|
|
39
41
|
filtered_candidates = {}
|
|
40
|
-
|
|
42
|
+
|
|
41
43
|
for itemset, count in candidate_count.items():
|
|
42
44
|
|
|
43
45
|
if count >= min_support:
|
|
@@ -45,7 +47,9 @@ def prune_candidates(candidate_count, min_support, prev_freq_itemsets=None):
|
|
|
45
47
|
filtered_candidates[itemset] = count
|
|
46
48
|
else:
|
|
47
49
|
subsets = combinations(itemset, len(itemset) - 1)
|
|
48
|
-
if all(tuple(sorted(subset)) in prev_freq_itemsets for subset in subsets):
|
|
50
|
+
if all(
|
|
51
|
+
tuple(sorted(subset)) in prev_freq_itemsets for subset in subsets
|
|
52
|
+
):
|
|
49
53
|
filtered_candidates[itemset] = count
|
|
50
54
|
|
|
51
55
|
return filtered_candidates
|
|
@@ -53,12 +57,14 @@ def prune_candidates(candidate_count, min_support, prev_freq_itemsets=None):
|
|
|
53
57
|
|
|
54
58
|
def apriori(transactions, min_support):
|
|
55
59
|
|
|
56
|
-
items = sorted(set(item for transaction in transactions.values() for item in transaction))
|
|
60
|
+
items = sorted(
|
|
61
|
+
set(item for transaction in transactions.values() for item in transaction)
|
|
62
|
+
)
|
|
57
63
|
c1_list = [(item,) for item in items]
|
|
58
|
-
|
|
64
|
+
|
|
59
65
|
C[1] = count_candidates(c1_list, transactions)
|
|
60
66
|
L[1] = prune_candidates(C[1], min_support)
|
|
61
|
-
|
|
67
|
+
|
|
62
68
|
print_table(C[1], "Candidate 1-itemsets (C1)")
|
|
63
69
|
print_table(L[1], "Frequent 1-itemsets (L1)")
|
|
64
70
|
|
|
@@ -66,12 +72,12 @@ def apriori(transactions, min_support):
|
|
|
66
72
|
|
|
67
73
|
while True:
|
|
68
74
|
|
|
69
|
-
candidates = generate_candidates(L[k-1].keys(), k)
|
|
75
|
+
candidates = generate_candidates(L[k - 1].keys(), k)
|
|
70
76
|
if not candidates:
|
|
71
77
|
break
|
|
72
78
|
|
|
73
79
|
C[k] = count_candidates(candidates, transactions)
|
|
74
|
-
L[k] = prune_candidates(C[k], min_support, L[k-1].keys())
|
|
80
|
+
L[k] = prune_candidates(C[k], min_support, L[k - 1].keys())
|
|
75
81
|
|
|
76
82
|
if not L[k]:
|
|
77
83
|
print_table(C[k], f"Candidate {k}-itemsets (C{k})")
|
|
@@ -99,7 +105,7 @@ def main():
|
|
|
99
105
|
}
|
|
100
106
|
|
|
101
107
|
min_support = 2
|
|
102
|
-
|
|
108
|
+
|
|
103
109
|
apriori(transactions, min_support)
|
|
104
110
|
|
|
105
111
|
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
Tid,Home Owner,Marital Status,Annual Income,Default id
|
|
2
|
+
1,Yes,Single,125K,No
|
|
3
|
+
2,No,Married,100K,No
|
|
4
|
+
3,No,Single,70K,No
|
|
5
|
+
4,Yes,Married,120K,No
|
|
6
|
+
5,No,Divorced,95K,Yes
|
|
7
|
+
6,No,Married,60K,No
|
|
8
|
+
7,Yes,Divorced,220K,No
|
|
9
|
+
8,No,Single,85K,Yes
|
|
10
|
+
9,No,Married,75K,No
|
|
11
|
+
10,No,Single,90K,Yes
|
|
@@ -1,14 +1,17 @@
|
|
|
1
1
|
from collections import defaultdict
|
|
2
2
|
from itertools import combinations
|
|
3
3
|
|
|
4
|
+
|
|
4
5
|
def print_table(data, title):
|
|
5
6
|
print(f"\n--- {title} ---")
|
|
6
7
|
for itemset, count in data.items():
|
|
7
8
|
print(f"{itemset}: {count}")
|
|
8
9
|
|
|
10
|
+
|
|
9
11
|
C = {}
|
|
10
12
|
L = {}
|
|
11
13
|
|
|
14
|
+
|
|
12
15
|
class Bucket:
|
|
13
16
|
def __init__(self):
|
|
14
17
|
self.address: int
|
|
@@ -24,26 +27,26 @@ def generate_candidates(prev_frequent_itemsets, k):
|
|
|
24
27
|
union_set = set(itemset1).union(set(itemset2))
|
|
25
28
|
if len(union_set) == k:
|
|
26
29
|
candidates.add(tuple(sorted(union_set)))
|
|
27
|
-
|
|
30
|
+
|
|
28
31
|
return sorted(list(candidates))
|
|
29
32
|
|
|
30
33
|
|
|
31
34
|
def count_candidates(candidates, transactions):
|
|
32
|
-
|
|
35
|
+
|
|
33
36
|
candidate_count = defaultdict(int)
|
|
34
|
-
|
|
37
|
+
|
|
35
38
|
for candidate in candidates:
|
|
36
39
|
for transaction in transactions.values():
|
|
37
40
|
if all(item in transaction for item in candidate):
|
|
38
41
|
candidate_count[candidate] += 1
|
|
39
|
-
|
|
42
|
+
|
|
40
43
|
return candidate_count
|
|
41
44
|
|
|
42
45
|
|
|
43
46
|
def prune_candidates(candidate_count, min_support, prev_freq_itemsets=None):
|
|
44
47
|
|
|
45
48
|
filtered_candidates = {}
|
|
46
|
-
|
|
49
|
+
|
|
47
50
|
for itemset, count in candidate_count.items():
|
|
48
51
|
|
|
49
52
|
if count >= min_support:
|
|
@@ -51,7 +54,9 @@ def prune_candidates(candidate_count, min_support, prev_freq_itemsets=None):
|
|
|
51
54
|
filtered_candidates[itemset] = count
|
|
52
55
|
else:
|
|
53
56
|
subsets = combinations(itemset, len(itemset) - 1)
|
|
54
|
-
if all(tuple(sorted(subset)) in prev_freq_itemsets for subset in subsets):
|
|
57
|
+
if all(
|
|
58
|
+
tuple(sorted(subset)) in prev_freq_itemsets for subset in subsets
|
|
59
|
+
):
|
|
55
60
|
filtered_candidates[itemset] = count
|
|
56
61
|
|
|
57
62
|
return filtered_candidates
|
|
@@ -59,12 +64,14 @@ def prune_candidates(candidate_count, min_support, prev_freq_itemsets=None):
|
|
|
59
64
|
|
|
60
65
|
def apriori(transactions, min_support):
|
|
61
66
|
|
|
62
|
-
items = sorted(set(item for transaction in transactions.values() for item in transaction))
|
|
67
|
+
items = sorted(
|
|
68
|
+
set(item for transaction in transactions.values() for item in transaction)
|
|
69
|
+
)
|
|
63
70
|
c1_list = [(item,) for item in items]
|
|
64
|
-
|
|
71
|
+
|
|
65
72
|
C[1] = count_candidates(c1_list, transactions)
|
|
66
73
|
L[1] = prune_candidates(C[1], min_support)
|
|
67
|
-
|
|
74
|
+
|
|
68
75
|
print_table(C[1], "Candidate 1-itemsets (C1)")
|
|
69
76
|
print_table(L[1], "Frequent 1-itemsets (L1)")
|
|
70
77
|
|
|
@@ -77,7 +84,9 @@ def apriori(transactions, min_support):
|
|
|
77
84
|
|
|
78
85
|
buckets = [Bucket(addr) for addr in range(7)]
|
|
79
86
|
|
|
80
|
-
items_list = sorted(set(item for transaction in transactions.values() for item in transaction))
|
|
87
|
+
items_list = sorted(
|
|
88
|
+
set(item for transaction in transactions.values() for item in transaction)
|
|
89
|
+
)
|
|
81
90
|
ranks = {item: idx + 1 for idx, item in enumerate(items_list)}
|
|
82
91
|
|
|
83
92
|
hash_fn = lambda item1, item2: (ranks[item1] * 10 + ranks[item2]) % 7
|
|
@@ -91,25 +100,32 @@ def apriori(transactions, min_support):
|
|
|
91
100
|
|
|
92
101
|
print("\n--- Hash Table Buckets ---")
|
|
93
102
|
for bucket in buckets:
|
|
94
|
-
print(f"Address: {bucket.address}, Count: {bucket.count}, Itemsets: {bucket.itemsets}")
|
|
95
|
-
|
|
103
|
+
print(
|
|
104
|
+
f"Address: {bucket.address}, Count: {bucket.count}, Itemsets: {bucket.itemsets}"
|
|
105
|
+
)
|
|
106
|
+
|
|
96
107
|
# Filter
|
|
97
|
-
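L2 = {itemset: bucket.count for bucket in buckets for itemset in bucket.itemsets if bucket.count >= min_support}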
|
|
108
|
+
L2 = {
|
|
109
|
+
itemset: bucket.count
|
|
110
|
+
for bucket in buckets
|
|
111
|
+
for itemset in bucket.itemsets
|
|
112
|
+
if bucket.count >= min_support
|
|
113
|
+
}
|
|
98
114
|
print_table(L2, "Frequent 2-itemsets after Hashing (L2)")
|
|
99
115
|
|
|
100
|
-
C["2"] = generate_candidates(L[k-1].keys(), k)
|
|
116
|
+
C["2"] = generate_candidates(L[k - 1].keys(), k)
|
|
101
117
|
L["2"] = L2
|
|
102
118
|
|
|
103
119
|
k = 3
|
|
104
120
|
|
|
105
121
|
while True:
|
|
106
122
|
|
|
107
|
-
candidates = generate_candidates(L[k-1].keys(), k)
|
|
123
|
+
candidates = generate_candidates(L[k - 1].keys(), k)
|
|
108
124
|
if not candidates:
|
|
109
125
|
break
|
|
110
126
|
|
|
111
127
|
C[k] = count_candidates(candidates, transactions)
|
|
112
|
-
L[k] = prune_candidates(C[k], min_support, L[k-1].keys())
|
|
128
|
+
L[k] = prune_candidates(C[k], min_support, L[k - 1].keys())
|
|
113
129
|
|
|
114
130
|
if not L[k]:
|
|
115
131
|
print_table(C[k], f"Candidate {k}-itemsets (C{k})")
|
|
@@ -137,7 +153,7 @@ def main():
|
|
|
137
153
|
}
|
|
138
154
|
|
|
139
155
|
min_support = 2
|
|
140
|
-
|
|
156
|
+
|
|
141
157
|
apriori(transactions, min_support)
|
|
142
158
|
|
|
143
159
|
|