bm-preprocessing 0.1.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bm_preprocessing-0.1.0/.gitignore +10 -0
- bm_preprocessing-0.1.0/PKG-INFO +33 -0
- bm_preprocessing-0.1.0/README.md +24 -0
- bm_preprocessing-0.1.0/pyproject.toml +17 -0
- bm_preprocessing-0.1.0/src/bm_preprocessing/DM/__init__.py +8 -0
- bm_preprocessing-0.1.0/src/bm_preprocessing/DM/all.py +30 -0
- bm_preprocessing-0.1.0/src/bm_preprocessing/DM/apriori.py +30 -0
- bm_preprocessing-0.1.0/src/bm_preprocessing/DM/hash.py +30 -0
- bm_preprocessing-0.1.0/src/bm_preprocessing/DM/preprocessing.py +30 -0
- bm_preprocessing-0.1.0/src/bm_preprocessing/DM/sources/all.py +157 -0
- bm_preprocessing-0.1.0/src/bm_preprocessing/DM/sources/apriori.py +107 -0
- bm_preprocessing-0.1.0/src/bm_preprocessing/DM/sources/hash.py +145 -0
- bm_preprocessing-0.1.0/src/bm_preprocessing/DM/sources/preprocessing.py +35 -0
- bm_preprocessing-0.1.0/src/bm_preprocessing/IR/__init__.py +5 -0
- bm_preprocessing-0.1.0/src/bm_preprocessing/IR/all.py +30 -0
- bm_preprocessing-0.1.0/src/bm_preprocessing/IR/sources/all.py +186 -0
- bm_preprocessing-0.1.0/src/bm_preprocessing/__init__.py +6 -0
- bm_preprocessing-0.1.0/uv.lock +1100 -0
bm_preprocessing-0.1.0/PKG-INFO
@@ -0,0 +1,33 @@
Metadata-Version: 2.4
Name: bm-preprocessing
Version: 0.1.0
Summary: A package to view source code from DM and IR modules
Requires-Python: >=3.8
Requires-Dist: build>=1.2.2.post1
Requires-Dist: twine>=6.1.0
Description-Content-Type: text/markdown

# bm-preprocessing

A Python package that displays source code from DM and IR modules.

## Usage

```python
from bm_preprocessing.IR import all
print(all) # Prints the entire source code of IR/all.py

from bm_preprocessing.DM import apriori
print(apriori) # Prints the entire source code of DM/apriori.py
```

## Available Modules

### IR
- `all` - Information Retrieval algorithms

### DM
- `all` - Data Mining algorithms
- `apriori` - Apriori algorithm implementation
- `hash` - Hash-based mining
- `preprocessing` - Data preprocessing utilities
bm_preprocessing-0.1.0/README.md
@@ -0,0 +1,24 @@
# bm-preprocessing

A Python package that displays source code from DM and IR modules.

## Usage

```python
from bm_preprocessing.IR import all
print(all) # Prints the entire source code of IR/all.py

from bm_preprocessing.DM import apriori
print(apriori) # Prints the entire source code of DM/apriori.py
```

## Available Modules

### IR
- `all` - Information Retrieval algorithms

### DM
- `all` - Data Mining algorithms
- `apriori` - Apriori algorithm implementation
- `hash` - Hash-based mining
- `preprocessing` - Data preprocessing utilities
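As the loader modules later in this diff show, the imported names are `SourceCodeModule` instances rather than plain modules, so besides printing them you can read the text through their `source_code` property. A minimal sketch, assuming the package is installed; the output filename `apriori_listing.py` is just an example:

```python
from bm_preprocessing.DM import apriori

# source_code returns the same text that print(apriori) writes to stdout;
# here it is saved to a local file instead of being printed.
with open("apriori_listing.py", "w", encoding="utf-8") as fh:
    fh.write(apriori.source_code)
```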
bm_preprocessing-0.1.0/pyproject.toml
@@ -0,0 +1,17 @@
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "bm-preprocessing"
version = "0.1.0"
description = "A package to view source code from DM and IR modules"
readme = "README.md"
requires-python = ">=3.8"
dependencies = [
    "build>=1.2.2.post1",
    "twine>=6.1.0",
]

[tool.hatch.build.targets.wheel]
packages = ["src/bm_preprocessing"]
bm_preprocessing-0.1.0/src/bm_preprocessing/DM/all.py
@@ -0,0 +1,30 @@
"""Source code loader for DM/all.py"""

from pathlib import Path


class SourceCodeModule:
    """A class that displays source code when printed."""

    def __init__(self, name: str, source_path: Path):
        self.name = name
        self._source_path = source_path
        self._source_code = None

    @property
    def source_code(self) -> str:
        """Lazily load source code."""
        if self._source_code is None:
            self._source_code = self._source_path.read_text(encoding='utf-8')
        return self._source_code

    def __repr__(self) -> str:
        return self.source_code

    def __str__(self) -> str:
        return self.source_code


# Get the path to the source file
_source_file = Path(__file__).parent / "sources" / "all.py"
all = SourceCodeModule("DM.all", _source_file)
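The same `SourceCodeModule` loader is repeated verbatim in the `apriori`, `hash`, and `preprocessing` modules below; the pattern is simply a cached file read exposed through `__str__`/`__repr__`. A stripped-down, self-contained sketch of that pattern (the `LazySource` name and the self-reading demo are hypothetical, not part of the package):

```python
from pathlib import Path


class LazySource:
    """Return a file's text when the object is printed, reading it only once."""

    def __init__(self, path: Path):
        self._path = path
        self._cache = None  # populated on first access, reused afterwards

    def __str__(self) -> str:
        if self._cache is None:
            self._cache = self._path.read_text(encoding="utf-8")
        return self._cache


if __name__ == "__main__":
    print(LazySource(Path(__file__)))  # prints this very script
```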
bm_preprocessing-0.1.0/src/bm_preprocessing/DM/apriori.py
@@ -0,0 +1,30 @@
"""Source code loader for DM/apriori.py"""

from pathlib import Path


class SourceCodeModule:
    """A class that displays source code when printed."""

    def __init__(self, name: str, source_path: Path):
        self.name = name
        self._source_path = source_path
        self._source_code = None

    @property
    def source_code(self) -> str:
        """Lazily load source code."""
        if self._source_code is None:
            self._source_code = self._source_path.read_text(encoding='utf-8')
        return self._source_code

    def __repr__(self) -> str:
        return self.source_code

    def __str__(self) -> str:
        return self.source_code


# Get the path to the source file
_source_file = Path(__file__).parent / "sources" / "apriori.py"
apriori = SourceCodeModule("DM.apriori", _source_file)
bm_preprocessing-0.1.0/src/bm_preprocessing/DM/hash.py
@@ -0,0 +1,30 @@
"""Source code loader for DM/hash.py"""

from pathlib import Path


class SourceCodeModule:
    """A class that displays source code when printed."""

    def __init__(self, name: str, source_path: Path):
        self.name = name
        self._source_path = source_path
        self._source_code = None

    @property
    def source_code(self) -> str:
        """Lazily load source code."""
        if self._source_code is None:
            self._source_code = self._source_path.read_text(encoding='utf-8')
        return self._source_code

    def __repr__(self) -> str:
        return self.source_code

    def __str__(self) -> str:
        return self.source_code


# Get the path to the source file
_source_file = Path(__file__).parent / "sources" / "hash.py"
hash = SourceCodeModule("DM.hash", _source_file)
bm_preprocessing-0.1.0/src/bm_preprocessing/DM/preprocessing.py
@@ -0,0 +1,30 @@
"""Source code loader for DM/preprocessing.py"""

from pathlib import Path


class SourceCodeModule:
    """A class that displays source code when printed."""

    def __init__(self, name: str, source_path: Path):
        self.name = name
        self._source_path = source_path
        self._source_code = None

    @property
    def source_code(self) -> str:
        """Lazily load source code."""
        if self._source_code is None:
            self._source_code = self._source_path.read_text(encoding='utf-8')
        return self._source_code

    def __repr__(self) -> str:
        return self.source_code

    def __str__(self) -> str:
        return self.source_code


# Get the path to the source file
_source_file = Path(__file__).parent / "sources" / "preprocessing.py"
preprocessing = SourceCodeModule("DM.preprocessing", _source_file)
bm_preprocessing-0.1.0/src/bm_preprocessing/DM/sources/all.py
@@ -0,0 +1,157 @@
from itertools import combinations, chain
from collections import defaultdict

# --- Example 1: Apriori over T1..T11 with hash-bucket pruning of pair candidates ---
min_support = 2
min_conf = 0.7

transactions = {
    "T1": {"I1","I2","I4","I5","I6"},
    "T2": {"I2","I4","I6"},
    "T3": {"I2","I3"},
    "T4": {"I1","I2","I4"},
    "T5": {"I1","I2","I3"},
    "T6": {"I2","I3"},
    "T7": {"I1","I3"},
    "T8": {"I1","I2","I3","I5"},
    "T9": {"I1","I2","I3"},
    "T10": {"I1","I2","I4","I5"},
    "T11": {"I5","I6"}
}

genL = lambda C: {k: v for k, v in C.items() if v >= min_support}

C, L = {}, {}

C[1] = defaultdict(int)
for t in transactions.values():
    for i in t:
        C[1][frozenset([i])] += 1
L[1] = genL(C[1])

h = lambda p, b=7: sum(sum(ord(c) for c in s) for s in p) % b
buckets = defaultdict(int)
for t in transactions.values():
    for p in combinations(sorted(t), 2):
        buckets[h(p)] += 1
freq = {b for b, c in buckets.items() if c >= min_support}

C[2] = defaultdict(int)
for t in transactions.values():
    for p in combinations(sorted(t), 2):
        if h(p) in freq:
            C[2][frozenset(p)] += 1
L[2] = genL(C[2])

k = 3
while L[k - 1]:
    C[k] = {
        frozenset(a | b): 0
        for a, b in combinations(L[k - 1], 2)
        if sorted(a)[: k - 2] == sorted(b)[: k - 2]
        and all(frozenset(s) in L[k - 1] for s in combinations(a | b, k - 1))
    }
    for t in transactions.values():
        for c in C[k]:
            if c.issubset(t):
                C[k][c] += 1
    L[k] = genL(C[k])
    if not L[k]:
        break
    k += 1

frequent_itemsets = {k: v for Lk in L.values() for k, v in Lk.items()}
total = len(transactions)
rules = []
for itemset, count in frequent_itemsets.items():
    if len(itemset) < 2:
        continue
    for a in chain.from_iterable(
        combinations(itemset, r) for r in range(1, len(itemset))
    ):
        antecedent = frozenset(a)
        consequent = itemset - antecedent
        if not consequent:
            continue
        support = count / total
        confidence = count / C[len(antecedent)][antecedent]
        if confidence >= min_conf:
            rules.append((antecedent, consequent, support, confidence))

for k, v in C.items():
    print(
        f"\nC{k}:\n",
        "Empty" if not v else "\n".join(f"{set(x)} : {y}" for x, y in v.items()),
    )
for k, v in L.items():
    print(f"\nL{k}:\n", "Empty" if not v else "\n".join(f"{set(x)} : {v[x]}" for x in v))

print(f"\nAssociation Rules (conf >= {min_conf:.0%}):")
for a, c, s, conf in rules:
    print(f"{set(a)} => {set(c)} | support: {s:.2f}, confidence: {conf:.2f}")

# --- Example 2: Apriori with vertical TID sets (support = size of the TID-set intersection) ---
from itertools import combinations, chain

transactions = {
    "10": {"A", "C", "D"},
    "20": {"B", "C", "E"},
    "30": {"A", "B", "C", "E"},
    "40": {"B", "E"},
}

min_support = 2
min_conf = 0.7

genL = lambda C: {k: v for k, v in C.items() if len(v) >= min_support}

C = {1: {}}
for tid, items in transactions.items():
    for i in items:
        C[1].setdefault(frozenset([i]), set()).add(tid)

L = {1: genL(C[1])}

k = 2
while L[k - 1]:
    prev = list(L[k - 1].keys())
    C[k] = {
        frozenset(a | b): L[k - 1][a] & L[k - 1][b]
        for i, a in enumerate(prev)
        for b in prev[i + 1 :]
        if sorted(a)[: k - 2] == sorted(b)[: k - 2]
    }
    L[k] = genL(C[k])
    if not L[k]:
        break
    k += 1

frequent_itemsets = {k: v for Lk in L.values() for k, v in Lk.items()}
total = len(transactions)
rules = []
for itemset, tids in frequent_itemsets.items():
    if len(itemset) < 2:
        continue
    for a in chain.from_iterable(
        combinations(itemset, r) for r in range(1, len(itemset))
    ):
        antecedent = frozenset(a)
        consequent = itemset - antecedent
        if len(consequent) == 0:
            continue
        support = len(tids) / total
        confidence = len(tids) / len(frequent_itemsets[antecedent])
        if confidence >= min_conf:
            rules.append((antecedent, consequent, support, confidence))

for k, v in C.items():
    print(f"\nC{k}:")
    print("Empty" if not v else "\n".join(f"{set(x)} : {y}" for x, y in v.items()))

for k, v in L.items():
    print(f"\nL{k}:")
    print("Empty" if not v else "\n".join(f"{set(x)} : {len(y)}" for x, y in v.items()))

print(f"\nAssociation Rules (conf >= {min_conf:.0%}):")
for a, c, s, conf in rules:
    print(f"{set(a)} => {set(c)} | support: {s:.2f}, confidence: {conf:.2f}")
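The first script above prunes 2-itemset candidates with a character-sum bucket hash before counting them exactly. A small standalone sketch of how that hash assigns pairs to buckets (the pair values are examples, checked by hand):

```python
# Same hash as in the script: sum the ord() values of every character in the
# pair, then reduce modulo the number of buckets (7 by default).
h = lambda p, b=7: sum(sum(ord(c) for c in s) for s in p) % b

print(h(("I1", "I2")))  # (73+49) + (73+50) = 245 -> 245 % 7 == 0, bucket 0
print(h(("I2", "I3")))  # 123 + 124 = 247 -> 247 % 7 == 2, bucket 2
```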
bm_preprocessing-0.1.0/src/bm_preprocessing/DM/sources/apriori.py
@@ -0,0 +1,107 @@
from collections import defaultdict
from itertools import combinations

def print_table(data, title):
    print(f"\n--- {title} ---")
    for itemset, count in data.items():
        print(f"{itemset}: {count}")

C = {}
L = {}


def generate_candidates(prev_frequent_itemsets, k):

    candidates = set()

    for itemset1, itemset2 in combinations(prev_frequent_itemsets, 2):
        union_set = set(itemset1).union(set(itemset2))
        if len(union_set) == k:
            candidates.add(tuple(sorted(union_set)))

    return sorted(list(candidates))


def count_candidates(candidates, transactions):

    candidate_count = defaultdict(int)

    for candidate in candidates:
        for transaction in transactions.values():
            if all(item in transaction for item in candidate):
                candidate_count[candidate] += 1

    return candidate_count


def prune_candidates(candidate_count, min_support, prev_freq_itemsets=None):

    filtered_candidates = {}

    for itemset, count in candidate_count.items():

        if count >= min_support:
            if prev_freq_itemsets is None or len(itemset) == 1:
                filtered_candidates[itemset] = count
            else:
                subsets = combinations(itemset, len(itemset) - 1)
                if all(tuple(sorted(subset)) in prev_freq_itemsets for subset in subsets):
                    filtered_candidates[itemset] = count

    return filtered_candidates


def apriori(transactions, min_support):

    items = sorted(set(item for transaction in transactions.values() for item in transaction))
    c1_list = [(item,) for item in items]

    C[1] = count_candidates(c1_list, transactions)
    L[1] = prune_candidates(C[1], min_support)

    print_table(C[1], "Candidate 1-itemsets (C1)")
    print_table(L[1], "Frequent 1-itemsets (L1)")

    k = 2

    while True:

        candidates = generate_candidates(L[k-1].keys(), k)
        if not candidates:
            break

        C[k] = count_candidates(candidates, transactions)
        L[k] = prune_candidates(C[k], min_support, L[k-1].keys())

        if not L[k]:
            print_table(C[k], f"Candidate {k}-itemsets (C{k})")
            print(f"\nNo frequent {k}-itemsets found. Terminating.\n\n")
            break

        print_table(C[k], f"Candidate {k}-itemsets (C{k})")
        print_table(L[k], f"Frequent {k}-itemsets (L{k})")

        k += 1


def main():

    transactions = {
        "T100": ["I1", "I2", "I5"],
        "T200": ["I2", "I4"],
        "T300": ["I2", "I3"],
        "T400": ["I1", "I2", "I4"],
        "T500": ["I1", "I3"],
        "T600": ["I2", "I3"],
        "T700": ["I1", "I3"],
        "T800": ["I1", "I2", "I3", "I5"],
        "T900": ["I1", "I2", "I3"],
    }

    min_support = 2

    apriori(transactions, min_support)


if __name__ == "__main__":
    main()
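For the transaction table in `main()` above, the 1-itemset counts that `count_candidates` reports can be cross-checked independently. A minimal sketch using `collections.Counter` on the same data (expected counts: I1=6, I2=7, I3=6, I4=2, I5=2):

```python
from collections import Counter

transactions = {
    "T100": ["I1", "I2", "I5"], "T200": ["I2", "I4"], "T300": ["I2", "I3"],
    "T400": ["I1", "I2", "I4"], "T500": ["I1", "I3"], "T600": ["I2", "I3"],
    "T700": ["I1", "I3"], "T800": ["I1", "I2", "I3", "I5"], "T900": ["I1", "I2", "I3"],
}

# Flatten the transactions and tally every item occurrence.
counts = Counter(item for items in transactions.values() for item in items)
print(counts)  # e.g. Counter({'I2': 7, 'I1': 6, 'I3': 6, 'I4': 2, 'I5': 2})
```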
bm_preprocessing-0.1.0/src/bm_preprocessing/DM/sources/hash.py
@@ -0,0 +1,145 @@
from collections import defaultdict
from itertools import combinations

def print_table(data, title):
    print(f"\n--- {title} ---")
    for itemset, count in data.items():
        print(f"{itemset}: {count}")

C = {}
L = {}

class Bucket:
    def __init__(self, address: int):
        self.address: int = address
        self.count: int = 0
        self.itemsets: list[tuple] = []


def generate_candidates(prev_frequent_itemsets, k):

    candidates = set()

    for itemset1, itemset2 in combinations(prev_frequent_itemsets, 2):
        union_set = set(itemset1).union(set(itemset2))
        if len(union_set) == k:
            candidates.add(tuple(sorted(union_set)))

    return sorted(list(candidates))


def count_candidates(candidates, transactions):

    candidate_count = defaultdict(int)

    for candidate in candidates:
        for transaction in transactions.values():
            if all(item in transaction for item in candidate):
                candidate_count[candidate] += 1

    return candidate_count


def prune_candidates(candidate_count, min_support, prev_freq_itemsets=None):

    filtered_candidates = {}

    for itemset, count in candidate_count.items():

        if count >= min_support:
            if prev_freq_itemsets is None or len(itemset) == 1:
                filtered_candidates[itemset] = count
            else:
                subsets = combinations(itemset, len(itemset) - 1)
                if all(tuple(sorted(subset)) in prev_freq_itemsets for subset in subsets):
                    filtered_candidates[itemset] = count

    return filtered_candidates


def apriori(transactions, min_support):

    items = sorted(set(item for transaction in transactions.values() for item in transaction))
    c1_list = [(item,) for item in items]

    C[1] = count_candidates(c1_list, transactions)
    L[1] = prune_candidates(C[1], min_support)

    print_table(C[1], "Candidate 1-itemsets (C1)")
    print_table(L[1], "Frequent 1-itemsets (L1)")

    k = 2

    transactions_combinations = {
        transaction_id: combinations(items, k)
        for transaction_id, items in transactions.items()
    }

    buckets = [Bucket(addr) for addr in range(7)]

    items_list = sorted(set(item for transaction in transactions.values() for item in transaction))
    ranks = {item: idx + 1 for idx, item in enumerate(items_list)}

    hash_fn = lambda item1, item2: (ranks[item1] * 10 + ranks[item2]) % 7

    for itemset in transactions_combinations.values():
        for item in itemset:
            item1, item2 = item[0], item[1]
            address = hash_fn(item1, item2) % 7
            buckets[address].count += 1
            buckets[address].itemsets.append(item)

    print("\n--- Hash Table Buckets ---")
    for bucket in buckets:
        print(f"Address: {bucket.address}, Count: {bucket.count}, Itemsets: {bucket.itemsets}")

    # Filter
    L2 = {itemset: bucket.count for bucket in buckets for itemset in bucket.itemsets if bucket.count >= min_support}
    print_table(L2, "Frequent 2-itemsets after Hashing (L2)")

    C[2] = generate_candidates(L[k-1].keys(), k)
    L[2] = L2

    k = 3

    while True:

        candidates = generate_candidates(L[k-1].keys(), k)
        if not candidates:
            break

        C[k] = count_candidates(candidates, transactions)
        L[k] = prune_candidates(C[k], min_support, L[k-1].keys())

        if not L[k]:
            print_table(C[k], f"Candidate {k}-itemsets (C{k})")
            print(f"\nNo frequent {k}-itemsets found. Terminating.\n\n")
            break

        print_table(C[k], f"Candidate {k}-itemsets (C{k})")
        print_table(L[k], f"Frequent {k}-itemsets (L{k})")

        k += 1


def main():

    transactions = {
        "T100": ["I1", "I2", "I5"],
        "T200": ["I2", "I4"],
        "T300": ["I2", "I3"],
        "T400": ["I1", "I2", "I4"],
        "T500": ["I1", "I3"],
        "T600": ["I2", "I3"],
        "T700": ["I1", "I3"],
        "T800": ["I1", "I2", "I3", "I5"],
        "T900": ["I1", "I2", "I3"],
    }

    min_support = 2

    apriori(transactions, min_support)


if __name__ == "__main__":
    main()
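The bucket address used above is derived from the items' 1-based ranks as `(rank(item1) * 10 + rank(item2)) % 7`. A quick standalone check for the five items in `main()` (ranks I1..I5 map to 1..5; the printed values are hand-verified examples):

```python
# Ranks follow the sorted item order that apriori() builds: I1 -> 1, ..., I5 -> 5.
ranks = {"I1": 1, "I2": 2, "I3": 3, "I4": 4, "I5": 5}
hash_fn = lambda a, b: (ranks[a] * 10 + ranks[b]) % 7

print(hash_fn("I1", "I2"))  # (10 + 2) % 7 == 5 -> pair goes to bucket 5
print(hash_fn("I2", "I3"))  # (20 + 3) % 7 == 2 -> bucket 2
print(hash_fn("I1", "I5"))  # (10 + 5) % 7 == 1 -> bucket 1
```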
bm_preprocessing-0.1.0/src/bm_preprocessing/DM/sources/preprocessing.py
@@ -0,0 +1,35 @@
import pandas as pd
import numpy as np

# 1. Reading from CSV
def load_csv(file_path):
    df = pd.read_csv(file_path)
    return df

# 2. Reading from Excel
def load_excel(file_path, sheet_name=0):
    df = pd.read_excel(file_path, sheet_name=sheet_name)
    return df

# 3. Mean and Median Fill (For Numerical Columns)
def impute_numerical(df):
    # Filling with Mean
    df_mean = df.copy()
    # Select only numeric columns for mean/median to avoid errors
    numeric_cols = df_mean.select_dtypes(include=[np.number]).columns
    df_mean[numeric_cols] = df_mean[numeric_cols].fillna(df_mean[numeric_cols].mean())

    # Filling with Median
    df_median = df.copy()
    df_median[numeric_cols] = df_median[numeric_cols].fillna(df_median[numeric_cols].median())

    return df_mean, df_median

# 4. General Fill NA (For Categorical/Transaction Data)
def fill_general_na(df, value="Unknown"):
    # Often in transaction data, we fill NaNs with a placeholder or empty string
    return df.fillna(value)

# Example Usage:
# df = load_csv('transactions.csv')
# df_filled = fill_general_na(df, value="Missing_Item")
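A small, hypothetical in-memory check of `impute_numerical` and `fill_general_na` (assumes the functions above are in scope; the DataFrame contents are made-up example data, so no CSV or Excel file is needed):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"qty": [1.0, np.nan, 3.0], "item": ["A", None, "C"]})

# Only the numeric "qty" column is touched by the mean/median imputers.
df_mean, df_median = impute_numerical(df)
print(df_mean["qty"].tolist())    # [1.0, 2.0, 3.0] -> NaN replaced by the column mean
print(df_median["qty"].tolist())  # [1.0, 2.0, 3.0] -> median of [1.0, 3.0] is also 2.0

# fill_general_na replaces every missing value with the placeholder string.
print(fill_general_na(df)["item"].tolist())  # ['A', 'Unknown', 'C']
```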