bm_preprocessing-0.1.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,10 @@
+ # Python-generated files
+ __pycache__/
+ *.py[oc]
+ build/
+ dist/
+ wheels/
+ *.egg-info
+
+ # Virtual environments
+ .venv
@@ -0,0 +1,33 @@
+ Metadata-Version: 2.4
+ Name: bm-preprocessing
+ Version: 0.1.0
+ Summary: A package to view source code from DM and IR modules
+ Requires-Python: >=3.8
+ Requires-Dist: build>=1.2.2.post1
+ Requires-Dist: twine>=6.1.0
+ Description-Content-Type: text/markdown
+
+ # bm-preprocessing
+
+ A Python package that displays source code from DM and IR modules.
+
+ ## Usage
+
+ ```python
+ from bm_preprocessing.IR import all
+ print(all)  # Prints the entire source code of IR/all.py
+
+ from bm_preprocessing.DM import apriori
+ print(apriori)  # Prints the entire source code of DM/apriori.py
+ ```
+
+ ## Available Modules
+
+ ### IR
+ - `all` - Information Retrieval algorithms
+
+ ### DM
+ - `all` - Data Mining algorithms
+ - `apriori` - Apriori algorithm implementation
+ - `hash` - Hash-based mining
+ - `preprocessing` - Data preprocessing utilities
@@ -0,0 +1,24 @@
+ # bm-preprocessing
+
+ A Python package that displays source code from DM and IR modules.
+
+ ## Usage
+
+ ```python
+ from bm_preprocessing.IR import all
+ print(all)  # Prints the entire source code of IR/all.py
+
+ from bm_preprocessing.DM import apriori
+ print(apriori)  # Prints the entire source code of DM/apriori.py
+ ```
+
+ ## Available Modules
+
+ ### IR
+ - `all` - Information Retrieval algorithms
+
+ ### DM
+ - `all` - Data Mining algorithms
+ - `apriori` - Apriori algorithm implementation
+ - `hash` - Hash-based mining
+ - `preprocessing` - Data preprocessing utilities
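Since the imported names are `SourceCodeModule` instances rather than plain strings (see the loader modules later in this diff), the text is also available programmatically. A minimal sketch, assuming the package is installed as published:

```python
from bm_preprocessing.DM import apriori

# Each loader exposes a `name` attribute and a lazily read
# `source_code` property (cached after the first file read).
print(apriori.name)             # "DM.apriori"
text = apriori.source_code      # contents of DM/sources/apriori.py
print(len(text.splitlines()), "lines")
```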
@@ -0,0 +1,17 @@
+ [build-system]
+ requires = ["hatchling"]
+ build-backend = "hatchling.build"
+
+ [project]
+ name = "bm-preprocessing"
+ version = "0.1.0"
+ description = "A package to view source code from DM and IR modules"
+ readme = "README.md"
+ requires-python = ">=3.8"
+ dependencies = [
+     "build>=1.2.2.post1",
+     "twine>=6.1.0",
+ ]
+
+ [tool.hatch.build.targets.wheel]
+ packages = ["src/bm_preprocessing"]
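The declared dependencies (`build`, `twine`) hint at the intended release flow: build the distributions, then upload them. A hedged sketch using `build`'s programmatic API; it assumes hatchling is importable in the current environment (the `python -m build` CLI would instead create an isolated build env):

```python
# Sketch only: run from the project root containing pyproject.toml.
from build import ProjectBuilder

builder = ProjectBuilder(".")
print(builder.build("sdist", "dist/"))  # path to the built .tar.gz
print(builder.build("wheel", "dist/"))  # path to the built .whl
```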
@@ -0,0 +1,8 @@
+ """DM subpackage - Data Mining source code."""
+
+ from .all import all
+ from .apriori import apriori
+ from .hash import hash
+ from .preprocessing import preprocessing
+
+ __all__ = ["all", "apriori", "hash", "preprocessing"]
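One caveat worth flagging: the re-exported names `all` and `hash` shadow Python builtins for anyone who star-imports this subpackage. A small illustration, assuming the package is importable:

```python
from bm_preprocessing.DM import *  # binds `all`, `apriori`, `hash`, `preprocessing`

# `all` is now a SourceCodeModule, so all([True, True]) would raise TypeError.
import builtins
print(builtins.all([True, True]))  # the builtin remains reachable explicitly
```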
@@ -0,0 +1,30 @@
+ """Source code loader for DM/all.py"""
+
+ from pathlib import Path
+
+
+ class SourceCodeModule:
+     """A class that displays source code when printed."""
+
+     def __init__(self, name: str, source_path: Path):
+         self.name = name
+         self._source_path = source_path
+         self._source_code = None
+
+     @property
+     def source_code(self) -> str:
+         """Lazily load source code."""
+         if self._source_code is None:
+             self._source_code = self._source_path.read_text(encoding='utf-8')
+         return self._source_code
+
+     def __repr__(self) -> str:
+         return self.source_code
+
+     def __str__(self) -> str:
+         return self.source_code
+
+
+ # Get the path to the source file
+ _source_file = Path(__file__).parent / "sources" / "all.py"
+ all = SourceCodeModule("DM.all", _source_file)
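The loader above implements lazy caching by hand with a `_source_code` sentinel. For reference, a behavior-equivalent sketch using the standard library's `functools.cached_property` (available since Python 3.8, the package's floor); this is an alternative, not what the package ships:

```python
from functools import cached_property
from pathlib import Path


class CachedSourceModule:
    """Sketch: same lazy, read-once behavior as SourceCodeModule above."""

    def __init__(self, name: str, source_path: Path):
        self.name = name
        self._source_path = source_path

    @cached_property
    def source_code(self) -> str:
        # Runs once; cached_property then stores the string on the instance.
        return self._source_path.read_text(encoding="utf-8")

    def __str__(self) -> str:
        return self.source_code

    __repr__ = __str__
```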
@@ -0,0 +1,30 @@
+ """Source code loader for DM/apriori.py"""
+
+ from pathlib import Path
+
+
+ class SourceCodeModule:
+     """A class that displays source code when printed."""
+
+     def __init__(self, name: str, source_path: Path):
+         self.name = name
+         self._source_path = source_path
+         self._source_code = None
+
+     @property
+     def source_code(self) -> str:
+         """Lazily load source code."""
+         if self._source_code is None:
+             self._source_code = self._source_path.read_text(encoding='utf-8')
+         return self._source_code
+
+     def __repr__(self) -> str:
+         return self.source_code
+
+     def __str__(self) -> str:
+         return self.source_code
+
+
+ # Get the path to the source file
+ _source_file = Path(__file__).parent / "sources" / "apriori.py"
+ apriori = SourceCodeModule("DM.apriori", _source_file)
@@ -0,0 +1,30 @@
+ """Source code loader for DM/hash.py"""
+
+ from pathlib import Path
+
+
+ class SourceCodeModule:
+     """A class that displays source code when printed."""
+
+     def __init__(self, name: str, source_path: Path):
+         self.name = name
+         self._source_path = source_path
+         self._source_code = None
+
+     @property
+     def source_code(self) -> str:
+         """Lazily load source code."""
+         if self._source_code is None:
+             self._source_code = self._source_path.read_text(encoding='utf-8')
+         return self._source_code
+
+     def __repr__(self) -> str:
+         return self.source_code
+
+     def __str__(self) -> str:
+         return self.source_code
+
+
+ # Get the path to the source file
+ _source_file = Path(__file__).parent / "sources" / "hash.py"
+ hash = SourceCodeModule("DM.hash", _source_file)
@@ -0,0 +1,30 @@
+ """Source code loader for DM/preprocessing.py"""
+
+ from pathlib import Path
+
+
+ class SourceCodeModule:
+     """A class that displays source code when printed."""
+
+     def __init__(self, name: str, source_path: Path):
+         self.name = name
+         self._source_path = source_path
+         self._source_code = None
+
+     @property
+     def source_code(self) -> str:
+         """Lazily load source code."""
+         if self._source_code is None:
+             self._source_code = self._source_path.read_text(encoding='utf-8')
+         return self._source_code
+
+     def __repr__(self) -> str:
+         return self.source_code
+
+     def __str__(self) -> str:
+         return self.source_code
+
+
+ # Get the path to the source file
+ _source_file = Path(__file__).parent / "sources" / "preprocessing.py"
+ preprocessing = SourceCodeModule("DM.preprocessing", _source_file)
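Note that the same `SourceCodeModule` class is repeated verbatim in all four DM loader modules. A hedged consolidation sketch with a hypothetical shared helper (`_loader.py` is my name, not a file in this package):

```python
# Hypothetical DM/_loader.py -- not part of the released package.
from pathlib import Path

from .all import SourceCodeModule  # reuse the one definition


def make_source_module(name: str, module_file: str, filename: str) -> SourceCodeModule:
    """Build a loader for sources/<filename> relative to the calling module."""
    return SourceCodeModule(name, Path(module_file).parent / "sources" / filename)


# e.g. DM/apriori.py would then reduce to:
# from ._loader import make_source_module
# apriori = make_source_module("DM.apriori", __file__, "apriori.py")
```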
@@ -0,0 +1,157 @@
+ from itertools import combinations, chain
+ from collections import defaultdict
+
+ min_support = 2
+ min_conf = 0.7
+
+ transactions = {
+     "T1": {"I1","I2","I4","I5","I6"},
+     "T2": {"I2","I4","I6"},
+     "T3": {"I2","I3"},
+     "T4": {"I1","I2","I4"},
+     "T5": {"I1","I2","I3"},
+     "T6": {"I2","I3"},
+     "T7": {"I1","I3"},
+     "T8": {"I1","I2","I3","I5"},
+     "T9": {"I1","I2","I3"},
+     "T10": {"I1","I2","I4","I5"},
+     "T11": {"I5","I6"}
+ }
+
+ genL = lambda C: {k: v for k, v in C.items() if v >= min_support}
+
+ C, L = {}, {}
+
+ C[1] = defaultdict(int)
+ for t in transactions.values():
+     for i in t:
+         C[1][frozenset([i])] += 1
+ L[1] = genL(C[1])
+
+ h = lambda p, b=7: sum(sum(ord(c) for c in s) for s in p) % b
+ buckets = defaultdict(int)
+ for t in transactions.values():
+     for p in combinations(sorted(t), 2):
+         buckets[h(p)] += 1
+ freq = {b for b, c in buckets.items() if c >= min_support}
+
+ C[2] = defaultdict(int)
+ for t in transactions.values():
+     for p in combinations(sorted(t), 2):
+         if h(p) in freq:
+             C[2][frozenset(p)] += 1
+ L[2] = genL(C[2])
+
+ k = 3
+ while L[k - 1]:
+     C[k] = {
+         frozenset(a | b): 0
+         for a, b in combinations(L[k - 1], 2)
+         if sorted(a)[: k - 2] == sorted(b)[: k - 2]
+         and all(frozenset(s) in L[k - 1] for s in combinations(a | b, k - 1))
+     }
+     for t in transactions.values():
+         for c in C[k]:
+             if c.issubset(t):
+                 C[k][c] += 1
+     L[k] = genL(C[k])
+     if not L[k]:
+         break
+     k += 1
+
+ frequent_itemsets = {k: v for Lk in L.values() for k, v in Lk.items()}
+ total = len(transactions)
+ rules = []
+ for itemset, count in frequent_itemsets.items():
+     if len(itemset) < 2:
+         continue
+     for a in chain.from_iterable(
+         combinations(itemset, r) for r in range(1, len(itemset))
+     ):
+         antecedent = frozenset(a)
+         consequent = itemset - antecedent
+         if not consequent:
+             continue
+         support = count / total
+         confidence = (
+             count / C[len(antecedent)][antecedent]
+         )
+         if confidence >= min_conf:
+             rules.append((antecedent, consequent, support, confidence))
+
+ for k, v in C.items():
+     print(
+         f"\nC{k}:\n",
+         "Empty" if not v else "\n".join(f"{set(x)} : {y}" for x, y in v.items()),
+     )
+ for k, v in L.items():
+     print(f"\nL{k}:\n", "Empty" if not v else "\n".join(f"{set(x)} : {v[x]}" for x in v))
+
+ print(f"\nAssociation Rules (conf >= {min_conf:.0%}):")
+ for a, c, s, conf in rules:
+     print(f"{set(a)} => {set(c)} | support: {s:.2f}, confidence: {conf:.2f}")
+
+ from itertools import combinations, chain
+
+ transactions = {
+     "10": {"A", "C", "D"},
+     "20": {"B", "C", "E"},
+     "30": {"A", "B", "C", "E"},
+     "40": {"B", "E"},
+ }
+
+ min_support = 2
+ min_conf = 0.7
+
+ genL = lambda C: {k: v for k, v in C.items() if len(v) >= min_support}
+
+ C = {1: {}}
+ for tid, items in transactions.items():
+     for i in items:
+         C[1].setdefault(frozenset([i]), set()).add(tid)
+
+ L = {1: genL(C[1])}
+
+ k = 2
+ while L[k - 1]:
+     prev = list(L[k - 1].keys())
+     C[k] = {
+         frozenset(a | b): L[k - 1][a] & L[k - 1][b]
+         for i, a in enumerate(prev)
+         for b in prev[i + 1 :]
+         if sorted(a)[: k - 2] == sorted(b)[: k - 2]
+     }
+     L[k] = genL(C[k])
+     if not L[k]:
+         break
+     k += 1
+
+ frequent_itemsets = {k: v for Lk in L.values() for k, v in Lk.items()}
+ total = len(transactions)
+ rules = []
+ for itemset, tids in frequent_itemsets.items():
+     if len(itemset) < 2:
+         continue
+     for a in chain.from_iterable(
+         combinations(itemset, r) for r in range(1, len(itemset))
+     ):
+         antecedent = frozenset(a)
+         consequent = itemset - antecedent
+         if len(consequent) == 0:
+             continue
+         support = len(tids) / total
+         confidence = len(tids) / len(frequent_itemsets[antecedent])
+         if confidence >= min_conf:
+             rules.append((antecedent, consequent, support, confidence))
+
+ for k, v in C.items():
+     print(f"\nC{k}:")
+     print("Empty" if not v else "\n".join(f"{set(x)} : {y}" for x, y in v.items()))
+
+ for k, v in L.items():
+     print(f"\nL{k}:")
+     print("Empty" if not v else "\n".join(f"{set(x)} : {len(y)}" for x, y in v.items()))
+
+ print(f"\nAssociation Rules (conf >= {min_conf:.0%}):")
+ for a, c, s, conf in rules:
+     print(f"{set(a)} => {set(c)} | support: {s:.2f}, confidence: {conf:.2f}")
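For orientation in the first script above: the bucket hash sums character ordinals over both items of a pair and reduces modulo 7. A quick hand check of two pairs from the data:

```python
# Same hash as in the script above.
h = lambda p, b=7: sum(sum(ord(c) for c in s) for s in p) % b

# "I1" -> 73 + 49 = 122, "I2" -> 73 + 50 = 123, "I5" -> 73 + 53 = 126
print(h(("I1", "I2")))  # (122 + 123) % 7 = 245 % 7 = 0
print(h(("I1", "I5")))  # (122 + 126) % 7 = 248 % 7 = 3
```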
@@ -0,0 +1,107 @@
+ from collections import defaultdict
+ from itertools import combinations
+
+ def print_table(data, title):
+     print(f"\n--- {title} ---")
+     for itemset, count in data.items():
+         print(f"{itemset}: {count}")
+
+ C = {}
+ L = {}
+
+
+ def generate_candidates(prev_frequent_itemsets, k):
+
+     candidates = set()
+
+     for itemset1, itemset2 in combinations(prev_frequent_itemsets, 2):
+         union_set = set(itemset1).union(set(itemset2))
+         if len(union_set) == k:
+             candidates.add(tuple(sorted(union_set)))
+
+     return sorted(list(candidates))
+
+
+ def count_candidates(candidates, transactions):
+
+     candidate_count = defaultdict(int)
+
+     for candidate in candidates:
+         for transaction in transactions.values():
+             if all(item in transaction for item in candidate):
+                 candidate_count[candidate] += 1
+
+     return candidate_count
+
+
+ def prune_candidates(candidate_count, min_support, prev_freq_itemsets=None):
+
+     filtered_candidates = {}
+
+     for itemset, count in candidate_count.items():
+
+         if count >= min_support:
+             if prev_freq_itemsets is None or len(itemset) == 1:
+                 filtered_candidates[itemset] = count
+             else:
+                 subsets = combinations(itemset, len(itemset) - 1)
+                 if all(tuple(sorted(subset)) in prev_freq_itemsets for subset in subsets):
+                     filtered_candidates[itemset] = count
+
+     return filtered_candidates
+
+
+ def apriori(transactions, min_support):
+
+     items = sorted(set(item for transaction in transactions.values() for item in transaction))
+     c1_list = [(item,) for item in items]
+
+     C[1] = count_candidates(c1_list, transactions)
+     L[1] = prune_candidates(C[1], min_support)
+
+     print_table(C[1], "Candidate 1-itemsets (C1)")
+     print_table(L[1], "Frequent 1-itemsets (L1)")
+
+     k = 2
+
+     while True:
+
+         candidates = generate_candidates(L[k-1].keys(), k)
+         if not candidates:
+             break
+
+         C[k] = count_candidates(candidates, transactions)
+         L[k] = prune_candidates(C[k], min_support, L[k-1].keys())
+
+         if not L[k]:
+             print_table(C[k], f"Candidate {k}-itemsets (C{k})")
+             print(f"\nNo frequent {k}-itemsets found. Terminating.\n\n")
+             break
+
+         print_table(C[k], f"Candidate {k}-itemsets (C{k})")
+         print_table(L[k], f"Frequent {k}-itemsets (L{k})")
+
+         k += 1
+
+
+ def main():
+
+     transactions = {
+         "T100": ["I1", "I2", "I5"],
+         "T200": ["I2", "I4"],
+         "T300": ["I2", "I3"],
+         "T400": ["I1", "I2", "I4"],
+         "T500": ["I1", "I3"],
+         "T600": ["I2", "I3"],
+         "T700": ["I1", "I3"],
+         "T800": ["I1", "I2", "I3", "I5"],
+         "T900": ["I1", "I2", "I3"],
+     }
+
+     min_support = 2
+
+     apriori(transactions, min_support)
+
+
+ if __name__ == "__main__":
+     main()
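As a sanity check on the nine-transaction dataset in `main()`, the 1-itemset supports can be tallied independently; every item clears `min_support = 2`, so `L1` equals `C1` here:

```python
from collections import Counter

transactions = {
    "T100": ["I1", "I2", "I5"], "T200": ["I2", "I4"], "T300": ["I2", "I3"],
    "T400": ["I1", "I2", "I4"], "T500": ["I1", "I3"], "T600": ["I2", "I3"],
    "T700": ["I1", "I3"], "T800": ["I1", "I2", "I3", "I5"], "T900": ["I1", "I2", "I3"],
}

# Tally item occurrences across transactions: I1: 6, I2: 7, I3: 6, I4: 2, I5: 2.
print(Counter(item for items in transactions.values() for item in items))
```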
@@ -0,0 +1,145 @@
+ from collections import defaultdict
+ from itertools import combinations
+
+ def print_table(data, title):
+     print(f"\n--- {title} ---")
+     for itemset, count in data.items():
+         print(f"{itemset}: {count}")
+
+ C = {}
+ L = {}
+
+ class Bucket:
+     def __init__(self, address):
+         self.address = address
+         self.count = 0
+         self.itemsets = []
+
+
+ def generate_candidates(prev_frequent_itemsets, k):
+
+     candidates = set()
+
+     for itemset1, itemset2 in combinations(prev_frequent_itemsets, 2):
+         union_set = set(itemset1).union(set(itemset2))
+         if len(union_set) == k:
+             candidates.add(tuple(sorted(union_set)))
+
+     return sorted(list(candidates))
+
+
+ def count_candidates(candidates, transactions):
+
+     candidate_count = defaultdict(int)
+
+     for candidate in candidates:
+         for transaction in transactions.values():
+             if all(item in transaction for item in candidate):
+                 candidate_count[candidate] += 1
+
+     return candidate_count
+
+
+ def prune_candidates(candidate_count, min_support, prev_freq_itemsets=None):
+
+     filtered_candidates = {}
+
+     for itemset, count in candidate_count.items():
+
+         if count >= min_support:
+             if prev_freq_itemsets is None or len(itemset) == 1:
+                 filtered_candidates[itemset] = count
+             else:
+                 subsets = combinations(itemset, len(itemset) - 1)
+                 if all(tuple(sorted(subset)) in prev_freq_itemsets for subset in subsets):
+                     filtered_candidates[itemset] = count
+
+     return filtered_candidates
+
+
+ def apriori(transactions, min_support):
+
+     items = sorted(set(item for transaction in transactions.values() for item in transaction))
+     c1_list = [(item,) for item in items]
+
+     C[1] = count_candidates(c1_list, transactions)
+     L[1] = prune_candidates(C[1], min_support)
+
+     print_table(C[1], "Candidate 1-itemsets (C1)")
+     print_table(L[1], "Frequent 1-itemsets (L1)")
+
+     k = 2
+
+     transactions_combinations = {
+         transaction_id: combinations(items, k)
+         for transaction_id, items in transactions.items()
+     }
+
+     buckets = [Bucket(addr) for addr in range(7)]
+
+     items_list = sorted(set(item for transaction in transactions.values() for item in transaction))
+     ranks = {item: idx + 1 for idx, item in enumerate(items_list)}
+
+     hash_fn = lambda item1, item2: (ranks[item1] * 10 + ranks[item2]) % 7
+
+     for itemset in transactions_combinations.values():
+         for item in itemset:
+             item1, item2 = item[0], item[1]
+             address = hash_fn(item1, item2)
+             buckets[address].count += 1
+             buckets[address].itemsets.append(item)
+
+     print("\n--- Hash Table Buckets ---")
+     for bucket in buckets:
+         print(f"Address: {bucket.address}, Count: {bucket.count}, Itemsets: {bucket.itemsets}")
+
+     # Filter
+     L2 = {itemset: bucket.count for bucket in buckets for itemset in bucket.itemsets if bucket.count >= min_support}
+     print_table(L2, "Frequent 2-itemsets after Hashing (L2)")
+
+     C[2] = generate_candidates(L[k-1].keys(), k)
+     L[2] = L2
+
+     k = 3
+
+     while True:
+
+         candidates = generate_candidates(L[k-1].keys(), k)
+         if not candidates:
+             break
+
+         C[k] = count_candidates(candidates, transactions)
+         L[k] = prune_candidates(C[k], min_support, L[k-1].keys())
+
+         if not L[k]:
+             print_table(C[k], f"Candidate {k}-itemsets (C{k})")
+             print(f"\nNo frequent {k}-itemsets found. Terminating.\n\n")
+             break
+
+         print_table(C[k], f"Candidate {k}-itemsets (C{k})")
+         print_table(L[k], f"Frequent {k}-itemsets (L{k})")
+
+         k += 1
+
+
+ def main():
+
+     transactions = {
+         "T100": ["I1", "I2", "I5"],
+         "T200": ["I2", "I4"],
+         "T300": ["I2", "I3"],
+         "T400": ["I1", "I2", "I4"],
+         "T500": ["I1", "I3"],
+         "T600": ["I2", "I3"],
+         "T700": ["I1", "I3"],
+         "T800": ["I1", "I2", "I3", "I5"],
+         "T900": ["I1", "I2", "I3"],
+     }
+
+     min_support = 2
+
+     apriori(transactions, min_support)
+
+
+ if __name__ == "__main__":
+     main()
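The hash in this variant ranks items alphabetically (`I1 -> 1` through `I5 -> 5`) and maps an ordered pair to `(10 * rank1 + rank2) % 7`. A worked check under those ranks:

```python
ranks = {"I1": 1, "I2": 2, "I3": 3, "I4": 4, "I5": 5}
hash_fn = lambda a, b: (ranks[a] * 10 + ranks[b]) % 7

print(hash_fn("I1", "I2"))  # 12 % 7 = 5
print(hash_fn("I2", "I3"))  # 23 % 7 = 2
print(hash_fn("I1", "I5"))  # 15 % 7 = 1
```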
@@ -0,0 +1,35 @@
+ import pandas as pd
+ import numpy as np
+
+ # 1. Reading from CSV
+ def load_csv(file_path):
+     df = pd.read_csv(file_path)
+     return df
+
+ # 2. Reading from Excel
+ def load_excel(file_path, sheet_name=0):
+     df = pd.read_excel(file_path, sheet_name=sheet_name)
+     return df
+
+ # 3. Mean and Median Fill (For Numerical Columns)
+ def impute_numerical(df):
+     # Filling with Mean
+     df_mean = df.copy()
+     # Select only numeric columns for mean/median to avoid errors
+     numeric_cols = df_mean.select_dtypes(include=[np.number]).columns
+     df_mean[numeric_cols] = df_mean[numeric_cols].fillna(df_mean[numeric_cols].mean())
+
+     # Filling with Median
+     df_median = df.copy()
+     df_median[numeric_cols] = df_median[numeric_cols].fillna(df_median[numeric_cols].median())
+
+     return df_mean, df_median
+
+ # 4. General Fill NA (For Categorical/Transaction Data)
+ def fill_general_na(df, value="Unknown"):
+     # Often in transaction data, we fill NaNs with a placeholder or empty string
+     return df.fillna(value)
+
+ # Example Usage:
+ # df = load_csv('transactions.csv')
+ # df_filled = fill_general_na(df, value="Missing_Item")
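A small self-contained demonstration of the imputation helpers above on an in-memory frame (no CSV required); the values are made up for illustration:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "qty": [1.0, np.nan, 3.0, 100.0],
    "item": ["A", None, "C", "D"],
})

df_mean, df_median = impute_numerical(df)  # helpers from the module above
# Mean fill inserts (1 + 3 + 100) / 3 ~= 34.67; median fill inserts 3.0 --
# the median resists the outlier 100.
print(df_mean["qty"].tolist())
print(df_median["qty"].tolist())
print(fill_general_na(df, value="Unknown"))
```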
@@ -0,0 +1,5 @@
+ """IR subpackage - Information Retrieval source code."""
+
+ from .all import all
+
+ __all__ = ["all"]