bm-preprocessing 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,8 @@
1
+ """DM subpackage - Data Mining source code."""
2
+
3
+ from .all import all
4
+ from .apriori import apriori
5
+ from .hash import hash
6
+ from .preprocessing import preprocessing
7
+
8
+ __all__ = ["all", "apriori", "hash", "preprocessing"]
@@ -0,0 +1,30 @@
1
+ """Source code loader for DM/all.py"""
2
+
3
+ from pathlib import Path
4
+
5
+
6
+ class SourceCodeModule:
7
+ """A class that displays source code when printed."""
8
+
9
+ def __init__(self, name: str, source_path: Path):
10
+ self.name = name
11
+ self._source_path = source_path
12
+ self._source_code = None
13
+
14
+ @property
15
+ def source_code(self) -> str:
16
+ """Lazily load source code."""
17
+ if self._source_code is None:
18
+ self._source_code = self._source_path.read_text(encoding='utf-8')
19
+ return self._source_code
20
+
21
+ def __repr__(self) -> str:
22
+ return self.source_code
23
+
24
+ def __str__(self) -> str:
25
+ return self.source_code
26
+
27
+
28
+ # Get the path to the source file
29
+ _source_file = Path(__file__).parent / "sources" / "all.py"
30
+ all = SourceCodeModule("DM.all", _source_file)
@@ -0,0 +1,30 @@
1
+ """Source code loader for DM/apriori.py"""
2
+
3
+ from pathlib import Path
4
+
5
+
6
+ class SourceCodeModule:
7
+ """A class that displays source code when printed."""
8
+
9
+ def __init__(self, name: str, source_path: Path):
10
+ self.name = name
11
+ self._source_path = source_path
12
+ self._source_code = None
13
+
14
+ @property
15
+ def source_code(self) -> str:
16
+ """Lazily load source code."""
17
+ if self._source_code is None:
18
+ self._source_code = self._source_path.read_text(encoding='utf-8')
19
+ return self._source_code
20
+
21
+ def __repr__(self) -> str:
22
+ return self.source_code
23
+
24
+ def __str__(self) -> str:
25
+ return self.source_code
26
+
27
+
28
+ # Get the path to the source file
29
+ _source_file = Path(__file__).parent / "sources" / "apriori.py"
30
+ apriori = SourceCodeModule("DM.apriori", _source_file)
@@ -0,0 +1,30 @@
1
+ """Source code loader for DM/hash.py"""
2
+
3
+ from pathlib import Path
4
+
5
+
6
+ class SourceCodeModule:
7
+ """A class that displays source code when printed."""
8
+
9
+ def __init__(self, name: str, source_path: Path):
10
+ self.name = name
11
+ self._source_path = source_path
12
+ self._source_code = None
13
+
14
+ @property
15
+ def source_code(self) -> str:
16
+ """Lazily load source code."""
17
+ if self._source_code is None:
18
+ self._source_code = self._source_path.read_text(encoding='utf-8')
19
+ return self._source_code
20
+
21
+ def __repr__(self) -> str:
22
+ return self.source_code
23
+
24
+ def __str__(self) -> str:
25
+ return self.source_code
26
+
27
+
28
+ # Get the path to the source file
29
+ _source_file = Path(__file__).parent / "sources" / "hash.py"
30
+ hash = SourceCodeModule("DM.hash", _source_file)
@@ -0,0 +1,30 @@
1
+ """Source code loader for DM/preprocessing.py"""
2
+
3
+ from pathlib import Path
4
+
5
+
6
+ class SourceCodeModule:
7
+ """A class that displays source code when printed."""
8
+
9
+ def __init__(self, name: str, source_path: Path):
10
+ self.name = name
11
+ self._source_path = source_path
12
+ self._source_code = None
13
+
14
+ @property
15
+ def source_code(self) -> str:
16
+ """Lazily load source code."""
17
+ if self._source_code is None:
18
+ self._source_code = self._source_path.read_text(encoding='utf-8')
19
+ return self._source_code
20
+
21
+ def __repr__(self) -> str:
22
+ return self.source_code
23
+
24
+ def __str__(self) -> str:
25
+ return self.source_code
26
+
27
+
28
+ # Get the path to the source file
29
+ _source_file = Path(__file__).parent / "sources" / "preprocessing.py"
30
+ preprocessing = SourceCodeModule("DM.preprocessing", _source_file)
@@ -0,0 +1,157 @@
1
+ from itertools import combinations, chain
2
+ from collections import defaultdict
3
+
4
# Thresholds for the PCY-style Apriori demo below.
min_support = 2
min_conf = 0.7

# Toy market-basket data: transaction id -> set of items.
transactions = {
    "T1": {"I1","I2","I4","I5","I6"},
    "T2": {"I2","I4","I6"},
    "T3": {"I2","I3"},
    "T4": {"I1","I2","I4"},
    "T5": {"I1","I2","I3"},
    "T6": {"I2","I3"},
    "T7": {"I1","I3"},
    "T8": {"I1","I2","I3","I5"},
    "T9": {"I1","I2","I3"},
    "T10": {"I1","I2","I4","I5"},
    "T11": {"I5","I6"}
}

# Keep only candidates whose count meets min_support.
genL = lambda C: {k: v for k, v in C.items() if v >= min_support}

# C[k] = candidate k-itemset counts, L[k] = the frequent subset of C[k].
C, L = {}, {}

# Pass 1: count individual items.
C[1] = defaultdict(int)
for t in transactions.values():
    for i in t:
        C[1][frozenset([i])] += 1
L[1] = genL(C[1])

# PCY hashing pass: hash each sorted pair into b buckets by summed char codes.
h = lambda p, b=7: sum(sum(ord(c) for c in s) for s in p) % b
buckets = defaultdict(int)
for t in transactions.values():
    for p in combinations(sorted(t), 2):
        buckets[h(p)] += 1
# Bucket addresses whose total count meets min_support.
freq = {b for b, c in buckets.items() if c >= min_support}

# Pass 2: count only pairs that hash to a frequent bucket.
C[2] = defaultdict(int)
for t in transactions.values():
    for p in combinations(sorted(t), 2):
        if h(p) in freq:
            C[2][frozenset(p)] += 1
L[2] = genL(C[2])

# Levels k >= 3: join (k-1)-itemsets sharing a (k-2)-prefix, prune by the
# Apriori property, then count candidates against the transactions.
k = 3
while L[k - 1]:
    C[k] = {
        frozenset(a | b): 0
        for a, b in combinations(L[k - 1], 2)
        if sorted(a)[: k - 2] == sorted(b)[: k - 2]
        and all(frozenset(s) in L[k - 1] for s in combinations(a | b, k - 1))
    }
    for t in transactions.values():
        for c in C[k]:
            if c.issubset(t):
                C[k][c] += 1
    L[k] = genL(C[k])
    if not L[k]:
        break
    k += 1

# Flatten all frequent itemsets, then derive association rules.
frequent_itemsets = {k: v for Lk in L.values() for k, v in Lk.items()}
total = len(transactions)
rules = []
for itemset, count in frequent_itemsets.items():
    if len(itemset) < 2:
        continue
    # Every non-empty proper subset becomes a candidate antecedent.
    for a in chain.from_iterable(
        combinations(itemset, r) for r in range(1, len(itemset))
    ):
        antecedent = frozenset(a)
        consequent = itemset - antecedent
        if not consequent:
            continue
        support = count / total
        # confidence = support(itemset) / support(antecedent), both raw counts.
        # NOTE(review): the conditional "1 if len == 1 else len" is a no-op,
        # and C[k] for k >= 2 only holds hash-surviving pairs, so an
        # antecedent pruned by the bucket filter would raise KeyError here --
        # it happens not to for this dataset; confirm before reusing.
        confidence = (
            count / C[1 if len(antecedent) == 1 else len(antecedent)][antecedent]
        )
        if confidence >= min_conf:
            rules.append((antecedent, consequent, support, confidence))

# Report candidate tables, frequent tables, and the rules found.
for k, v in C.items():
    print(
        f"\nC{k}:\n",
        "Empty" if not v else "\n".join(f"{set(x)} : {y}" for x, y in v.items()),
    )
for k, v in L.items():
    print(f"\nL{k}:\n", "Empty" if not v else "\n".join(f"{set(x)} : {v[x]}" for x in v))

print(f"\nAssociation Rules (conf >= {min_conf:.0%}):")
for a, c, s, conf in rules:
    print(f"{set(a)} => {set(c)} | support: {s:.2f}, confidence: {conf:.2f}")
93
+
94
+ from itertools import combinations, chain
95
+
96
# --- Second, standalone demo appended to the same file: Apriori using TID
# sets (vertical layout) on the classic A-B-C-D-E example. It intentionally
# rebinds transactions/min_support/C/L defined by the demo above. ---
transactions = {
    "10": {"A", "C", "D"},
    "20": {"B", "C", "E"},
    "30": {"A", "B", "C", "E"},
    "40": {"B", "E"},
}

min_support = 2
min_conf = 0.7

# Here candidate values are TID *sets*, so support is their length.
genL = lambda C: {k: v for k, v in C.items() if len(v) >= min_support}

# C[1]: item -> set of transaction ids containing it.
C = {1: {}}
for tid, items in transactions.items():
    for i in items:
        C[1].setdefault(frozenset([i]), set()).add(tid)

L = {1: genL(C[1])}

# Join step: union itemsets sharing a (k-2)-prefix; the candidate's TID set
# is the intersection of its parents' TID sets (no extra counting pass).
k = 2
while L[k - 1]:
    prev = list(L[k - 1].keys())
    C[k] = {
        frozenset(a | b): L[k - 1][a] & L[k - 1][b]
        for i, a in enumerate(prev)
        for b in prev[i + 1 :]
        if sorted(a)[: k - 2] == sorted(b)[: k - 2]
    }
    L[k] = genL(C[k])
    if not L[k]:
        break
    k += 1

# Flatten all frequent itemsets, then derive association rules.
frequent_itemsets = {k: v for Lk in L.values() for k, v in Lk.items()}
total = len(transactions)
rules = []
for itemset, tids in frequent_itemsets.items():
    if len(itemset) < 2:
        continue
    # Every non-empty proper subset becomes a candidate antecedent.
    for a in chain.from_iterable(
        combinations(itemset, r) for r in range(1, len(itemset))
    ):
        antecedent = frozenset(a)
        consequent = itemset - antecedent
        if len(consequent) == 0:
            continue
        support = len(tids) / total
        # confidence = |tids(itemset)| / |tids(antecedent)|.
        confidence = len(tids) / len(frequent_itemsets[antecedent])
        if confidence >= min_conf:
            rules.append((antecedent, consequent, support, confidence))

# Report candidate tables, frequent tables, and the rules found.
for k, v in C.items():
    print(f"\nC{k}:")
    print("Empty" if not v else "\n".join(f"{set(x)} : {y}" for x, y in v.items()))

for k, v in L.items():
    print(f"\nL{k}:")
    print("Empty" if not v else "\n".join(f"{set(x)} : {len(y)}" for x, y in v.items()))

print(f"\nAssociation Rules (conf >= {min_conf:.0%}):")
for a, c, s, conf in rules:
    print(f"{set(a)} => {set(c)} | support: {s:.2f}, confidence: {conf:.2f}")
@@ -0,0 +1,107 @@
1
+ from collections import defaultdict
2
+ from itertools import combinations
3
+
4
def print_table(data, title):
    """Print a labelled table, one "itemset: count" row per entry."""
    print(f"\n--- {title} ---")
    for key in data:
        print(f"{key}: {data[key]}")
8
+
9
# Module-level result tables filled in by apriori():
# C[k] = candidate k-itemset counts, L[k] = the frequent subset of C[k].
C = {}
L = {}
11
+
12
+
13
def generate_candidates(prev_frequent_itemsets, k):
    """Join step: union pairs of previous itemsets, keeping unions of size k.

    Returns the candidates as sorted tuples, in sorted order.
    """
    joined = {
        tuple(sorted(set(a) | set(b)))
        for a, b in combinations(prev_frequent_itemsets, 2)
        if len(set(a) | set(b)) == k
    }
    return sorted(joined)
23
+
24
+
25
def count_candidates(candidates, transactions):
    """Count support: how many transactions contain every item of each candidate.

    Candidates with zero support are omitted from the returned mapping.
    """
    support = defaultdict(int)
    baskets = list(transactions.values())
    for candidate in candidates:
        needed = set(candidate)
        hits = sum(1 for basket in baskets if needed.issubset(basket))
        if hits:
            support[candidate] = hits
    return support
35
+
36
+
37
def prune_candidates(candidate_count, min_support, prev_freq_itemsets=None):
    """Keep itemsets meeting min_support whose (k-1)-subsets were all frequent.

    The Apriori-property check is skipped for 1-itemsets or when no previous
    level is supplied.
    """
    kept = {}
    for itemset, count in candidate_count.items():
        if count < min_support:
            continue
        if prev_freq_itemsets is not None and len(itemset) > 1:
            subsets_ok = all(
                tuple(sorted(sub)) in prev_freq_itemsets
                for sub in combinations(itemset, len(itemset) - 1)
            )
            if not subsets_ok:
                continue
        kept[itemset] = count
    return kept
52
+
53
+
54
def apriori(transactions, min_support):
    """Classic Apriori: fill the module-level C (candidates) and L (frequent
    itemsets) tables level by level, printing each table as it is built."""

    # C1: every distinct item as a 1-tuple candidate.
    items = sorted(set(item for transaction in transactions.values() for item in transaction))
    c1_list = [(item,) for item in items]

    C[1] = count_candidates(c1_list, transactions)
    L[1] = prune_candidates(C[1], min_support)

    print_table(C[1], "Candidate 1-itemsets (C1)")
    print_table(L[1], "Frequent 1-itemsets (L1)")

    k = 2

    while True:

        # Join step on the previous level's frequent itemsets.
        candidates = generate_candidates(L[k-1].keys(), k)
        if not candidates:
            break

        C[k] = count_candidates(candidates, transactions)
        # Prune by support and by the Apriori property (all (k-1)-subsets frequent).
        L[k] = prune_candidates(C[k], min_support, L[k-1].keys())

        if not L[k]:
            print_table(C[k], f"Candidate {k}-itemsets (C{k})")
            print(f"\nNo frequent {k}-itemsets found. Terminating.\n\n")
            break

        print_table(C[k], f"Candidate {k}-itemsets (C{k})")
        print_table(L[k], f"Frequent {k}-itemsets (L{k})")

        k += 1
85
+
86
+
87
def main():
    """Run the Apriori demo on the classic 9-transaction textbook dataset."""
    demo_transactions = {
        "T100": ["I1", "I2", "I5"],
        "T200": ["I2", "I4"],
        "T300": ["I2", "I3"],
        "T400": ["I1", "I2", "I4"],
        "T500": ["I1", "I3"],
        "T600": ["I2", "I3"],
        "T700": ["I1", "I3"],
        "T800": ["I1", "I2", "I3", "I5"],
        "T900": ["I1", "I2", "I3"],
    }
    apriori(demo_transactions, 2)


if __name__ == "__main__":
    main()
@@ -0,0 +1,145 @@
1
+ from collections import defaultdict
2
+ from itertools import combinations
3
+
4
def print_table(data, title):
    """Print a labelled table, one "itemset: count" row per entry."""
    print(f"\n--- {title} ---")
    for key in data:
        print(f"{key}: {data[key]}")
8
+
9
# Module-level result tables filled in by apriori():
# C[k] = candidate k-itemset counts, L[k] = the frequent subset of C[k].
C = {}
L = {}
11
+
12
class Bucket:
    """One hash-table bucket for the PCY pair-counting pass.

    Bug fix: the original ``__init__`` took no arguments and never assigned
    ``self.address`` (the bare ``self.address: int`` line is an annotation
    only), yet the caller constructs ``Bucket(addr)`` and later prints
    ``bucket.address`` -- a TypeError/AttributeError at runtime.  ``address``
    is now a real, defaulted constructor parameter, so both ``Bucket()`` and
    ``Bucket(addr)`` work.
    """

    def __init__(self, address: int = 0):
        self.address = address            # bucket slot in the hash table
        self.count: int = 0               # total pairs hashed into this bucket
        self.itemsets: list[tuple] = []   # the pairs themselves
17
+
18
+
19
def generate_candidates(prev_frequent_itemsets, k):
    """Join step: union pairs of previous itemsets, keeping unions of size k.

    Returns the candidates as sorted tuples, in sorted order.
    """
    joined = {
        tuple(sorted(set(a) | set(b)))
        for a, b in combinations(prev_frequent_itemsets, 2)
        if len(set(a) | set(b)) == k
    }
    return sorted(joined)
29
+
30
+
31
def count_candidates(candidates, transactions):
    """Count support: how many transactions contain every item of each candidate.

    Candidates with zero support are omitted from the returned mapping.
    """
    support = defaultdict(int)
    baskets = list(transactions.values())
    for candidate in candidates:
        needed = set(candidate)
        hits = sum(1 for basket in baskets if needed.issubset(basket))
        if hits:
            support[candidate] = hits
    return support
41
+
42
+
43
def prune_candidates(candidate_count, min_support, prev_freq_itemsets=None):
    """Keep itemsets meeting min_support whose (k-1)-subsets were all frequent.

    The Apriori-property check is skipped for 1-itemsets or when no previous
    level is supplied.
    """
    kept = {}
    for itemset, count in candidate_count.items():
        if count < min_support:
            continue
        if prev_freq_itemsets is not None and len(itemset) > 1:
            subsets_ok = all(
                tuple(sorted(sub)) in prev_freq_itemsets
                for sub in combinations(itemset, len(itemset) - 1)
            )
            if not subsets_ok:
                continue
        kept[itemset] = count
    return kept
58
+
59
+
60
def apriori(transactions, min_support):
    """Apriori with a PCY-style hashed pass for the 2-itemset stage.

    Fills the module-level C/L tables and prints each stage.

    Fixes vs. the original:
    - Buckets are built with ``Bucket()`` and an explicit ``address``
      assignment, so construction works regardless of the Bucket
      constructor's signature (the original passed ``Bucket(addr)`` to a
      no-argument ``__init__``).
    - The hashed pass stores its results under the integer key ``2`` (the
      original used the string ``"2"``, so the k=3 loop below crashed with
      KeyError when it looked up ``L[2]``).
    """
    # C1/L1: count individual items and prune by support.
    items = sorted(set(item for transaction in transactions.values() for item in transaction))
    c1_list = [(item,) for item in items]

    C[1] = count_candidates(c1_list, transactions)
    L[1] = prune_candidates(C[1], min_support)

    print_table(C[1], "Candidate 1-itemsets (C1)")
    print_table(L[1], "Frequent 1-itemsets (L1)")

    k = 2

    # All 2-item combinations per transaction (generators, consumed once below).
    transactions_combinations = {
        transaction_id: combinations(items, k)
        for transaction_id, items in transactions.items()
    }

    # Seven hash buckets, addressed 0..6.
    buckets = []
    for addr in range(7):
        bucket = Bucket()
        bucket.address = addr
        buckets.append(bucket)

    # Rank items 1..n in sorted order; the rank pair feeds the hash function.
    items_list = sorted(set(item for transaction in transactions.values() for item in transaction))
    ranks = {item: idx + 1 for idx, item in enumerate(items_list)}

    # Hash a pair of items into one of the 7 buckets.
    hash_fn = lambda item1, item2: (ranks[item1] * 10 + ranks[item2]) % 7

    # Hash every pair of every transaction into its bucket.
    for itemset in transactions_combinations.values():
        for item in itemset:
            item1, item2 = item[0], item[1]
            address = hash_fn(item1, item2)  # already in range(7)
            buckets[address].count += 1
            buckets[address].itemsets.append(item)

    print("\n--- Hash Table Buckets ---")
    for bucket in buckets:
        print(f"Address: {bucket.address}, Count: {bucket.count}, Itemsets: {bucket.itemsets}")

    # Keep pairs landing in a bucket whose total count meets min_support.
    # NOTE(review): the reported value is the *bucket* count, not the pair's
    # own support -- kept as-is to preserve the original demo's behaviour.
    L2 = {itemset: bucket.count for bucket in buckets for itemset in bucket.itemsets if bucket.count >= min_support}
    print_table(L2, "Frequent 2-itemsets after Hashing (L2)")

    # Integer keys so the k >= 3 loop below can find C[2]/L[2].
    C[2] = generate_candidates(L[k - 1].keys(), k)
    L[2] = L2

    k = 3

    while True:

        # Join step on the previous level's frequent itemsets.
        candidates = generate_candidates(L[k - 1].keys(), k)
        if not candidates:
            break

        C[k] = count_candidates(candidates, transactions)
        # Prune by support and by the Apriori property.
        L[k] = prune_candidates(C[k], min_support, L[k - 1].keys())

        if not L[k]:
            print_table(C[k], f"Candidate {k}-itemsets (C{k})")
            print(f"\nNo frequent {k}-itemsets found. Terminating.\n\n")
            break

        print_table(C[k], f"Candidate {k}-itemsets (C{k})")
        print_table(L[k], f"Frequent {k}-itemsets (L{k})")

        k += 1
123
+
124
+
125
def main():
    """Run the hashed Apriori demo on the classic 9-transaction dataset."""
    demo_transactions = {
        "T100": ["I1", "I2", "I5"],
        "T200": ["I2", "I4"],
        "T300": ["I2", "I3"],
        "T400": ["I1", "I2", "I4"],
        "T500": ["I1", "I3"],
        "T600": ["I2", "I3"],
        "T700": ["I1", "I3"],
        "T800": ["I1", "I2", "I3", "I5"],
        "T900": ["I1", "I2", "I3"],
    }
    apriori(demo_transactions, 2)


if __name__ == "__main__":
    main()
@@ -0,0 +1,35 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+
4
# 1. Reading from CSV
def load_csv(file_path):
    """Load a CSV file into a DataFrame."""
    return pd.read_csv(file_path)
8
+
9
# 2. Reading from Excel
def load_excel(file_path, sheet_name=0):
    """Load one sheet of an Excel workbook into a DataFrame (first sheet by default)."""
    return pd.read_excel(file_path, sheet_name=sheet_name)
13
+
14
# 3. Mean and Median Fill (For Numerical Columns)
def impute_numerical(df):
    """Return (mean-filled, median-filled) copies of *df*.

    Only numeric columns are imputed; non-numeric NaNs are left untouched,
    which avoids errors from taking mean/median of object columns.
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    filled = []
    for stat in ("mean", "median"):
        out = df.copy()
        replacement = getattr(out[numeric_cols], stat)()
        out[numeric_cols] = out[numeric_cols].fillna(replacement)
        filled.append(out)
    return tuple(filled)
27
+
28
# 4. General Fill NA (For Categorical/Transaction Data)
def fill_general_na(df, value="Unknown"):
    """Replace every NaN with *value* (a placeholder such as "Unknown"),
    as is common for categorical/transaction data."""
    return df.fillna(value=value)
32
+
33
+ # Example Usage:
34
+ # df = load_csv('transactions.csv')
35
+ # df_filled = fill_general_na(df, value="Missing_Item")
@@ -0,0 +1,5 @@
1
+ """IR subpackage - Information Retrieval source code."""
2
+
3
+ from .all import all
4
+
5
+ __all__ = ["all"]
@@ -0,0 +1,30 @@
1
+ """Source code loader for IR/all.py"""
2
+
3
+ from pathlib import Path
4
+
5
+
6
+ class SourceCodeModule:
7
+ """A class that displays source code when printed."""
8
+
9
+ def __init__(self, name: str, source_path: Path):
10
+ self.name = name
11
+ self._source_path = source_path
12
+ self._source_code = None
13
+
14
+ @property
15
+ def source_code(self) -> str:
16
+ """Lazily load source code."""
17
+ if self._source_code is None:
18
+ self._source_code = self._source_path.read_text(encoding='utf-8')
19
+ return self._source_code
20
+
21
+ def __repr__(self) -> str:
22
+ return self.source_code
23
+
24
+ def __str__(self) -> str:
25
+ return self.source_code
26
+
27
+
28
+ # Get the path to the source file
29
+ _source_file = Path(__file__).parent / "sources" / "all.py"
30
+ all = SourceCodeModule("IR.all", _source_file)
@@ -0,0 +1,186 @@
1
+ import math
2
+ from collections import defaultdict, Counter
3
+ import nltk
4
+ from nltk.corpus import stopwords
5
+ from nltk.stem import PorterStemmer, WordNetLemmatizer
6
+
7
# One-time corpus downloads (no-ops if the data is already present locally).
nltk.download('stopwords')
nltk.download('wordnet')

# Shared preprocessing resources used by preprocess() below.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
13
+
14
def preprocess(text):
    """Lowercase and whitespace-split *text*, drop stopwords, then stem and
    lemmatize each surviving token. Returns the token list."""
    return [
        lemmatizer.lemmatize(stemmer.stem(word))
        for word in text.lower().split()
        if word not in stop_words
    ]
20
+
21
# ---------- Corpus ----------
docs = [
    "information retrieval is fun",
    "retrieval models are boolean vector probabilistic",
    "information theory and probability",
    "boolean retrieval is simple"
]

# Token lists after stopword removal, stemming and lemmatisation.
processed_docs = [preprocess(doc) for doc in docs]
N = len(docs)

# ---------- 2. Term Incidence Matrix ----------
terms = sorted(set(term for doc in processed_docs for term in doc))

# term -> 0/1 row over the documents (1 where the term occurs).
term_incidence = {
    term: [1 if term in doc else 0 for doc in processed_docs]
    for term in terms
}

print("\nTerm Incidence Matrix:")
for term, row in term_incidence.items():
    print(term, row)

# ---------- 3. Inverted Index ----------
inverted_index = defaultdict(list)

# set(doc) so each document id is recorded at most once per term.
for doc_id, doc in enumerate(processed_docs):
    for term in set(doc):
        inverted_index[term].append(doc_id)

print("\nInverted Index:")
for term, postings in inverted_index.items():
    print(term, postings)

# ---------- Query ----------
query = "information AND NOT boolean"
# NOTE(review): query_terms is computed here but boolean_retrieval below
# re-preprocesses each token itself; confirm whether this binding is needed.
query_terms = preprocess(query)
58
+
59
+ # ---------- 4. Boolean Model (AND / OR / NOT) ----------
60
def boolean_retrieval(query):
    """Evaluate a flat boolean query: terms joined by AND/OR, with NOT
    allowed as a prefix on a term. Returns the matching doc-id set.

    Fixes vs. the original:
    - "x AND NOT y" no longer drops the AND: NOT is tracked as a negation
      flag for the next term instead of overwriting the pending AND/OR
      operator (the original returned just the postings of x here).
    - an empty intermediate result is no longer mistaken for "no terms seen
      yet" (a None sentinel is used), so "a AND b AND c" stays empty once
      an AND yields the empty set.
    """
    tokens = query.upper().split()
    result = None          # None = no term processed yet; set() is a valid value
    pending_op = None      # "AND" or "OR" awaiting the next term
    negate_next = False    # True when a NOT precedes the next term

    for token in tokens:
        if token == "NOT":
            negate_next = True
        elif token in {"AND", "OR"}:
            pending_op = token
        else:
            # Normalise the term the same way the index was built.
            stemmed = preprocess(token.lower())
            postings = set()
            if stemmed and stemmed[0] in inverted_index:
                postings = set(inverted_index[stemmed[0]])

            if negate_next:
                postings = set(range(N)) - postings
                negate_next = False

            if result is None:
                result = postings
            elif pending_op == "AND":
                result = result & postings
            elif pending_op == "OR":
                result = result | postings
            pending_op = None

    return result if result is not None else set()
93
+
94
# Run the example query and show which document ids it matches.
boolean_result = boolean_retrieval(query)
print("\nBoolean Retrieval Result:", boolean_result)
96
+
97
+ # ---------- 5. Vector Space Model (TF-IDF) ----------
98
def tf(doc):
    """Raw term frequencies of a tokenised document, as a Counter."""
    return Counter(doc)
100
+
101
def idf(term):
    """Smoothed inverse document frequency over the module corpus.

    Uses log(N / (df + 1)); the +1 avoids division by zero but makes the idf
    of a term present in every document slightly negative.
    """
    df = sum(1 for d in processed_docs if term in d)
    return math.log(N / (df + 1))
104
+
105
def tfidf(doc):
    """TF-IDF weight for each distinct term of *doc* (dict term -> weight).

    Fix: the original rebuilt the full ``tf(doc)`` Counter once per term
    inside the comprehension -- O(len(doc)^2); compute it once instead.
    """
    freqs = tf(doc)
    return {t: freqs[t] * idf(t) for t in doc}
107
+
108
# TF-IDF vectors for every document and for the fixed example query.
doc_vectors = [tfidf(doc) for doc in processed_docs]
query_vector = tfidf(preprocess("information retrieval"))
110
+
111
def cosine_similarity(v1, v2):
    """Cosine of the angle between two sparse vectors (dicts term -> weight).

    Returns 0 when either vector has zero norm.
    """
    dot = sum(v1.get(term, 0) * v2.get(term, 0) for term in set(v1) | set(v2))
    norm1 = math.sqrt(sum(w ** 2 for w in v1.values()))
    norm2 = math.sqrt(sum(w ** 2 for w in v2.values()))
    if not (norm1 and norm2):
        return 0
    return dot / (norm1 * norm2)
116
+
117
# Cosine similarity of the query vector against every document vector.
vsm_scores = {
    i: cosine_similarity(query_vector, doc_vectors[i])
    for i in range(N)
}

print("\nVector Space Model Scores:", vsm_scores)
123
+
124
+ # ---------- 6. Probabilistic Model (BIM with RSV) ----------
125
def bim_rsv(doc, query_terms):
    """Binary Independence Model retrieval status value of *doc*.

    Sums the log-odds weight log((N - df + 0.5) / (df + 0.5)) over the query
    terms that occur in the document; df comes from the module corpus.
    """
    score = 0.0
    for term in query_terms:
        if term not in doc:
            continue
        df = sum(1 for d in processed_docs if term in d)
        score += math.log((N - df + 0.5) / (df + 0.5))
    return score
132
+
133
# BIM RSV of each document for the fixed query "information retrieval".
bim_scores = {
    i: bim_rsv(processed_docs[i], preprocess("information retrieval"))
    for i in range(N)
}

print("\nBIM RSV Scores:", bim_scores)
139
+
140
+ # ---------- 8. Okapi BM25 ----------
141
# Average document length plus the usual BM25 hyper-parameters.
avg_dl = sum(len(doc) for doc in processed_docs) / N
k1, b = 1.5, 0.75
143
+
144
def bm25(doc, query_terms):
    """Okapi BM25 score of *doc* for the query terms.

    Uses the module-level k1, b and avg_dl; idf is the BIM-style
    log((N - df + 0.5) / (df + 0.5)) weight.
    """
    doc_len = len(doc)
    freqs = Counter(doc)
    total = 0.0

    for term in query_terms:
        if term not in freqs:
            continue
        df = sum(1 for d in processed_docs if term in d)
        term_idf = math.log((N - df + 0.5) / (df + 0.5))
        term_tf = freqs[term]
        # Saturating tf component, normalised by document length.
        total += term_idf * ((term_tf * (k1 + 1)) /
                             (term_tf + k1 * (1 - b + b * doc_len / avg_dl)))
    return total
157
+
158
# BM25 score of each document for the fixed query "information retrieval".
bm25_scores = {
    i: bm25(processed_docs[i], preprocess("information retrieval"))
    for i in range(N)
}

print("\nBM25 Scores:", bm25_scores)
164
+
165
+ # ---------- 7. Evaluation Metrics ----------
166
relevant_docs = {0, 3}  # ground truth

def evaluate(retrieved):
    """Return (accuracy, precision, recall, f1) of *retrieved* doc ids
    against the module-level relevant_docs ground truth.

    Note: "accuracy" here is tp / N -- the fraction of all documents that
    are correctly retrieved relevant ones.
    """
    hits = set(retrieved)
    tp = len(hits & relevant_docs)
    fp = len(hits - relevant_docs)
    fn = len(relevant_docs - hits)

    precision = tp / (tp + fp) if tp + fp else 0
    recall = tp / (tp + fn) if tp + fn else 0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0
    accuracy = tp / N

    return accuracy, precision, recall, f1
180
+
181
+ # ---------- 9. Compare Models ----------
182
+ print("\nEvaluation Metrics:")
183
+ print("Boolean:", evaluate(boolean_result))
184
+ print("VSM:", evaluate([i for i, s in vsm_scores.items() if s > 0]))
185
+ print("BIM:", evaluate([i for i, s in bim_scores.items() if s > 0]))
186
+ print("BM25:", evaluate([i for i, s in bm25_scores.items() if s > 0]))
@@ -0,0 +1,6 @@
1
+ """bm-preprocessing: A package to view source code from DM and IR modules."""
2
+
3
+ from . import IR
4
+ from . import DM
5
+
6
+ __all__ = ["IR", "DM"]
@@ -0,0 +1,33 @@
1
+ Metadata-Version: 2.4
2
+ Name: bm-preprocessing
3
+ Version: 0.1.0
4
+ Summary: A package to view source code from DM and IR modules
5
+ Requires-Python: >=3.8
6
+ Requires-Dist: build>=1.2.2.post1
7
+ Requires-Dist: twine>=6.1.0
8
+ Description-Content-Type: text/markdown
9
+
10
+ # bm-preprocessing
11
+
12
+ A Python package that displays source code from DM and IR modules.
13
+
14
+ ## Usage
15
+
16
+ ```python
17
+ from bm_preprocessing.IR import all
18
+ print(all) # Prints the entire source code of IR/all.py
19
+
20
+ from bm_preprocessing.DM import apriori
21
+ print(apriori) # Prints the entire source code of DM/apriori.py
22
+ ```
23
+
24
+ ## Available Modules
25
+
26
+ ### IR
27
+ - `all` - Information Retrieval algorithms
28
+
29
+ ### DM
30
+ - `all` - Data Mining algorithms
31
+ - `apriori` - Apriori algorithm implementation
32
+ - `hash` - Hash-based mining
33
+ - `preprocessing` - Data preprocessing utilities
@@ -0,0 +1,16 @@
1
+ bm_preprocessing/__init__.py,sha256=Df9ccHGwwTDamkW2y_t9Vwq6r975WqfakEHhDTr0nko,143
2
+ bm_preprocessing/DM/__init__.py,sha256=ufUf_cL0MIk_xLsqJvfLybVo-1BwsXtksElCOZLUAe0,225
3
+ bm_preprocessing/DM/all.py,sha256=hL8SptuvZ7HVf4G7e8UvuY-8nDYTF4laJvxvGed720o,855
4
+ bm_preprocessing/DM/apriori.py,sha256=qhtvNW9BY154YZJtAiz-1iOKJcR9AGD3mTXnTlcIeys,871
5
+ bm_preprocessing/DM/hash.py,sha256=FvoqRwVUOy69DnnlUpv5SscXr0L-yz7yl_StgGM8QWQ,859
6
+ bm_preprocessing/DM/preprocessing.py,sha256=bYV2rm5lyjm586pii1esie1K69zzqLCQXrZGDADvpVA,895
7
+ bm_preprocessing/DM/sources/all.py,sha256=YfViWG8ZJXXkjUP8HtMJGH0vK-3agNv2c_7K9R-PiIU,4571
8
+ bm_preprocessing/DM/sources/apriori.py,sha256=8oPKLzKO9vbr1JYdtjPVmDgrgn7S7bFSW-xm1GS-2u0,2984
9
+ bm_preprocessing/DM/sources/hash.py,sha256=PWZUZ1pNUNXPb_CYtN_gXvEichOAvXs4lzb8G715PSY,4305
10
+ bm_preprocessing/DM/sources/preprocessing.py,sha256=qxtvKO14xcQr8V2YI5D436PFWYGxc4D8Fv6vnaYzEww,1156
11
+ bm_preprocessing/IR/__init__.py,sha256=L4iQk_tDloI4qHD9Ym8XDD-_tXOOHiEz2rRAKpUSk4c,103
12
+ bm_preprocessing/IR/all.py,sha256=z_vATjxIn53wBpnzryLB19RvjDj6ZW-U-chhaSB01ac,855
13
+ bm_preprocessing/IR/sources/all.py,sha256=ejTScGnayqZ8Vk6_Nz8NauHtsbDo3lfobH7VPWRv8Ow,5484
14
+ bm_preprocessing-0.1.0.dist-info/METADATA,sha256=DX37CmfxynI-npXPrpdi-nERO0-VKQ6yo3VeQZj6Lw4,798
15
+ bm_preprocessing-0.1.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
16
+ bm_preprocessing-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.28.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any