bm-preprocessing 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,8 @@
1
+ """DM subpackage - Data Mining source code."""
2
+
3
+ from .all import all
4
+ from .apriori import apriori
5
+ from .hash import hash
6
+ from .preprocessing import preprocessing
7
+
8
+ __all__ = ["all", "apriori", "hash", "preprocessing"]
@@ -0,0 +1,30 @@
1
+ """Source code loader for DM/all.py"""
2
+
3
+ from pathlib import Path
4
+
5
+
6
+ class SourceCodeModule:
7
+ """A class that displays source code when printed."""
8
+
9
+ def __init__(self, name: str, source_path: Path):
10
+ self.name = name
11
+ self._source_path = source_path
12
+ self._source_code = None
13
+
14
+ @property
15
+ def source_code(self) -> str:
16
+ """Lazily load source code."""
17
+ if self._source_code is None:
18
+ self._source_code = self._source_path.read_text(encoding='utf-8')
19
+ return self._source_code
20
+
21
+ def __repr__(self) -> str:
22
+ return self.source_code
23
+
24
+ def __str__(self) -> str:
25
+ return self.source_code
26
+
27
+
28
+ # Get the path to the source file
29
+ _source_file = Path(__file__).parent / "sources" / "all.py"
30
+ all = SourceCodeModule("DM.all", _source_file)
@@ -0,0 +1,30 @@
1
+ """Source code loader for DM/apriori.py"""
2
+
3
+ from pathlib import Path
4
+
5
+
6
+ class SourceCodeModule:
7
+ """A class that displays source code when printed."""
8
+
9
+ def __init__(self, name: str, source_path: Path):
10
+ self.name = name
11
+ self._source_path = source_path
12
+ self._source_code = None
13
+
14
+ @property
15
+ def source_code(self) -> str:
16
+ """Lazily load source code."""
17
+ if self._source_code is None:
18
+ self._source_code = self._source_path.read_text(encoding='utf-8')
19
+ return self._source_code
20
+
21
+ def __repr__(self) -> str:
22
+ return self.source_code
23
+
24
+ def __str__(self) -> str:
25
+ return self.source_code
26
+
27
+
28
+ # Get the path to the source file
29
+ _source_file = Path(__file__).parent / "sources" / "apriori.py"
30
+ apriori = SourceCodeModule("DM.apriori", _source_file)
@@ -0,0 +1,30 @@
1
+ """Source code loader for DM/hash.py"""
2
+
3
+ from pathlib import Path
4
+
5
+
6
+ class SourceCodeModule:
7
+ """A class that displays source code when printed."""
8
+
9
+ def __init__(self, name: str, source_path: Path):
10
+ self.name = name
11
+ self._source_path = source_path
12
+ self._source_code = None
13
+
14
+ @property
15
+ def source_code(self) -> str:
16
+ """Lazily load source code."""
17
+ if self._source_code is None:
18
+ self._source_code = self._source_path.read_text(encoding='utf-8')
19
+ return self._source_code
20
+
21
+ def __repr__(self) -> str:
22
+ return self.source_code
23
+
24
+ def __str__(self) -> str:
25
+ return self.source_code
26
+
27
+
28
+ # Get the path to the source file
29
+ _source_file = Path(__file__).parent / "sources" / "hash.py"
30
+ hash = SourceCodeModule("DM.hash", _source_file)
@@ -0,0 +1,30 @@
1
+ """Source code loader for DM/preprocessing.py"""
2
+
3
+ from pathlib import Path
4
+
5
+
6
+ class SourceCodeModule:
7
+ """A class that displays source code when printed."""
8
+
9
+ def __init__(self, name: str, source_path: Path):
10
+ self.name = name
11
+ self._source_path = source_path
12
+ self._source_code = None
13
+
14
+ @property
15
+ def source_code(self) -> str:
16
+ """Lazily load source code."""
17
+ if self._source_code is None:
18
+ self._source_code = self._source_path.read_text(encoding='utf-8')
19
+ return self._source_code
20
+
21
+ def __repr__(self) -> str:
22
+ return self.source_code
23
+
24
+ def __str__(self) -> str:
25
+ return self.source_code
26
+
27
+
28
+ # Get the path to the source file
29
+ _source_file = Path(__file__).parent / "sources" / "preprocessing.py"
30
+ preprocessing = SourceCodeModule("DM.preprocessing", _source_file)
@@ -0,0 +1,157 @@
1
+ from itertools import combinations, chain
2
+ from collections import defaultdict
3
+
4
# Thresholds for the PCY-style Apriori demo below.
min_support = 2
min_conf = 0.7

# Toy market-basket data: transaction id -> set of items.
transactions = {
    "T1": {"I1","I2","I4","I5","I6"},
    "T2": {"I2","I4","I6"},
    "T3": {"I2","I3"},
    "T4": {"I1","I2","I4"},
    "T5": {"I1","I2","I3"},
    "T6": {"I2","I3"},
    "T7": {"I1","I3"},
    "T8": {"I1","I2","I3","I5"},
    "T9": {"I1","I2","I3"},
    "T10": {"I1","I2","I4","I5"},
    "T11": {"I5","I6"}
}

# Keep only candidates whose count meets min_support.
genL = lambda C: {k: v for k, v in C.items() if v >= min_support}

# C[k] = candidate k-itemset counts, L[k] = the frequent subset of C[k].
C, L = {}, {}

# Pass 1: count individual items.
C[1] = defaultdict(int)
for t in transactions.values():
    for i in t:
        C[1][frozenset([i])] += 1
L[1] = genL(C[1])

# PCY hashing pass: hash each sorted pair into b buckets by summed char codes.
h = lambda p, b=7: sum(sum(ord(c) for c in s) for s in p) % b
buckets = defaultdict(int)
for t in transactions.values():
    for p in combinations(sorted(t), 2):
        buckets[h(p)] += 1
# Bucket addresses whose total count meets min_support.
freq = {b for b, c in buckets.items() if c >= min_support}

# Pass 2: count only pairs that hash to a frequent bucket.
C[2] = defaultdict(int)
for t in transactions.values():
    for p in combinations(sorted(t), 2):
        if h(p) in freq:
            C[2][frozenset(p)] += 1
L[2] = genL(C[2])

# Levels k >= 3: join (k-1)-itemsets sharing a (k-2)-prefix, prune by the
# Apriori property, then count candidates against the transactions.
k = 3
while L[k - 1]:
    C[k] = {
        frozenset(a | b): 0
        for a, b in combinations(L[k - 1], 2)
        if sorted(a)[: k - 2] == sorted(b)[: k - 2]
        and all(frozenset(s) in L[k - 1] for s in combinations(a | b, k - 1))
    }
    for t in transactions.values():
        for c in C[k]:
            if c.issubset(t):
                C[k][c] += 1
    L[k] = genL(C[k])
    if not L[k]:
        break
    k += 1

# Flatten all frequent itemsets, then derive association rules.
frequent_itemsets = {k: v for Lk in L.values() for k, v in Lk.items()}
total = len(transactions)
rules = []
for itemset, count in frequent_itemsets.items():
    if len(itemset) < 2:
        continue
    # Every non-empty proper subset becomes a candidate antecedent.
    for a in chain.from_iterable(
        combinations(itemset, r) for r in range(1, len(itemset))
    ):
        antecedent = frozenset(a)
        consequent = itemset - antecedent
        if not consequent:
            continue
        support = count / total
        # confidence = support(itemset) / support(antecedent), both raw counts.
        # NOTE(review): the conditional "1 if len == 1 else len" is a no-op,
        # and C[k] for k >= 2 only holds hash-surviving pairs, so an
        # antecedent pruned by the bucket filter would raise KeyError here --
        # it happens not to for this dataset; confirm before reusing.
        confidence = (
            count / C[1 if len(antecedent) == 1 else len(antecedent)][antecedent]
        )
        if confidence >= min_conf:
            rules.append((antecedent, consequent, support, confidence))

# Report candidate tables, frequent tables, and the rules found.
for k, v in C.items():
    print(
        f"\nC{k}:\n",
        "Empty" if not v else "\n".join(f"{set(x)} : {y}" for x, y in v.items()),
    )
for k, v in L.items():
    print(f"\nL{k}:\n", "Empty" if not v else "\n".join(f"{set(x)} : {v[x]}" for x in v))

print(f"\nAssociation Rules (conf >= {min_conf:.0%}):")
for a, c, s, conf in rules:
    print(f"{set(a)} => {set(c)} | support: {s:.2f}, confidence: {conf:.2f}")
93
+
94
+ from itertools import combinations, chain
95
+
96
# --- Second, standalone demo appended to the same file: Apriori using TID
# sets (vertical layout) on the classic A-B-C-D-E example. It intentionally
# rebinds transactions/min_support/C/L defined by the demo above. ---
transactions = {
    "10": {"A", "C", "D"},
    "20": {"B", "C", "E"},
    "30": {"A", "B", "C", "E"},
    "40": {"B", "E"},
}

min_support = 2
min_conf = 0.7

# Here candidate values are TID *sets*, so support is their length.
genL = lambda C: {k: v for k, v in C.items() if len(v) >= min_support}

# C[1]: item -> set of transaction ids containing it.
C = {1: {}}
for tid, items in transactions.items():
    for i in items:
        C[1].setdefault(frozenset([i]), set()).add(tid)

L = {1: genL(C[1])}

# Join step: union itemsets sharing a (k-2)-prefix; the candidate's TID set
# is the intersection of its parents' TID sets (no extra counting pass).
k = 2
while L[k - 1]:
    prev = list(L[k - 1].keys())
    C[k] = {
        frozenset(a | b): L[k - 1][a] & L[k - 1][b]
        for i, a in enumerate(prev)
        for b in prev[i + 1 :]
        if sorted(a)[: k - 2] == sorted(b)[: k - 2]
    }
    L[k] = genL(C[k])
    if not L[k]:
        break
    k += 1

# Flatten all frequent itemsets, then derive association rules.
frequent_itemsets = {k: v for Lk in L.values() for k, v in Lk.items()}
total = len(transactions)
rules = []
for itemset, tids in frequent_itemsets.items():
    if len(itemset) < 2:
        continue
    # Every non-empty proper subset becomes a candidate antecedent.
    for a in chain.from_iterable(
        combinations(itemset, r) for r in range(1, len(itemset))
    ):
        antecedent = frozenset(a)
        consequent = itemset - antecedent
        if len(consequent) == 0:
            continue
        support = len(tids) / total
        # confidence = |tids(itemset)| / |tids(antecedent)|.
        confidence = len(tids) / len(frequent_itemsets[antecedent])
        if confidence >= min_conf:
            rules.append((antecedent, consequent, support, confidence))

# Report candidate tables, frequent tables, and the rules found.
for k, v in C.items():
    print(f"\nC{k}:")
    print("Empty" if not v else "\n".join(f"{set(x)} : {y}" for x, y in v.items()))

for k, v in L.items():
    print(f"\nL{k}:")
    print("Empty" if not v else "\n".join(f"{set(x)} : {len(y)}" for x, y in v.items()))

print(f"\nAssociation Rules (conf >= {min_conf:.0%}):")
for a, c, s, conf in rules:
    print(f"{set(a)} => {set(c)} | support: {s:.2f}, confidence: {conf:.2f}")
@@ -0,0 +1,107 @@
1
+ from collections import defaultdict
2
+ from itertools import combinations
3
+
4
def print_table(data, title):
    """Print a labelled table, one "itemset: count" row per entry."""
    print(f"\n--- {title} ---")
    for key in data:
        print(f"{key}: {data[key]}")
8
+
9
# Module-level result tables filled in by apriori():
# C[k] = candidate k-itemset counts, L[k] = the frequent subset of C[k].
C = {}
L = {}
11
+
12
+
13
def generate_candidates(prev_frequent_itemsets, k):
    """Join step: union pairs of previous itemsets, keeping unions of size k.

    Returns the candidates as sorted tuples, in sorted order.
    """
    joined = {
        tuple(sorted(set(a) | set(b)))
        for a, b in combinations(prev_frequent_itemsets, 2)
        if len(set(a) | set(b)) == k
    }
    return sorted(joined)
23
+
24
+
25
def count_candidates(candidates, transactions):
    """Count support: how many transactions contain every item of each candidate.

    Candidates with zero support are omitted from the returned mapping.
    """
    support = defaultdict(int)
    baskets = list(transactions.values())
    for candidate in candidates:
        needed = set(candidate)
        hits = sum(1 for basket in baskets if needed.issubset(basket))
        if hits:
            support[candidate] = hits
    return support
35
+
36
+
37
def prune_candidates(candidate_count, min_support, prev_freq_itemsets=None):
    """Keep itemsets meeting min_support whose (k-1)-subsets were all frequent.

    The Apriori-property check is skipped for 1-itemsets or when no previous
    level is supplied.
    """
    kept = {}
    for itemset, count in candidate_count.items():
        if count < min_support:
            continue
        if prev_freq_itemsets is not None and len(itemset) > 1:
            subsets_ok = all(
                tuple(sorted(sub)) in prev_freq_itemsets
                for sub in combinations(itemset, len(itemset) - 1)
            )
            if not subsets_ok:
                continue
        kept[itemset] = count
    return kept
52
+
53
+
54
def apriori(transactions, min_support):
    """Classic Apriori: fill the module-level C (candidates) and L (frequent
    itemsets) tables level by level, printing each table as it is built."""

    # C1: every distinct item as a 1-tuple candidate.
    items = sorted(set(item for transaction in transactions.values() for item in transaction))
    c1_list = [(item,) for item in items]

    C[1] = count_candidates(c1_list, transactions)
    L[1] = prune_candidates(C[1], min_support)

    print_table(C[1], "Candidate 1-itemsets (C1)")
    print_table(L[1], "Frequent 1-itemsets (L1)")

    k = 2

    while True:

        # Join step on the previous level's frequent itemsets.
        candidates = generate_candidates(L[k-1].keys(), k)
        if not candidates:
            break

        C[k] = count_candidates(candidates, transactions)
        # Prune by support and by the Apriori property (all (k-1)-subsets frequent).
        L[k] = prune_candidates(C[k], min_support, L[k-1].keys())

        if not L[k]:
            print_table(C[k], f"Candidate {k}-itemsets (C{k})")
            print(f"\nNo frequent {k}-itemsets found. Terminating.\n\n")
            break

        print_table(C[k], f"Candidate {k}-itemsets (C{k})")
        print_table(L[k], f"Frequent {k}-itemsets (L{k})")

        k += 1
85
+
86
+
87
def main():
    """Run the Apriori demo on the classic 9-transaction textbook dataset."""
    demo_transactions = {
        "T100": ["I1", "I2", "I5"],
        "T200": ["I2", "I4"],
        "T300": ["I2", "I3"],
        "T400": ["I1", "I2", "I4"],
        "T500": ["I1", "I3"],
        "T600": ["I2", "I3"],
        "T700": ["I1", "I3"],
        "T800": ["I1", "I2", "I3", "I5"],
        "T900": ["I1", "I2", "I3"],
    }
    apriori(demo_transactions, 2)


if __name__ == "__main__":
    main()
@@ -0,0 +1,145 @@
1
+ from collections import defaultdict
2
+ from itertools import combinations
3
+
4
def print_table(data, title):
    """Print a labelled table, one "itemset: count" row per entry."""
    print(f"\n--- {title} ---")
    for key in data:
        print(f"{key}: {data[key]}")
8
+
9
# Module-level result tables filled in by apriori():
# C[k] = candidate k-itemset counts, L[k] = the frequent subset of C[k].
C = {}
L = {}
11
+
12
class Bucket:
    """One hash-table bucket for the PCY pair-counting pass.

    Bug fix: the original ``__init__`` took no arguments and never assigned
    ``self.address`` (the bare ``self.address: int`` line is an annotation
    only), yet the caller constructs ``Bucket(addr)`` and later prints
    ``bucket.address`` -- a TypeError/AttributeError at runtime.  ``address``
    is now a real, defaulted constructor parameter, so both ``Bucket()`` and
    ``Bucket(addr)`` work.
    """

    def __init__(self, address: int = 0):
        self.address = address            # bucket slot in the hash table
        self.count: int = 0               # total pairs hashed into this bucket
        self.itemsets: list[tuple] = []   # the pairs themselves
17
+
18
+
19
def generate_candidates(prev_frequent_itemsets, k):
    """Join step: union pairs of previous itemsets, keeping unions of size k.

    Returns the candidates as sorted tuples, in sorted order.
    """
    joined = {
        tuple(sorted(set(a) | set(b)))
        for a, b in combinations(prev_frequent_itemsets, 2)
        if len(set(a) | set(b)) == k
    }
    return sorted(joined)
29
+
30
+
31
def count_candidates(candidates, transactions):
    """Count support: how many transactions contain every item of each candidate.

    Candidates with zero support are omitted from the returned mapping.
    """
    support = defaultdict(int)
    baskets = list(transactions.values())
    for candidate in candidates:
        needed = set(candidate)
        hits = sum(1 for basket in baskets if needed.issubset(basket))
        if hits:
            support[candidate] = hits
    return support
41
+
42
+
43
def prune_candidates(candidate_count, min_support, prev_freq_itemsets=None):
    """Keep itemsets meeting min_support whose (k-1)-subsets were all frequent.

    The Apriori-property check is skipped for 1-itemsets or when no previous
    level is supplied.
    """
    kept = {}
    for itemset, count in candidate_count.items():
        if count < min_support:
            continue
        if prev_freq_itemsets is not None and len(itemset) > 1:
            subsets_ok = all(
                tuple(sorted(sub)) in prev_freq_itemsets
                for sub in combinations(itemset, len(itemset) - 1)
            )
            if not subsets_ok:
                continue
        kept[itemset] = count
    return kept
58
+
59
+
60
def apriori(transactions, min_support):
    """Apriori with a PCY-style hashed pass for the 2-itemset stage.

    Fills the module-level C/L tables and prints each stage.

    Fixes vs. the original:
    - Buckets are built with ``Bucket()`` and an explicit ``address``
      assignment, so construction works regardless of the Bucket
      constructor's signature (the original passed ``Bucket(addr)`` to a
      no-argument ``__init__``).
    - The hashed pass stores its results under the integer key ``2`` (the
      original used the string ``"2"``, so the k=3 loop below crashed with
      KeyError when it looked up ``L[2]``).
    """
    # C1/L1: count individual items and prune by support.
    items = sorted(set(item for transaction in transactions.values() for item in transaction))
    c1_list = [(item,) for item in items]

    C[1] = count_candidates(c1_list, transactions)
    L[1] = prune_candidates(C[1], min_support)

    print_table(C[1], "Candidate 1-itemsets (C1)")
    print_table(L[1], "Frequent 1-itemsets (L1)")

    k = 2

    # All 2-item combinations per transaction (generators, consumed once below).
    transactions_combinations = {
        transaction_id: combinations(items, k)
        for transaction_id, items in transactions.items()
    }

    # Seven hash buckets, addressed 0..6.
    buckets = []
    for addr in range(7):
        bucket = Bucket()
        bucket.address = addr
        buckets.append(bucket)

    # Rank items 1..n in sorted order; the rank pair feeds the hash function.
    items_list = sorted(set(item for transaction in transactions.values() for item in transaction))
    ranks = {item: idx + 1 for idx, item in enumerate(items_list)}

    # Hash a pair of items into one of the 7 buckets.
    hash_fn = lambda item1, item2: (ranks[item1] * 10 + ranks[item2]) % 7

    # Hash every pair of every transaction into its bucket.
    for itemset in transactions_combinations.values():
        for item in itemset:
            item1, item2 = item[0], item[1]
            address = hash_fn(item1, item2)  # already in range(7)
            buckets[address].count += 1
            buckets[address].itemsets.append(item)

    print("\n--- Hash Table Buckets ---")
    for bucket in buckets:
        print(f"Address: {bucket.address}, Count: {bucket.count}, Itemsets: {bucket.itemsets}")

    # Keep pairs landing in a bucket whose total count meets min_support.
    # NOTE(review): the reported value is the *bucket* count, not the pair's
    # own support -- kept as-is to preserve the original demo's behaviour.
    L2 = {itemset: bucket.count for bucket in buckets for itemset in bucket.itemsets if bucket.count >= min_support}
    print_table(L2, "Frequent 2-itemsets after Hashing (L2)")

    # Integer keys so the k >= 3 loop below can find C[2]/L[2].
    C[2] = generate_candidates(L[k - 1].keys(), k)
    L[2] = L2

    k = 3

    while True:

        # Join step on the previous level's frequent itemsets.
        candidates = generate_candidates(L[k - 1].keys(), k)
        if not candidates:
            break

        C[k] = count_candidates(candidates, transactions)
        # Prune by support and by the Apriori property.
        L[k] = prune_candidates(C[k], min_support, L[k - 1].keys())

        if not L[k]:
            print_table(C[k], f"Candidate {k}-itemsets (C{k})")
            print(f"\nNo frequent {k}-itemsets found. Terminating.\n\n")
            break

        print_table(C[k], f"Candidate {k}-itemsets (C{k})")
        print_table(L[k], f"Frequent {k}-itemsets (L{k})")

        k += 1
123
+
124
+
125
def main():
    """Run the hashed Apriori demo on the classic 9-transaction dataset."""
    demo_transactions = {
        "T100": ["I1", "I2", "I5"],
        "T200": ["I2", "I4"],
        "T300": ["I2", "I3"],
        "T400": ["I1", "I2", "I4"],
        "T500": ["I1", "I3"],
        "T600": ["I2", "I3"],
        "T700": ["I1", "I3"],
        "T800": ["I1", "I2", "I3", "I5"],
        "T900": ["I1", "I2", "I3"],
    }
    apriori(demo_transactions, 2)


if __name__ == "__main__":
    main()
@@ -0,0 +1,35 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+
4
# 1. Reading from CSV
def load_csv(file_path):
    """Load a CSV file into a DataFrame."""
    return pd.read_csv(file_path)
8
+
9
# 2. Reading from Excel
def load_excel(file_path, sheet_name=0):
    """Load one sheet of an Excel workbook into a DataFrame (first sheet by default)."""
    return pd.read_excel(file_path, sheet_name=sheet_name)
13
+
14
# 3. Mean and Median Fill (For Numerical Columns)
def impute_numerical(df):
    """Return (mean-filled, median-filled) copies of *df*.

    Only numeric columns are imputed; non-numeric NaNs are left untouched,
    which avoids errors from taking mean/median of object columns.
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    filled = []
    for stat in ("mean", "median"):
        out = df.copy()
        replacement = getattr(out[numeric_cols], stat)()
        out[numeric_cols] = out[numeric_cols].fillna(replacement)
        filled.append(out)
    return tuple(filled)
27
+
28
# 4. General Fill NA (For Categorical/Transaction Data)
def fill_general_na(df, value="Unknown"):
    """Replace every NaN with *value* (a placeholder such as "Unknown"),
    as is common for categorical/transaction data."""
    return df.fillna(value=value)
32
+
33
+ # Example Usage:
34
+ # df = load_csv('transactions.csv')
35
+ # df_filled = fill_general_na(df, value="Missing_Item")
@@ -0,0 +1,5 @@
1
+ """IR subpackage - Information Retrieval source code."""
2
+
3
+ from .all import all
4
+
5
+ __all__ = ["all"]
@@ -0,0 +1,30 @@
1
+ """Source code loader for IR/all.py"""
2
+
3
+ from pathlib import Path
4
+
5
+
6
+ class SourceCodeModule:
7
+ """A class that displays source code when printed."""
8
+
9
+ def __init__(self, name: str, source_path: Path):
10
+ self.name = name
11
+ self._source_path = source_path
12
+ self._source_code = None
13
+
14
+ @property
15
+ def source_code(self) -> str:
16
+ """Lazily load source code."""
17
+ if self._source_code is None:
18
+ self._source_code = self._source_path.read_text(encoding='utf-8')
19
+ return self._source_code
20
+
21
+ def __repr__(self) -> str:
22
+ return self.source_code
23
+
24
+ def __str__(self) -> str:
25
+ return self.source_code
26
+
27
+
28
+ # Get the path to the source file
29
+ _source_file = Path(__file__).parent / "sources" / "all.py"
30
+ all = SourceCodeModule("IR.all", _source_file)
@@ -0,0 +1,186 @@
1
+ import math
2
+ from collections import defaultdict, Counter
3
+ import nltk
4
+ from nltk.corpus import stopwords
5
+ from nltk.stem import PorterStemmer, WordNetLemmatizer
6
+
7
# One-time corpus downloads (no-ops if the data is already present locally).
nltk.download('stopwords')
nltk.download('wordnet')

# Shared preprocessing resources used by preprocess() below.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
13
+
14
def preprocess(text):
    """Lowercase and whitespace-split *text*, drop stopwords, then stem and
    lemmatize each surviving token. Returns the token list."""
    return [
        lemmatizer.lemmatize(stemmer.stem(word))
        for word in text.lower().split()
        if word not in stop_words
    ]
20
+
21
# ---------- Corpus ----------
docs = [
    "information retrieval is fun",
    "retrieval models are boolean vector probabilistic",
    "information theory and probability",
    "boolean retrieval is simple"
]

# Token lists after stopword removal, stemming and lemmatisation.
processed_docs = [preprocess(doc) for doc in docs]
N = len(docs)

# ---------- 2. Term Incidence Matrix ----------
terms = sorted(set(term for doc in processed_docs for term in doc))

# term -> 0/1 row over the documents (1 where the term occurs).
term_incidence = {
    term: [1 if term in doc else 0 for doc in processed_docs]
    for term in terms
}

print("\nTerm Incidence Matrix:")
for term, row in term_incidence.items():
    print(term, row)

# ---------- 3. Inverted Index ----------
inverted_index = defaultdict(list)

# set(doc) so each document id is recorded at most once per term.
for doc_id, doc in enumerate(processed_docs):
    for term in set(doc):
        inverted_index[term].append(doc_id)

print("\nInverted Index:")
for term, postings in inverted_index.items():
    print(term, postings)

# ---------- Query ----------
query = "information AND NOT boolean"
# NOTE(review): query_terms is computed here but boolean_retrieval below
# re-preprocesses each token itself; confirm whether this binding is needed.
query_terms = preprocess(query)
58
+
59
+ # ---------- 4. Boolean Model (AND / OR / NOT) ----------
60
def boolean_retrieval(query):
    """Evaluate a flat boolean query: terms joined by AND/OR, with NOT
    allowed as a prefix on a term. Returns the matching doc-id set.

    Fixes vs. the original:
    - "x AND NOT y" no longer drops the AND: NOT is tracked as a negation
      flag for the next term instead of overwriting the pending AND/OR
      operator (the original returned just the postings of x here).
    - an empty intermediate result is no longer mistaken for "no terms seen
      yet" (a None sentinel is used), so "a AND b AND c" stays empty once
      an AND yields the empty set.
    """
    tokens = query.upper().split()
    result = None          # None = no term processed yet; set() is a valid value
    pending_op = None      # "AND" or "OR" awaiting the next term
    negate_next = False    # True when a NOT precedes the next term

    for token in tokens:
        if token == "NOT":
            negate_next = True
        elif token in {"AND", "OR"}:
            pending_op = token
        else:
            # Normalise the term the same way the index was built.
            stemmed = preprocess(token.lower())
            postings = set()
            if stemmed and stemmed[0] in inverted_index:
                postings = set(inverted_index[stemmed[0]])

            if negate_next:
                postings = set(range(N)) - postings
                negate_next = False

            if result is None:
                result = postings
            elif pending_op == "AND":
                result = result & postings
            elif pending_op == "OR":
                result = result | postings
            pending_op = None

    return result if result is not None else set()
93
+
94
# Run the example query and show which document ids it matches.
boolean_result = boolean_retrieval(query)
print("\nBoolean Retrieval Result:", boolean_result)
96
+
97
+ # ---------- 5. Vector Space Model (TF-IDF) ----------
98
def tf(doc):
    """Raw term frequencies of a tokenised document, as a Counter."""
    return Counter(doc)
100
+
101
def idf(term):
    """Smoothed inverse document frequency over the module corpus.

    Uses log(N / (df + 1)); the +1 avoids division by zero but makes the idf
    of a term present in every document slightly negative.
    """
    df = sum(1 for d in processed_docs if term in d)
    return math.log(N / (df + 1))
104
+
105
def tfidf(doc):
    """TF-IDF weight for each distinct term of *doc* (dict term -> weight).

    Fix: the original rebuilt the full ``tf(doc)`` Counter once per term
    inside the comprehension -- O(len(doc)^2); compute it once instead.
    """
    freqs = tf(doc)
    return {t: freqs[t] * idf(t) for t in doc}
107
+
108
# TF-IDF vectors for every document and for the fixed example query.
doc_vectors = [tfidf(doc) for doc in processed_docs]
query_vector = tfidf(preprocess("information retrieval"))
110
+
111
def cosine_similarity(v1, v2):
    """Cosine of the angle between two sparse vectors (dicts term -> weight).

    Returns 0 when either vector has zero norm.
    """
    dot = sum(v1.get(term, 0) * v2.get(term, 0) for term in set(v1) | set(v2))
    norm1 = math.sqrt(sum(w ** 2 for w in v1.values()))
    norm2 = math.sqrt(sum(w ** 2 for w in v2.values()))
    if not (norm1 and norm2):
        return 0
    return dot / (norm1 * norm2)
116
+
117
# Cosine similarity of the query vector against every document vector.
vsm_scores = {
    i: cosine_similarity(query_vector, doc_vectors[i])
    for i in range(N)
}

print("\nVector Space Model Scores:", vsm_scores)
123
+
124
+ # ---------- 6. Probabilistic Model (BIM with RSV) ----------
125
def bim_rsv(doc, query_terms):
    """Binary Independence Model retrieval status value of *doc*.

    Sums the log-odds weight log((N - df + 0.5) / (df + 0.5)) over the query
    terms that occur in the document; df comes from the module corpus.
    """
    score = 0.0
    for term in query_terms:
        if term not in doc:
            continue
        df = sum(1 for d in processed_docs if term in d)
        score += math.log((N - df + 0.5) / (df + 0.5))
    return score
132
+
133
# BIM RSV of each document for the fixed query "information retrieval".
bim_scores = {
    i: bim_rsv(processed_docs[i], preprocess("information retrieval"))
    for i in range(N)
}

print("\nBIM RSV Scores:", bim_scores)
139
+
140
+ # ---------- 8. Okapi BM25 ----------
141
# Average document length plus the usual BM25 hyper-parameters.
avg_dl = sum(len(doc) for doc in processed_docs) / N
k1, b = 1.5, 0.75
143
+
144
def bm25(doc, query_terms):
    """Okapi BM25 score of *doc* for the query terms.

    Uses the module-level k1, b and avg_dl; idf is the BIM-style
    log((N - df + 0.5) / (df + 0.5)) weight.
    """
    doc_len = len(doc)
    freqs = Counter(doc)
    total = 0.0

    for term in query_terms:
        if term not in freqs:
            continue
        df = sum(1 for d in processed_docs if term in d)
        term_idf = math.log((N - df + 0.5) / (df + 0.5))
        term_tf = freqs[term]
        # Saturating tf component, normalised by document length.
        total += term_idf * ((term_tf * (k1 + 1)) /
                             (term_tf + k1 * (1 - b + b * doc_len / avg_dl)))
    return total
157
+
158
# BM25 score of each document for the fixed query "information retrieval".
bm25_scores = {
    i: bm25(processed_docs[i], preprocess("information retrieval"))
    for i in range(N)
}

print("\nBM25 Scores:", bm25_scores)
164
+
165
+ # ---------- 7. Evaluation Metrics ----------
166
relevant_docs = {0, 3}  # ground truth

def evaluate(retrieved):
    """Return (accuracy, precision, recall, f1) of *retrieved* doc ids
    against the module-level relevant_docs ground truth.

    Note: "accuracy" here is tp / N -- the fraction of all documents that
    are correctly retrieved relevant ones.
    """
    hits = set(retrieved)
    tp = len(hits & relevant_docs)
    fp = len(hits - relevant_docs)
    fn = len(relevant_docs - hits)

    precision = tp / (tp + fp) if tp + fp else 0
    recall = tp / (tp + fn) if tp + fn else 0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0
    accuracy = tp / N

    return accuracy, precision, recall, f1
180
+
181
+ # ---------- 9. Compare Models ----------
182
+ print("\nEvaluation Metrics:")
183
+ print("Boolean:", evaluate(boolean_result))
184
+ print("VSM:", evaluate([i for i, s in vsm_scores.items() if s > 0]))
185
+ print("BIM:", evaluate([i for i, s in bim_scores.items() if s > 0]))
186
+ print("BM25:", evaluate([i for i, s in bm25_scores.items() if s > 0]))
@@ -0,0 +1,6 @@
1
+ """bm-preprocessing: A package to view source code from DM and IR modules."""
2
+
3
+ from . import IR
4
+ from . import DM
5
+
6
+ __all__ = ["IR", "DM"]
@@ -0,0 +1,33 @@
1
+ Metadata-Version: 2.4
2
+ Name: bm-preprocessing
3
+ Version: 0.1.0
4
+ Summary: A package to view source code from DM and IR modules
5
+ Requires-Python: >=3.8
6
+ Requires-Dist: build>=1.2.2.post1
7
+ Requires-Dist: twine>=6.1.0
8
+ Description-Content-Type: text/markdown
9
+
10
+ # bm-preprocessing
11
+
12
+ A Python package that displays source code from DM and IR modules.
13
+
14
+ ## Usage
15
+
16
+ ```python
17
+ from bm_preprocessing.IR import all
18
+ print(all) # Prints the entire source code of IR/all.py
19
+
20
+ from bm_preprocessing.DM import apriori
21
+ print(apriori) # Prints the entire source code of DM/apriori.py
22
+ ```
23
+
24
+ ## Available Modules
25
+
26
+ ### IR
27
+ - `all` - Information Retrieval algorithms
28
+
29
+ ### DM
30
+ - `all` - Data Mining algorithms
31
+ - `apriori` - Apriori algorithm implementation
32
+ - `hash` - Hash-based mining
33
+ - `preprocessing` - Data preprocessing utilities
@@ -0,0 +1,16 @@
1
+ bm_preprocessing/__init__.py,sha256=Df9ccHGwwTDamkW2y_t9Vwq6r975WqfakEHhDTr0nko,143
2
+ bm_preprocessing/DM/__init__.py,sha256=ufUf_cL0MIk_xLsqJvfLybVo-1BwsXtksElCOZLUAe0,225
3
+ bm_preprocessing/DM/all.py,sha256=hL8SptuvZ7HVf4G7e8UvuY-8nDYTF4laJvxvGed720o,855
4
+ bm_preprocessing/DM/apriori.py,sha256=qhtvNW9BY154YZJtAiz-1iOKJcR9AGD3mTXnTlcIeys,871
5
+ bm_preprocessing/DM/hash.py,sha256=FvoqRwVUOy69DnnlUpv5SscXr0L-yz7yl_StgGM8QWQ,859
6
+ bm_preprocessing/DM/preprocessing.py,sha256=bYV2rm5lyjm586pii1esie1K69zzqLCQXrZGDADvpVA,895
7
+ bm_preprocessing/DM/sources/all.py,sha256=YfViWG8ZJXXkjUP8HtMJGH0vK-3agNv2c_7K9R-PiIU,4571
8
+ bm_preprocessing/DM/sources/apriori.py,sha256=8oPKLzKO9vbr1JYdtjPVmDgrgn7S7bFSW-xm1GS-2u0,2984
9
+ bm_preprocessing/DM/sources/hash.py,sha256=PWZUZ1pNUNXPb_CYtN_gXvEichOAvXs4lzb8G715PSY,4305
10
+ bm_preprocessing/DM/sources/preprocessing.py,sha256=qxtvKO14xcQr8V2YI5D436PFWYGxc4D8Fv6vnaYzEww,1156
11
+ bm_preprocessing/IR/__init__.py,sha256=L4iQk_tDloI4qHD9Ym8XDD-_tXOOHiEz2rRAKpUSk4c,103
12
+ bm_preprocessing/IR/all.py,sha256=z_vATjxIn53wBpnzryLB19RvjDj6ZW-U-chhaSB01ac,855
13
+ bm_preprocessing/IR/sources/all.py,sha256=ejTScGnayqZ8Vk6_Nz8NauHtsbDo3lfobH7VPWRv8Ow,5484
14
+ bm_preprocessing-0.1.0.dist-info/METADATA,sha256=DX37CmfxynI-npXPrpdi-nERO0-VKQ6yo3VeQZj6Lw4,798
15
+ bm_preprocessing-0.1.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
16
+ bm_preprocessing-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.28.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any