PyPI - bm-preprocessing - Versions diffs - 1.0.0__tar.gz → 1.2.0__tar.gz - Mend

bm-preprocessing 1.0.0.tar.gz → 1.2.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (48) hide show

{bm_preprocessing-1.0.0 → bm_preprocessing-1.2.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: bm-preprocessing
-Version: 1.0.0
+Version: 1.2.0
 Summary: A package to preprocess text data
 Requires-Python: >=3.8
 Requires-Dist: build>=1.2.2.post1

{bm_preprocessing-1.0.0 → bm_preprocessing-1.2.0}/USAGE.md RENAMED Viewed

@@ -14,7 +14,7 @@ Create a file `example.py`:
 ```python
 # Import modules
-from bm_preprocessing.IR import all
+from bm_preprocessing.IR import all, all_vis, eval_metrics, ndd, rel
 from bm_preprocessing.DM import adaboost, apriori, bagging, hash, hunts, hunts_test, id3, id3_test, lib_doc, metrics, preprocessing
 from bm_preprocessing.DM import all, all_vis
@@ -108,6 +108,10 @@ Then in the Python REPL:
 ```bash
 python -c "from bm_preprocessing.IR import all; print(all)"
+python -c "from bm_preprocessing.IR import all_vis; print(all_vis)"
+python -c "from bm_preprocessing.IR import eval_metrics; print(eval_metrics)"
+python -c "from bm_preprocessing.IR import ndd; print(ndd)"
+python -c "from bm_preprocessing.IR import rel; print(rel)"
 python -c "from bm_preprocessing.DM import all; print(all)"
 python -c "from bm_preprocessing.DM import all_vis; print(all_vis)"
 python -c "from bm_preprocessing.DM import apriori; print(apriori)"
@@ -129,7 +133,11 @@ python -c "from bm_preprocessing.DM import preprocessing; print(preprocessing)"
 | Import | Description |
 |--------|-------------|
-| `from bm_preprocessing.IR import all` | Information Retrieval (BM25, TF-IDF, Boolean) |
+| `from bm_preprocessing.IR import all` | Information Retrieval (MinHash, LSH, Rocchio, Jaccard, VS) |
+| `from bm_preprocessing.IR import all_vis` | IR algorithms with Matplotlib visualizations |
+| `from bm_preprocessing.IR import eval_metrics` | Jaccard, PRF, Compression Ratio, MAP metrics & plots |
+| `from bm_preprocessing.IR import ndd` | Near Duplicate Documents (MinHash & LSH) |
+| `from bm_preprocessing.IR import rel` | Relevance feedback & query expansion (Rocchio & LCA) |
 | `from bm_preprocessing.DM import all` | All DM algorithms (Hunt's, ID3, Bagging, AdaBoost, metrics) |
 | `from bm_preprocessing.DM import all_vis` | All DM algorithms + graphviz & full visualization |
 | `from bm_preprocessing.DM import apriori` | Apriori algorithm |

{bm_preprocessing-1.0.0 → bm_preprocessing-1.2.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 [project]
 name = "bm-preprocessing"
-version = "1.0.0"
+version = "1.2.0"
 description = "A package to preprocess text data"
 readme = "README.md"
 requires-python = ">=3.8"

bm_preprocessing-1.2.0/src/bm_preprocessing/IR/__init__.py ADDED Viewed

@@ -0,0 +1,9 @@
+"""IR subpackage - Information Retrieval source code."""
+from .all import all
+from .all_vis import all_vis
+from .eval_metrics import eval_metrics
+from .ndd import ndd
+from .rel import rel
+__all__ = ["all", "all_vis", "eval_metrics", "ndd", "rel"]

bm_preprocessing-1.2.0/src/bm_preprocessing/IR/all_vis.py ADDED Viewed

@@ -0,0 +1,30 @@
+"""Source code loader for IR/all_vis.py"""
+from pathlib import Path
+class SourceCodeModule:
+    """A class that displays source code when printed."""
+    def __init__(self, name: str, source_path: Path):
+        self.name = name
+        self._source_path = source_path
+        self._source_code = None
+    @property
+    def source_code(self) -> str:
+        """Lazily load source code."""
+        if self._source_code is None:
+            self._source_code = self._source_path.read_text(encoding="utf-8")
+        return self._source_code
+    def __repr__(self) -> str:
+        return self.source_code
+    def __str__(self) -> str:
+        return self.source_code
+# Get the path to the source file
+_source_file = Path(__file__).parent / "sources" / "all_vis.py"
+all_vis = SourceCodeModule("IR.all_vis", _source_file)

bm_preprocessing-1.2.0/src/bm_preprocessing/IR/eval_metrics.py ADDED Viewed

@@ -0,0 +1,26 @@
+"""Source code loader for IR/eval_metrics.py"""
+from pathlib import Path
+class SourceCodeModule:
+    """A class that displays source code when printed."""
+    def __init__(self, name: str, source_path: Path):
+        self.name = name
+        self._source_path = source_path
+        self._source_code = None
+    @property
+    def source_code(self) -> str:
+        """Lazily load source code."""
+        if self._source_code is None:
+            self._source_code = self._source_path.read_text(encoding="utf-8")
+        return self._source_code
+    def __repr__(self) -> str:
+        return self.source_code
+    def __str__(self) -> str:
+        return self.source_code
+# Get the path to the source file
+_source_file = Path(__file__).parent / "sources" / "eval_metrics.py"
+eval_metrics = SourceCodeModule("IR.eval_metrics", _source_file)

bm_preprocessing-1.2.0/src/bm_preprocessing/IR/ndd.py ADDED Viewed

@@ -0,0 +1,26 @@
+"""Source code loader for IR/ndd.py"""
+from pathlib import Path
+class SourceCodeModule:
+    """A class that displays source code when printed."""
+    def __init__(self, name: str, source_path: Path):
+        self.name = name
+        self._source_path = source_path
+        self._source_code = None
+    @property
+    def source_code(self) -> str:
+        """Lazily load source code."""
+        if self._source_code is None:
+            self._source_code = self._source_path.read_text(encoding="utf-8")
+        return self._source_code
+    def __repr__(self) -> str:
+        return self.source_code
+    def __str__(self) -> str:
+        return self.source_code
+# Get the path to the source file
+_source_file = Path(__file__).parent / "sources" / "ndd.py"
+ndd = SourceCodeModule("IR.ndd", _source_file)

bm_preprocessing-1.2.0/src/bm_preprocessing/IR/rel.py ADDED Viewed

@@ -0,0 +1,26 @@
+"""Source code loader for IR/rel.py"""
+from pathlib import Path
+class SourceCodeModule:
+    """A class that displays source code when printed."""
+    def __init__(self, name: str, source_path: Path):
+        self.name = name
+        self._source_path = source_path
+        self._source_code = None
+    @property
+    def source_code(self) -> str:
+        """Lazily load source code."""
+        if self._source_code is None:
+            self._source_code = self._source_path.read_text(encoding="utf-8")
+        return self._source_code
+    def __repr__(self) -> str:
+        return self.source_code
+    def __str__(self) -> str:
+        return self.source_code
+# Get the path to the source file
+_source_file = Path(__file__).parent / "sources" / "rel.py"
+rel = SourceCodeModule("IR.rel", _source_file)

bm_preprocessing-1.2.0/src/bm_preprocessing/IR/sources/all_vis.py ADDED Viewed

@@ -0,0 +1,294 @@
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+import random, hashlib, nltk
+from itertools import combinations
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+from nltk.stem import PorterStemmer
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+nltk.download('punkt_tab')
+nltk.download('stopwords')
+random.seed(42)
+np.random.seed(42)
+docs = [
+    "information retrieval is the process of obtaining relevant documents",
+    "search engines use ranking algorithms for information retrieval",
+    "information retrieval systems index and rank documents",
+    "retrieval models help search engines find relevant documents",
+    "inverted index is widely used in information retrieval",
+    "query expansion improves retrieval effectiveness",
+    "query expansion adds related terms to the query",
+    "expansion techniques improve search results",
+    "duplicate documents appear frequently in search engines",
+    "near duplicate detection improves indexing"
+]
+stop_words = set(stopwords.words('english'))
+stemmer = PorterStemmer()
+def preprocess(text):
+    return [stemmer.stem(w) for w in word_tokenize(text.lower()) if w.isalnum() and w not in stop_words]
+def sim_df(mat, title):
+    df = pd.DataFrame(np.round(np.asarray(mat), 3),
+                      index=[f"Doc{i}" for i in range(len(docs))],
+                      columns=[f"Doc{i}" for i in range(len(docs))])
+    print(f"\n{title}")
+    print(df)
+    return df
+def prf(tp, fp, fn):
+    p = tp / (tp + fp) if tp + fp else 0
+    r = tp / (tp + fn) if tp + fn else 0
+    f = 2 * p * r / (p + r) if p + r else 0
+    return round(p, 3), round(r, 3), round(f, 3)
+processed_docs = [" ".join(preprocess(doc)) for doc in docs]
+shingles = [set(preprocess(doc)) for doc in docs]
+# MinHash
+num_hash, max_shingle = 50, 1000
+hash_funcs = [(random.randint(1, max_shingle), random.randint(0, max_shingle)) for _ in range(num_hash)]
+vocab = list(set(word for doc in shingles for word in doc))
+shingle_index = {w: i for i, w in enumerate(vocab)}
+def h(x, a, b): return (a * x + b) % max_shingle
+signature = np.full((num_hash, len(docs)), np.inf)
+for d, doc in enumerate(shingles):
+    for word in doc:
+        idx = shingle_index[word]
+        for i, (a, b) in enumerate(hash_funcs):
+            signature[i, d] = min(signature[i, d], h(idx, a, b))
+signature = signature.astype(int)
+minhash_sim = np.matrix([[np.mean(signature[:, i] == signature[:, j]) for j in range(len(docs))] for i in range(len(docs))])
+sim_df(minhash_sim, "MinHash Similarity Table")
+# LSH
+def get_lsh_candidates(sig, bands):
+    rows = sig.shape[0] // bands
+    buckets, candidates = {}, set()
+    for b in range(bands):
+        for d in range(sig.shape[1]):
+            band = tuple(sig[b * rows:(b + 1) * rows, d])
+            key = hashlib.md5(str(band).encode()).hexdigest()
+            buckets.setdefault((b, key), []).append(d)
+    for group in buckets.values():
+        if len(group) > 1:
+            for pair in combinations(group, 2):
+                candidates.add(tuple(sorted(pair)))
+    return candidates
+candidates = get_lsh_candidates(signature, 10)
+lsh_df = pd.DataFrame([(f"Doc{i}", f"Doc{j}") for i, j in sorted(candidates)], columns=["Document 1", "Document 2"])
+print("\nLSH Candidate Pairs Table")
+print(lsh_df)
+# Rocchio
+vectorizer = TfidfVectorizer()
+tfidf = vectorizer.fit_transform(processed_docs)
+query = "information retrieval"
+q_vec = vectorizer.transform([" ".join(preprocess(query))])
+scores = cosine_similarity(q_vec, tfidf)[0]
+top_docs = scores.argsort()[::-1][:3]
+relevant = tfidf[top_docs]
+non_relevant = tfidf[[i for i in range(len(docs)) if i not in top_docs]]
+alpha, beta, gamma = 1, 0.75, 0.15
+new_query = alpha * q_vec + beta * np.asarray(relevant.mean(axis=0)) - gamma * np.asarray(non_relevant.mean(axis=0))
+new_scores = cosine_similarity(np.asarray(new_query), tfidf)[0]
+rocchio_df = pd.DataFrame({
+    "Document": [f"Doc{i}" for i in range(len(docs))],
+    "Original Score": np.round(scores, 3),
+    "Updated Score": np.round(new_scores, 3)
+})
+print("\nRocchio Score Table")
+print(rocchio_df)
+# LCA
+top_k = scores.argsort()[::-1][:5]
+term_freq = {}
+for doc in [processed_docs[i] for i in top_k]:
+    for word in doc.split():
+        term_freq[word] = term_freq.get(word, 0) + 1
+expanded_terms = sorted(term_freq, key=term_freq.get, reverse=True)[:5]
+expanded_query = " ".join(preprocess(query)) + " " + " ".join(expanded_terms)
+expanded_scores = cosine_similarity(vectorizer.transform([expanded_query]), tfidf)[0]
+print("\nLCA Expanded Query")
+print(expanded_query)
+lca_df = pd.DataFrame({
+    "Document": [f"Doc{i}" for i in range(len(docs))],
+    "LCA Score": np.round(expanded_scores, 3)
+})
+print("\nLCA Score Table")
+print(lca_df)
+# Jaccard
+jaccard = lambda a, b: len(a & b) / len(a | b)
+jaccard_matrix = np.matrix([[jaccard(shingles[i], shingles[j]) for j in range(len(docs))] for i in range(len(docs))])
+sim_df(jaccard_matrix, "Jaccard Similarity Table")
+# Precision Recall Fscore with different bucket sizes
+threshold = 0.30
+ground_truth = {(i, j) for i in range(len(docs)) for j in range(i + 1, len(docs)) if float(jaccard_matrix[i, j]) >= threshold}
+bucket_rows = []
+for b in [5, 10, 25]:
+    if num_hash % b == 0:
+        cand = get_lsh_candidates(signature, b)
+        tp = len(cand & ground_truth)
+        fp = len(cand - ground_truth)
+        fn = len(ground_truth - cand)
+        p, r, f = prf(tp, fp, fn)
+        bucket_rows.append([b, len(cand), tp, fp, fn, p, r, f])
+bucket_df = pd.DataFrame(bucket_rows, columns=["Bucket Size", "Candidate Pairs", "TP", "FP", "FN", "Precision", "Recall", "Fscore"])
+print("\nPrecision Recall Fscore with Different Bucket Size")
+print(bucket_df)
+# Signature Size Compression Ratio Accuracy
+original_size = len(vocab) * len(docs)
+comp_rows = []
+for rows_used in [10, 20, 30, 40, 50]:
+    sub_sig = signature[:rows_used, :]
+    correct, total = 0, 0
+    for i in range(len(docs)):
+        for j in range(i + 1, len(docs)):
+            approx = np.mean(sub_sig[:, i] == sub_sig[:, j]) >= threshold
+            actual = float(jaccard_matrix[i, j]) >= threshold
+            correct += int(approx == actual)
+            total += 1
+    comp_rows.append([
+        rows_used,
+        sub_sig.size,
+        round(sub_sig.size / original_size, 3),
+        round(correct / total, 3)
+    ])
+compression_df = pd.DataFrame(comp_rows, columns=["Signature Rows Used", "Signature Size", "Compression Ratio", "Accuracy"])
+print("\nSignature Size Compression Ratio Accuracy Table")
+print(compression_df)
+# MAP change for different term reweighting
+training_queries = ["information retrieval", "query expansion", "search engines", "duplicate detection"]
+query_relevance = {
+    "information retrieval": {0, 1, 2, 3, 4},
+    "query expansion": {5, 6, 7},
+    "search engines": {1, 3, 8},
+    "duplicate detection": {8, 9}
+}
+settings = [(1.0, 0.75, 0.15), (1.0, 0.50, 0.25), (1.0, 1.00, 0.50)]
+def avg_precision(score_vector, relevant_ids):
+    ranked = np.argsort(score_vector)[::-1]
+    hits, s = 0, 0
+    for rank, d in enumerate(ranked, 1):
+        if d in relevant_ids:
+            hits += 1
+            s += hits / rank
+    return s / len(relevant_ids)
+map_rows = []
+for a, b, g in settings:
+    before_list, after_list = [], []
+    for tq in training_queries:
+        tq_vec = vectorizer.transform([" ".join(preprocess(tq))])
+        base = cosine_similarity(tq_vec, tfidf)[0]
+        top = base.argsort()[::-1][:3]
+        rel = tfidf[top]
+        nonrel = tfidf[[i for i in range(len(docs)) if i not in top]]
+        rq = a * tq_vec + b * np.asarray(rel.mean(axis=0)) - g * np.asarray(nonrel.mean(axis=0))
+        updated = cosine_similarity(np.asarray(rq), tfidf)[0]
+        before_list.append(avg_precision(base, query_relevance[tq]))
+        after_list.append(avg_precision(updated, query_relevance[tq]))
+    mb, ma = np.mean(before_list), np.mean(after_list)
+    change = ((ma - mb) / mb) * 100 if mb else 0
+    map_rows.append([a, b, g, round(mb, 3), round(ma, 3), round(change, 3)])
+map_df = pd.DataFrame(map_rows, columns=["Alpha", "Beta", "Gamma", "MAP Before", "MAP After", "Percent Change in MAP"])
+print("\nPercent Change in Mean Average Precision on Training Queries for Different Term Reweighting")
+print(map_df)
+# Graphs
+# Additional Graphs
+plt.figure()
+plt.imshow(np.asarray(minhash_sim), cmap='viridis')
+plt.colorbar()
+plt.title("MinHash Similarity Heatmap")
+plt.xlabel("Documents")
+plt.ylabel("Documents")
+plt.show()
+plt.figure()
+plt.imshow(np.asarray(jaccard_matrix), cmap='plasma')
+plt.colorbar()
+plt.title("Jaccard Similarity Heatmap")
+plt.xlabel("Documents")
+plt.ylabel("Documents")
+plt.show()
+plt.figure()
+plt.bar(["Before Rocchio", "After Rocchio"], [np.mean(scores), np.mean(new_scores)])
+plt.title("MAP Change After Rocchio")
+plt.ylabel("MAP")
+plt.show()
+precision_val = bucket_df["Precision"].mean()
+recall_val = bucket_df["Recall"].mean()
+fscore_val = bucket_df["Fscore"].mean()
+plt.figure()
+plt.bar(["Precision", "Recall", "Fscore"], [precision_val, recall_val, fscore_val])
+plt.title("Average Evaluation Metrics")
+plt.ylabel("Value")
+plt.show()
+plt.figure()
+plt.plot(bucket_df["Bucket Size"], bucket_df["Precision"], marker='o', label="Precision")
+plt.plot(bucket_df["Bucket Size"], bucket_df["Recall"], marker='s', label="Recall")
+plt.plot(bucket_df["Bucket Size"], bucket_df["Fscore"], marker='^', label="Fscore")
+plt.title("PRF vs Bucket Size")
+plt.xlabel("Bucket Size")
+plt.ylabel("Value")
+plt.legend()
+plt.show()
+plt.figure()
+plt.plot(compression_df["Signature Rows Used"], compression_df["Compression Ratio"], marker='o', label="Compression Ratio")
+plt.plot(compression_df["Signature Rows Used"], compression_df["Accuracy"], marker='s', label="Accuracy")
+plt.title("Compression Ratio and Accuracy")
+plt.xlabel("Signature Rows Used")
+plt.ylabel("Value")
+plt.legend()
+plt.show()
+labels = [f"a={r['Alpha']}, b={r['Beta']}, g={r['Gamma']}" for _, r in map_df.iterrows()]
+plt.figure()
+plt.plot(labels, map_df["MAP Before"], marker='o', label="MAP Before")
+plt.plot(labels, map_df["MAP After"], marker='s', label="MAP After")
+plt.title("MAP for Different Reweighting")
+plt.xlabel("Term Reweighting")
+plt.ylabel("MAP")
+plt.xticks(rotation=20)
+plt.legend()
+plt.show()
+plt.figure()
+plt.bar(labels, map_df["Percent Change in MAP"])
+plt.title("Percent Change in MAP")
+plt.xlabel("Term Reweighting")
+plt.ylabel("Percent Change")
+plt.xticks(rotation=20)
+plt.show()

bm_preprocessing-1.2.0/src/bm_preprocessing/IR/sources/eval_metrics.py ADDED Viewed

@@ -0,0 +1,224 @@
+import numpy as np
+import pandas as pd
+import random
+import hashlib
+import nltk
+from itertools import combinations
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+from nltk.stem import PorterStemmer
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+import matplotlib.pyplot as plt
+nltk.download('punkt_tab', quiet=True)
+nltk.download('stopwords', quiet=True)
+random.seed(42)
+np.random.seed(42)
+docs = [
+    "information retrieval is the process of obtaining relevant documents",
+    "search engines use ranking algorithms for information retrieval",
+    "information retrieval systems index and rank documents",
+    "retrieval models help search engines find relevant documents",
+    "inverted index is widely used in information retrieval",
+    "query expansion improves retrieval effectiveness",
+    "query expansion adds related terms to the query",
+    "expansion techniques improve search results",
+    "duplicate documents appear frequently in search engines",
+    "near duplicate detection improves indexing"
+]
+stop_words = set(stopwords.words('english'))
+stemmer = PorterStemmer()
+def preprocess(text):
+    return [stemmer.stem(w) for w in word_tokenize(text.lower()) if w.isalnum() and w not in stop_words]
+def sim_df(mat, title):
+    df = pd.DataFrame(np.round(np.asarray(mat), 3),
+                      index=[f"Doc{i}" for i in range(len(docs))],
+                      columns=[f"Doc{i}" for i in range(len(docs))])
+    print(f"\n{title}")
+    # print(df)
+    return df
+def prf(tp, fp, fn):
+    p = tp / (tp + fp) if tp + fp else 0
+    r = tp / (tp + fn) if tp + fn else 0
+    f = 2 * p * r / (p + r) if p + r else 0
+    return round(p, 3), round(r, 3), round(f, 3)
+processed_docs = [" ".join(preprocess(doc)) for doc in docs]
+shingles = [set(preprocess(doc)) for doc in docs]
+num_hash, max_shingle = 50, 1000
+hash_funcs = [(random.randint(1, max_shingle), random.randint(0, max_shingle)) for _ in range(num_hash)]
+vocab = list(set(word for doc in shingles for word in doc))
+shingle_index = {w: i for i, w in enumerate(vocab)}
+def h(x, a, b): return (a * x + b) % max_shingle
+signature = np.full((num_hash, len(docs)), np.inf)
+for d, doc in enumerate(shingles):
+    for word in doc:
+        idx = shingle_index[word]
+        for i, (a, b) in enumerate(hash_funcs):
+            signature[i, d] = min(signature[i, d], h(idx, a, b))
+signature = signature.astype(int)
+# =====================================================================
+# 1. JACCARD SIMILARITY
+# =====================================================================
+jaccard = lambda a, b: len(a & b) / len(a | b)
+jaccard_matrix = np.matrix([[jaccard(shingles[i], shingles[j]) for j in range(len(docs))] for i in range(len(docs))])
+sim_df(jaccard_matrix, "JACCARD SIMILARITY TABLE")
+df_jaccard = pd.DataFrame(np.round(np.asarray(jaccard_matrix), 3))
+print("=" * 60)
+print(df_jaccard)
+# =====================================================================
+# 2. PRECISION, RECALL, FSCORE WITH DIFFERENT BUCKET SIZES (LSH)
+# =====================================================================
+def get_lsh_candidates(sig, bands):
+    rows = sig.shape[0] // bands
+    buckets, candidates = {}, set()
+    for b in range(bands):
+        for d in range(sig.shape[1]):
+            band = tuple(sig[b * rows:(b + 1) * rows, d])
+            key = hashlib.md5(str(band).encode()).hexdigest()
+            buckets.setdefault((b, key), []).append(d)
+    for group in buckets.values():
+        if len(group) > 1:
+            for pair in combinations(group, 2):
+                candidates.add(tuple(sorted(pair)))
+    return candidates
+threshold = 0.30
+ground_truth = {(i, j) for i in range(len(docs)) for j in range(i + 1, len(docs)) if float(jaccard_matrix[i, j]) >= threshold}
+bucket_rows = []
+for b in [5, 10, 25]:
+    if num_hash % b == 0:
+        cand = get_lsh_candidates(signature, b)
+        tp = len(cand & ground_truth)
+        fp = len(cand - ground_truth)
+        fn = len(ground_truth - cand)
+        p, r, f = prf(tp, fp, fn)
+        bucket_rows.append([b, len(cand), tp, fp, fn, p, r, f])
+bucket_df = pd.DataFrame(bucket_rows, columns=["Bucket Size", "Candidate Pairs", "TP", "FP", "FN", "Precision", "Recall", "Fscore"])
+print("\nPRECISION, RECALL, FSCORE WITH DIFFERENT BUCKET SIZES")
+print("=" * 60)
+print(bucket_df)
+# =====================================================================
+# 3. SIGNATURE SIZE / COMPRESSION RATIO AND ACCURACY
+# =====================================================================
+original_size = len(vocab) * len(docs)
+comp_rows = []
+for rows_used in [10, 20, 30, 40, 50]:
+    sub_sig = signature[:rows_used, :]
+    correct, total = 0, 0
+    for i in range(len(docs)):
+        for j in range(i + 1, len(docs)):
+            approx = np.mean(sub_sig[:, i] == sub_sig[:, j]) >= threshold
+            actual = float(jaccard_matrix[i, j]) >= threshold
+            correct += int(approx == actual)
+            total += 1
+    comp_rows.append([
+        rows_used,
+        sub_sig.size,
+        round(sub_sig.size / original_size, 3),
+        round(correct / total, 3)
+    ])
+compression_df = pd.DataFrame(comp_rows, columns=["Signature Rows Used", "Signature Size", "Compression Ratio", "Accuracy"])
+print("\nSIGNATURE SIZE, COMPRESSION RATIO & ACCURACY TABLE")
+print("=" * 60)
+print(compression_df)
+# =====================================================================
+# 4. PERCENT CHANGE IN MEAN AVERAGE PRECISION ON TRAINING QUERIES
+# =====================================================================
+vectorizer = TfidfVectorizer()
+tfidf = vectorizer.fit_transform(processed_docs)
+training_queries = ["information retrieval", "query expansion", "search engines", "duplicate detection"]
+# Ground truth relevant documents mapping for training queries
+query_relevance = {
+    "information retrieval": {0, 1, 2, 3, 4},
+    "query expansion": {5, 6, 7},
+    "search engines": {1, 3, 8},
+    "duplicate detection": {8, 9}
+}
+# (Alpha, Beta, Gamma) settings for Term Reweighting
+settings = [(1.0, 0.75, 0.15), (1.0, 0.50, 0.25), (1.0, 1.00, 0.50)]
+def avg_precision(score_vector, relevant_ids):
+    ranked = np.argsort(score_vector)[::-1]
+    hits, s = 0, 0
+    for rank, d in enumerate(ranked, 1):
+        if d in relevant_ids:
+            hits += 1
+            s += hits / rank
+    return s / len(relevant_ids) if len(relevant_ids) > 0 else 0
+map_rows = []
+for a, b, g in settings:
+    before_list, after_list = [], []
+    for tq in training_queries:
+        tq_vec = vectorizer.transform([" ".join(preprocess(tq))])
+        base = cosine_similarity(tq_vec, tfidf)[0]
+        # Pseudo-relevance for Rocchio on this query
+        top = base.argsort()[::-1][:3]
+        rel = tfidf[top]
+        nonrel = tfidf[[i for i in range(len(docs)) if i not in top]]
+        rq = a * tq_vec + b * np.asarray(rel.mean(axis=0)) - g * np.asarray(nonrel.mean(axis=0))
+        updated = cosine_similarity(np.asarray(rq), tfidf)[0]
+        # Calculate Average Precision
+        before_list.append(avg_precision(base, query_relevance[tq]))
+        after_list.append(avg_precision(updated, query_relevance[tq]))
+    mb, ma = np.mean(before_list), np.mean(after_list)
+    change = ((ma - mb) / mb) * 100 if mb else 0
+    map_rows.append([a, b, g, round(mb, 3), round(ma, 3), round(change, 3)])
+map_df = pd.DataFrame(map_rows, columns=["Alpha", "Beta", "Gamma", "MAP Before", "MAP After", "Percent Change in MAP"])
+print("\nPERCENT CHANGE IN MEAN AVERAGE PRECISION ON TRAINING QUERIES")
+print("=" * 60)
+print(map_df)
+# =====================================================================
+# 5. VISUALIZATIONS
+# =====================================================================
+plt.figure(figsize=(12, 5))
+plt.subplot(1, 2, 1)
+plt.plot(bucket_df["Bucket Size"], bucket_df["Precision"], marker='o', label="Precision")
+plt.plot(bucket_df["Bucket Size"], bucket_df["Recall"], marker='s', label="Recall")
+plt.plot(bucket_df["Bucket Size"], bucket_df["Fscore"], marker='^', label="Fscore")
+plt.title("PRF vs Bucket Size")
+plt.xlabel("Bucket Size")
+plt.ylabel("Value")
+plt.legend()
+plt.subplot(1, 2, 2)
+plt.plot(compression_df["Signature Rows Used"], compression_df["Compression Ratio"], marker='o', label="Compression")
+plt.plot(compression_df["Signature Rows Used"], compression_df["Accuracy"], marker='s', label="Accuracy")
+plt.title("Compression Ratio and Accuracy")
+plt.xlabel("Signature Rows Used")
+plt.ylabel("Value")
+plt.legend()
+plt.tight_layout()
+plt.savefig("eval_metrics_plots.png", dpi=150)
+print("\nMetrics plots saved to 'eval_metrics_plots.png'.")
+plt.show()

bm_preprocessing-1.2.0/src/bm_preprocessing/IR/sources/ndd.py ADDED Viewed

@@ -0,0 +1,105 @@
+import numpy as np
+import pandas as pd
+import random
+import hashlib
+import nltk
+from itertools import combinations
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+from nltk.stem import PorterStemmer
+nltk.download('punkt_tab', quiet=True)
+nltk.download('stopwords', quiet=True)
+random.seed(42)
+np.random.seed(42)
+# =====================================================================
+# READING FROM CORPUS (Example Code)
+# =====================================================================
+"""
+To read documents from a local corpus directory instead of the hardcoded list below,
+you can use the following snippet:
+import os
+corpus_dir = "path/to/your/corpus/folder"
+docs = []
+for filename in os.listdir(corpus_dir):
+    if filename.endswith(".txt"):
+        with open(os.path.join(corpus_dir, filename), "r", encoding="utf-8") as f:
+            docs.append(f.read())
+"""
+docs = [
+    "information retrieval is the process of obtaining relevant documents",
+    "search engines use ranking algorithms for information retrieval",
+    "information retrieval systems index and rank documents",
+    "retrieval models help search engines find relevant documents",
+    "inverted index is widely used in information retrieval",
+    "query expansion improves retrieval effectiveness",
+    "query expansion adds related terms to the query",
+    "expansion techniques improve search results",
+    "duplicate documents appear frequently in search engines",
+    "near duplicate detection improves indexing"
+]
+stop_words = set(stopwords.words('english'))
+stemmer = PorterStemmer()
+def preprocess(text):
+    return [stemmer.stem(w) for w in word_tokenize(text.lower()) if w.isalnum() and w not in stop_words]
+def sim_df(mat, title):
+    df = pd.DataFrame(np.round(np.asarray(mat), 3),
+                      index=[f"Doc{i}" for i in range(len(docs))],
+                      columns=[f"Doc{i}" for i in range(len(docs))])
+    print(f"\n{title}")
+    print(df)
+    return df
+processed_docs = [" ".join(preprocess(doc)) for doc in docs]
+shingles = [set(preprocess(doc)) for doc in docs]
+# =====================================================================
+# 1. MINHASH
+# =====================================================================
+num_hash, max_shingle = 50, 1000
+hash_funcs = [(random.randint(1, max_shingle), random.randint(0, max_shingle)) for _ in range(num_hash)]
+vocab = list(set(word for doc in shingles for word in doc))
+shingle_index = {w: i for i, w in enumerate(vocab)}
+def h(x, a, b): return (a * x + b) % max_shingle
+signature = np.full((num_hash, len(docs)), np.inf)
+for d, doc in enumerate(shingles):
+    for word in doc:
+        idx = shingle_index[word]
+        for i, (a, b) in enumerate(hash_funcs):
+            signature[i, d] = min(signature[i, d], h(idx, a, b))
+signature = signature.astype(int)
+minhash_sim = np.matrix([[np.mean(signature[:, i] == signature[:, j]) for j in range(len(docs))] for i in range(len(docs))])
+sim_df(minhash_sim, "MinHash Similarity Table")
+# =====================================================================
+# 2. LOCALITY SENSITIVE HASHING (LSH)
+# =====================================================================
+def get_lsh_candidates(sig, bands):
+    rows = sig.shape[0] // bands
+    buckets, candidates = {}, set()
+    for b in range(bands):
+        for d in range(sig.shape[1]):
+            band = tuple(sig[b * rows:(b + 1) * rows, d])
+            key = hashlib.md5(str(band).encode()).hexdigest()
+            buckets.setdefault((b, key), []).append(d)
+    for group in buckets.values():
+        if len(group) > 1:
+            for pair in combinations(group, 2):
+                candidates.add(tuple(sorted(pair)))
+    return candidates
+bands = 10
+candidates = get_lsh_candidates(signature, bands)
+lsh_df = pd.DataFrame([(f"Doc{i}", f"Doc{j}") for i, j in sorted(candidates)], columns=["Document 1", "Document 2"])
+print(f"\nLSH Candidate Pairs Table (Bands={bands})")
+print(lsh_df)

bm_preprocessing-1.2.0/src/bm_preprocessing/IR/sources/rel.py ADDED Viewed

@@ -0,0 +1,116 @@
+import numpy as np
+import pandas as pd
+import nltk
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+from nltk.stem import PorterStemmer
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+nltk.download('punkt_tab', quiet=True)
+nltk.download('stopwords', quiet=True)
+# =====================================================================
+# READING FROM CORPUS (Example Code)
+# =====================================================================
+"""
+To read documents from a local corpus directory:
+import os
+corpus_dir = "path/to/your/corpus/folder"
+docs = []
+for filename in os.listdir(corpus_dir):
+    if filename.endswith(".txt"):
+        with open(os.path.join(corpus_dir, filename), "r", encoding="utf-8") as f:
+            docs.append(f.read())
+"""
+docs = [
+    "information retrieval is the process of obtaining relevant documents",
+    "search engines use ranking algorithms for information retrieval",
+    "information retrieval systems index and rank documents",
+    "retrieval models help search engines find relevant documents",
+    "inverted index is widely used in information retrieval",
+    "query expansion improves retrieval effectiveness",
+    "query expansion adds related terms to the query",
+    "expansion techniques improve search results",
+    "duplicate documents appear frequently in search engines",
+    "near duplicate detection improves indexing"
+]
+stop_words = set(stopwords.words('english'))
+stemmer = PorterStemmer()
+def preprocess(text):
+    return [stemmer.stem(w) for w in word_tokenize(text.lower()) if w.isalnum() and w not in stop_words]
+processed_docs = [" ".join(preprocess(doc)) for doc in docs]
+vectorizer = TfidfVectorizer()
+tfidf = vectorizer.fit_transform(processed_docs)
+query = "information retrieval"
+processed_query = " ".join(preprocess(query))
+q_vec = vectorizer.transform([processed_query])
+# =====================================================================
+# 1. ROCCHIO'S FEEDBACK ALGORITHM
+# =====================================================================
+scores = cosine_similarity(q_vec, tfidf)[0]
+# Pseudo-relevance assumption: top 3 docs are relevant (in a real system, top 10-20)
+num_pseudo_relevant = 3
+top_docs = scores.argsort()[::-1][:num_pseudo_relevant]
+alpha, beta, gamma = 1.0, 0.75, 0.15
+relevant = tfidf[top_docs]
+non_relevant = tfidf[[i for i in range(len(docs)) if i not in top_docs]]
+new_query = alpha * q_vec + beta * np.asarray(relevant.mean(axis=0)) - gamma * np.asarray(non_relevant.mean(axis=0))
+new_scores = cosine_similarity(np.asarray(new_query), tfidf)[0]
+rocchio_df = pd.DataFrame({
+    "Document": [f"Doc{i}" for i in range(len(docs))],
+    "Original Score": np.round(scores, 3),
+    "Updated Score (Rocchio)": np.round(new_scores, 3)
+})
+print("\nROCCHIO ALGORITHM SCORE TABLE")
+print("=" * 60)
+print(rocchio_df)
+# =====================================================================
+# 2. LOCAL CONTEXT ANALYSIS (LCA)
+# =====================================================================
+# Measures the co-occurrence of a term with all query terms based on information
+# from pseudo-relevant documents (top 10-20 documents returned by initial search).
+# Since our corpus is small, we'll use top 5 pseudo-relevant documents.
+num_lca_pseudo_relevant = 5
+top_k_lca = scores.argsort()[::-1][:num_lca_pseudo_relevant]
+top_docs_lca = [processed_docs[i] for i in top_k_lca]
+term_freq = {}
+for doc in top_docs_lca:
+    for word in doc.split():
+        term_freq[word] = term_freq.get(word, 0) + 1
+# Extract top expanded terms from pseudo-relevant docs
+num_expansion_terms = 5
+expanded_terms = sorted(term_freq, key=term_freq.get, reverse=True)[:num_expansion_terms]
+expanded_query = processed_query + " " + " ".join(expanded_terms)
+expanded_vec = vectorizer.transform([expanded_query])
+expanded_scores = cosine_similarity(expanded_vec, tfidf)[0]
+print("\nLOCAL CONTEXT ANALYSIS (LCA)")
+print("=" * 60)
+print(f"Original Query: {processed_query}")
+print(f"Expanded Query: {expanded_query}")
+lca_df = pd.DataFrame({
+    "Document": [f"Doc{i}" for i in range(len(docs))],
+    "Original Score": np.round(scores, 3),
+    "LCA Expanded Score": np.round(expanded_scores, 3)
+})
+print("\nLCA SCORE TABLE")
+print(lca_df)